Skip to content

Commit

Permalink
[RW-5226] Setup sumologic egress alerting in terraform (#7)
Browse files Browse the repository at this point in the history
* sumologic hello world

* egress detection module

* try templating the query text separately

* progress

* try other provider stmnt

* fix vars

* nit

* merge master

* major refactor

* major refactor

* hex

* fix

* finalize

* test

* test

* tmp

* tmp

* tmp

* address comment

* address comment

* add more

* address comment

* address comment

Co-authored-by: Jay Carlton <[email protected]>
  • Loading branch information
yonghaoy and jaycarlton authored Jan 14, 2021
1 parent 686dfa1 commit cf5869e
Show file tree
Hide file tree
Showing 12 changed files with 387 additions and 1 deletion.
5 changes: 5 additions & 0 deletions modules/workbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,8 @@ it makes sense to add these sorts of things:
In other words, the primary focus of the module is the Reporting system, but it may be convenient to
add reporting-specific artifacts that might otherwise be concerned with Monitoring or other auxiliary
services.

### Egress alert
Generates SumoLogic content to analyze a set of Google VPC flow logs for high-egress events. The incoming set of flow
logs is bucketed into non-overlapping time slices and grouped by project / VM name to identify VMs with high amounts
of egress in a given period of time.
8 changes: 8 additions & 0 deletions modules/workbench/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,11 @@ module "reporting" {
# provider
project_id = var.project_id
}

# Egress detection: renders SumoLogic scheduled searches (one per threshold
# configuration in var.sumologic_egress_thresholds) that alert on high-egress
# VM activity via the configured webhook. See ./modules/egress_detection.
module "egress_detection" {
source = "./modules/egress_detection"
aou_env = var.aou_env
sumologic_egress_thresholds = var.sumologic_egress_thresholds
# Both hexadecimal IDs are created manually in the SumoLogic UI, outside of
# Terraform, and passed through from the caller.
sumologic_parent_folder_id_hexadecimal = var.sumologic_parent_folder_id_hexadecimal
sumologic_webhook_id_hexadecimal = var.sumologic_webhook_id_hexadecimal
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
{
"type": "SavedSearchWithScheduleSyncDefinition",
"name": "${aou_env} / ${tier_name}-tier / ${egress_threshold_mib}-Mib / ${egress_window_sec}-sec egress event",
"search": {
"queryText": ${query_text},
"defaultTimeRange": "-120m",
"byReceiptTime": true,
"viewName": "",
"viewStartTime": "1970-01-01T00:00:00Z",
"queryParameters": [
{
"name": "environment",
"label": "environment",
"description": "",
"dataType": "QUERY_FRAGMENT",
"value": "${aou_env}",
"autoComplete": {
"autoCompleteType": "SKIP_AUTOCOMPLETE",
"autoCompleteKey": null,
"autoCompleteValues": [],
"lookupFileName": null,
"lookupLabelColumn": null,
"lookupValueColumn": null
}
},
{
"name": "window_in_seconds",
"label": "window_in_seconds",
"description": "",
"dataType": "NUMBER",
"value": "${egress_window_sec}",
"autoComplete": {
"autoCompleteType": "SKIP_AUTOCOMPLETE",
"autoCompleteKey": null,
"autoCompleteValues": [],
"lookupFileName": null,
"lookupLabelColumn": null,
"lookupValueColumn": null
}
},
{
"name": "egress_threshold_in_mib",
"label": "egress_threshold_in_mib",
"description": "",
"dataType": "NUMBER",
"value": "${egress_threshold_mib}",
"autoComplete": {
"autoCompleteType": "SKIP_AUTOCOMPLETE",
"autoCompleteKey": null,
"autoCompleteValues": [],
"lookupFileName": null,
"lookupLabelColumn": null,
"lookupValueColumn": null
}
}
],
"parsingMode": "Manual"
},
"searchSchedule": {
"cronExpression": "${cron_expression}",
"displayableTimeRange": "${time_range}",
"parseableTimeRange": {
"type": "BeginBoundedTimeRange",
"from": {
"type": "RelativeTimeRangeBoundary",
"relativeTime": "${time_range}"
},
"to": null
},
"timeZone": "America/New_York",
"threshold": {
"thresholdType": "group",
"operator": "gt",
"count": 0
},
"notification": {
"taskType": "WebhookSearchNotificationSyncDefinition",
"webhookId": "${webhook_id}",
"payload": null,
"itemizeAlerts": false,
"maxItemizedAlerts": 1
},
"scheduleType": "${schedule_type}",
"muteErrorEmails": false,
"parameters": [
{
"name": "environment",
"value": "${aou_env}"
},
{
"name": "window_in_seconds",
"value": "${egress_window_sec}"
},
{
"name": "egress_threshold_in_mib",
"value": "${egress_threshold_mib}"
}
]
},
"description": ""
}
125 changes: 125 additions & 0 deletions modules/workbench/modules/egress_detection/assets/content/query.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// This query analyses a set of Google VPC flow logs for high-egress events.
// The incoming set of flow logs are bucketed into non-overlapping time slices
// and grouped by project / VM name to identify VMs with high amounts of egress
// in a given period of time.
//
// All SumoLogic saved searches are collected in the "AoU RW Egress Alerts" folder,
// which is created manually in the SumoLogic UI and then passed in here as a variable.
// There should be one saved search for each {environment, window width} tuple.
//
// Note: the "time range" parameter is set to be double the window_in_seconds duration,
// causing the search to query across multiple windows' worth of log messages. This may
// result in duplicate high-egress event notifications being sent, but it may also increase
// resiliency to delays or outages in SumoLogic's execution of saved searches.

// Each environment should be set up with the following searches. The test environment
// is shown as an example.
//
// - 3 minutes, 100Mib
// - Parameters: environment=test, window_in_seconds=180, egress_threshold_in_mib=100
// - Time range: -6m ("use receipt time" is checked)
// - Search schedule:
// - Run frequency: real time
// - Time range for scheduled search: -6m
//
// - 10 minutes, 150 Mib
// - Parameters: environment=test, window_in_seconds=600, egress_threshold_in_mib=150
// - Time range: -20m ("use receipt time" is checked)
// - Search schedule:
// - Run frequency: Every 15 minutes
// - Time range for scheduled search: -20m
//
// - 60 minutes, 200 Mib
// - Parameters: environment=test, window_in_seconds=3600, egress_threshold_in_mib=200
// - Time range: -120m ("use receipt time" is checked)
// - Search schedule:
// - Run frequency: Hourly
// - Time range for scheduled search: -120m

_sourceCategory = gcp/vpcflowlogs/aou/{{environment}} logName resource timestamp

// Parse the common JSON objects we'll use below
| json "message.data.jsonPayload" as payload
| json "message.data.resource" as resource

//
// Filter down to the set of logs we are analyzing.
//

// Filter on VPC flow logs, in case some other log type ends up here unexpectedly.
| parse regex "\"logName\":\"(?<log_name>[^\"]+)\""
| where log_name matches "projects/*/logs/compute.googleapis.com%2Fvpc_flows"

// Show only logs related to GCE subnet activity.
| json field=resource "type" as type
| where type = "gce_subnetwork"

// Show only logs in the egress direction ("SRC" means the reporting VM is the
// source of the connection).
| json field=payload "reporter" as reporter
| where reporter matches "SRC"

// Exclude traffic whose destination IP is within the static IP range for Private Google Access.
// This ensures that most Google API traffic is excluded from being considered for high-egress
// alerts. See ticket RW-4738 for more details and breadcrumbs.
| json field = payload "connection.dest_ip" as dest_ip
| where !(dest_ip in (
"199.36.153.4",
"199.36.153.5",
"199.36.153.6",
"199.36.153.7"))

// Extract some output fields from the log JSON.
| json field=resource "labels.project_id" as project_name
| json field=payload "bytes_sent", "start_time", "end_time" as bytes_sent, start_time, end_time
// "nodrop" means it's OK if vm_name does not exist
| json field=payload "src_instance.vm_name" as vm_name nodrop

// There are 3 types of expected VM names:
// 1. GCE VMs: all-of-us-<user_id>
// 2. Dataproc master nodes: all-of-us-<user_id>-m
// 3. Dataproc worker nodes: all-of-us-<user_id>-w-<index>
//
// All three of these should contribute towards a single user's egress. In the
// event that the VM naming convention changes, vm_prefix will be null and egress
// will instead be accumulated at the project level, which should only be noisier
// than the per-user grouping.
| parse regex field=vm_name "^(?<vm_prefix>all-of-us-\d+)(?:$|-[mw].*)" nodrop

// Split bytes_sent into per-VM-type columns so the alert payload shows which
// kind of VM produced the egress.
| if (vm_name matches /^all-of-us-\d+$/, bytes_sent, 0) as gce_bytes_sent
| if (vm_name matches /^all-of-us-\d+-m$/, bytes_sent, 0) as dataproc_master_bytes_sent
| if (vm_name matches /^all-of-us-\d+-w-\d+$/, bytes_sent, 0) as dataproc_worker_bytes_sent

// Timeslice creates a _timeslice variable, which is the message's timestamp
// rounded to the nearest timeslice window start. We'll use this to aggregate
// and calculate per-window egress.
| timeslice {{window_in_seconds}}s

// Breakdown by window, project, and VM prefix
| sum(bytes_sent) as bytes_sent,
sum(gce_bytes_sent) as gce_bytes_sent,
sum(dataproc_master_bytes_sent) as dataproc_master_bytes_sent,
sum(dataproc_worker_bytes_sent) as dataproc_worker_bytes_sent
by _timeslice, project_name, vm_prefix

// Collect all fields for display; byte counts are converted to MiB (1Mi = 2^20).
| bytes_sent / 1Mi as egress_mib
| gce_bytes_sent / 1Mi as gce_egress_mib
| dataproc_master_bytes_sent / 1Mi as dataproc_master_egress_mib
| dataproc_worker_bytes_sent / 1Mi as dataproc_worker_egress_mib
| toLong(_timeslice) as time_window_start
| "{{environment}}" as environment
| "{{window_in_seconds}}" as time_window_duration
| "{{egress_threshold_in_mib}}" as egress_mib_threshold
| fields
environment,
time_window_duration,
time_window_start,
egress_mib,
egress_mib_threshold,
project_name,
vm_prefix,
gce_egress_mib,
dataproc_master_egress_mib,
dataproc_worker_egress_mib

// Only export rows passing our desired threshold
| where egress_mib > {{egress_threshold_in_mib}}
45 changes: 45 additions & 0 deletions modules/workbench/modules/egress_detection/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
locals {
  # All template assets (content JSON template + query text) live here.
  content_dir           = pathexpand("${path.module}/assets/content")
  content_template_path = pathexpand("${local.content_dir}/egress_window_template.json")
  query_path            = pathexpand("${local.content_dir}/query.txt")

  # Human-readable SumoLogic query text, rendered once per egress rule.
  # Map keys are already strings in Terraform, so no tostring() is needed.
  # NOTE(review): the 0 defaults below render as "0" in the query text when a
  # key is missing from a threshold map -- confirm callers always supply them.
  queries_rendered = { for egress_rule, threshold in var.sumologic_egress_thresholds :
    egress_rule => templatefile(local.query_path, {
      aou_env              = var.aou_env
      tier_name            = lookup(threshold, "tier_name", 0)
      egress_threshold_mib = lookup(threshold, "egress_threshold_mib", 0)
      egress_window_sec    = lookup(threshold, "egress_window_sec", 0)
    })
  }

  # JSON-encode each query so it can be spliced into the content template as a
  # legal JSON string value (the template interpolates it without quotes).
  queries_encoded = { for egress_rule, query_text in local.queries_rendered :
    egress_rule => jsonencode(query_text)
  }

  # Build a map of rendered Content templates for use in the sumologic_content
  # resource and module outputs.
  egress_rule_to_config = { for egress_rule, threshold in var.sumologic_egress_thresholds :
    egress_rule => templatefile(local.content_template_path, {
      aou_env              = var.aou_env
      webhook_id           = var.sumologic_webhook_id_hexadecimal
      tier_name            = lookup(threshold, "tier_name", 0)
      egress_threshold_mib = lookup(threshold, "egress_threshold_mib", 0)
      egress_window_sec    = lookup(threshold, "egress_window_sec", 0)
      cron_expression      = lookup(threshold, "cron_expression", 0)
      schedule_type        = lookup(threshold, "schedule_type", 0)
      time_range           = lookup(threshold, "time_range", 0)
      # Index directly: the key is guaranteed present (same source map), and
      # the two-argument form of lookup() is deprecated.
      query_text = local.queries_encoded[egress_rule]
    })
  }
}

# Simply export a content file or folder and put the JSON file in ./assets/content.
# Since the query is so long (and critical) and is json-encoded, it's easier
# to configure it separately.
resource "sumologic_content" "main" {
  for_each  = var.sumologic_egress_thresholds
  parent_id = var.sumologic_parent_folder_id_hexadecimal
  # Index directly rather than lookup(..., "") -- both maps are keyed from the
  # same variable, so the key is always present; if that invariant ever breaks,
  # fail the plan loudly instead of silently creating content with an empty
  # config.
  config = local.egress_rule_to_config[each.key]
}

6 changes: 6 additions & 0 deletions modules/workbench/modules/egress_detection/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
output "egress_alert_rendered_queries" {
  description = "Rendered SumoLogic query for each egress rule, in human-readable form, keyed by rule name"
  # Bug fix: the previous for-expression yielded the ENTIRE queries_rendered
  # map once per rule key, producing a list of N identical copies of the whole
  # map. Expose the per-rule map itself instead.
  value = local.queries_rendered
}
21 changes: 21 additions & 0 deletions modules/workbench/modules/egress_detection/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Inputs are passed through from the parent workbench module; see the
# variables of the same names there. Block labels are quoted per modern
# Terraform (>= 0.12) convention; unquoted labels are legacy HCL1 syntax.

variable "aou_env" {
  description = "Short name (all lowercase) of All of Us Workbench deployed environments, e.g. local, test, staging, prod."
  type        = string
}

variable "sumologic_egress_thresholds" {
  description = "Configuration values for egress search content in SumoLogic, keyed by a name describing the tier and threshold configuration. Each value supplies tier_name, egress_threshold_mib, egress_window_sec, cron_expression, schedule_type, and time_range."
  type        = map(map(any))
  # Default to no egress rules, so the module is a no-op unless configured.
  default = {}
}

variable "sumologic_parent_folder_id_hexadecimal" {
  description = "The folder to create alerts within, in hexadecimal format. It is generated manually outside of Terraform."
  type        = string
}

variable "sumologic_webhook_id_hexadecimal" {
  description = "The webhook ID to send alert notifications to, in hexadecimal format. It is generated manually outside of Terraform."
  type        = string
}
8 changes: 8 additions & 0 deletions modules/workbench/modules/egress_detection/versions.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
terraform {
  required_providers {
    # Must match the provider address declared by the root workbench module
    # ("SumoLogic/sumologic"). The legacy "terraform-providers" namespace is
    # deprecated and resolves to a DIFFERENT provider address, which would
    # make Terraform 0.13+ try to install two distinct sumologic providers.
    sumologic = {
      source = "SumoLogic/sumologic"
    }
  }
  # required_providers with source addresses needs the 0.13+ syntax.
  required_version = ">= 0.13"
}
5 changes: 4 additions & 1 deletion modules/workbench/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

# Re-export the BigQuery dataset created by the reporting module.
output "bigquery_dataset" {
value = module.reporting.bigquery_dataset
}
Expand All @@ -10,3 +9,7 @@ output "bigquery_views" {
# Re-export the reporting table names created by the reporting module.
output "table_names" {
value = module.reporting.table_names
}

# Re-export the rendered SumoLogic egress queries so callers can inspect the
# generated query text without reaching into the child module.
output "egress_alert_rendered_queries" {
value = module.egress_detection.egress_alert_rendered_queries
}
11 changes: 11 additions & 0 deletions modules/workbench/providers.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ terraform {
google = {
source = "hashicorp/google"
}
sumologic = {
source = "SumoLogic/sumologic"
version = "2.6.0"
}
}
}

Expand All @@ -27,3 +31,10 @@ provider "google" {
// zone = var.zone
//}

# Define sensitive keys as env vars. All three must be absent from the provider block
# and exported in the environment for this to work.
# TODO(RW-6103): Integrate with Vault and let Terraform pull these secrets from Vault.
# $ export SUMOLOGIC_ACCESSID="your-access-id"
# $ export SUMOLOGIC_ACCESSKEY="your-access-key"
# $ export SUMOLOGIC_ENVIRONMENT=us2
provider "sumologic" {}
Loading

0 comments on commit cf5869e

Please sign in to comment.