Skip to content

Commit

Permalink
Adding base cirrus alarms
Browse files Browse the repository at this point in the history
  • Loading branch information
hectormachin committed Sep 27, 2024
1 parent 8d9ad18 commit f6472ff
Show file tree
Hide file tree
Showing 29 changed files with 561 additions and 113 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,6 @@ or by changing all infrastructure flags to false in your tfvars and performing a
deploy_vpc = false
deploy_vpc_search = false
deploy_log_archive = false
deploy_alarms = false
deploy_stac_server = false
deploy_analytics = false
deploy_titiler = false
Expand Down
9 changes: 5 additions & 4 deletions ci.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ public_subnets_az_to_id_map = {}
private_subnets_az_to_id_map = {}

##### ALARM VARIABLES ####
sns_topics_map = {}
cloudwatch_warning_alarms_map = {}
cloudwatch_critical_alarms_map = {}
sns_warning_subscriptions_map = {}
sns_critical_subscriptions_map = {}

Expand Down Expand Up @@ -153,6 +150,11 @@ cirrus_inputs = {
data_bucket = "cirrus-data-bucket-name"
payload_bucket = "cirrus-payload-bucket-name"
log_level = "DEBUG"
deploy_alarms = true
custom_alarms = {
warning = {}
critical = {}
}
process = {
sqs_timeout = 180
sqs_max_receive_count = 5
Expand Down Expand Up @@ -218,7 +220,6 @@ cirrus_dashboard_inputs = {
deploy_vpc = false
deploy_vpc_search = true
deploy_log_archive = true
deploy_alarms = false
deploy_stac_server_opensearch_serverless = true
deploy_stac_server = true
deploy_stac_server_outside_vpc = false
Expand Down
9 changes: 5 additions & 4 deletions default.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ public_subnets_az_to_id_map = {}
private_subnets_az_to_id_map = {}

##### ALARM VARIABLES ####
sns_topics_map = {}
cloudwatch_warning_alarms_map = {}
cloudwatch_critical_alarms_map = {}
sns_warning_subscriptions_map = {}
sns_critical_subscriptions_map = {}

Expand Down Expand Up @@ -157,6 +154,11 @@ cirrus_inputs = {
data_bucket = "cirrus-data-bucket-name"
payload_bucket = "cirrus-payload-bucket-name"
log_level = "DEBUG"
deploy_alarms = true
custom_alarms = {
warning = {}
critical = {}
}
process = {
sqs_timeout = 180
sqs_max_receive_count = 5
Expand Down Expand Up @@ -222,7 +224,6 @@ cirrus_dashboard_inputs = {
deploy_vpc = false
deploy_vpc_search = true
deploy_log_archive = true
deploy_alarms = false
deploy_stac_server_opensearch_serverless = false
deploy_stac_server = true
deploy_stac_server_outside_vpc = false
Expand Down
4 changes: 0 additions & 4 deletions filmdrop.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ module "filmdrop" {
public_subnets_az_to_id_map = var.public_subnets_az_to_id_map
private_subnets_az_to_id_map = var.private_subnets_az_to_id_map
security_group_id = var.security_group_id
sns_topics_map = var.sns_topics_map
cloudwatch_warning_alarms_map = var.cloudwatch_warning_alarms_map
cloudwatch_critical_alarms_map = var.cloudwatch_critical_alarms_map
sns_warning_subscriptions_map = var.sns_warning_subscriptions_map
sns_critical_subscriptions_map = var.sns_critical_subscriptions_map
s3_access_log_bucket = var.s3_access_log_bucket
Expand All @@ -30,7 +27,6 @@ module "filmdrop" {
deploy_vpc = var.deploy_vpc
deploy_vpc_search = var.deploy_vpc_search
deploy_log_archive = var.deploy_log_archive
deploy_alarms = var.deploy_alarms
deploy_stac_server = var.deploy_stac_server
deploy_stac_server_opensearch_serverless = var.deploy_stac_server_opensearch_serverless
deploy_stac_server_outside_vpc = var.deploy_stac_server_outside_vpc
Expand Down
31 changes: 10 additions & 21 deletions inputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,6 @@ variable "security_group_id" {
default = ""
}

# Free-form map input; defaults to an empty map when the caller sets nothing.
# NOTE(review): presumably SNS topic definitions keyed by topic name — confirm
# against the module that consumes it.
variable "sns_topics_map" {
  default = {}
  type    = map(any)
}

# Free-form map input; defaults to an empty map when the caller sets nothing.
# NOTE(review): presumably warning-level CloudWatch alarm definitions keyed by
# alarm name — confirm against the module that consumes it.
variable "cloudwatch_warning_alarms_map" {
  default = {}
  type    = map(any)
}

# Free-form map input; defaults to an empty map when the caller sets nothing.
# NOTE(review): presumably critical-level CloudWatch alarm definitions keyed by
# alarm name — confirm against the module that consumes it.
variable "cloudwatch_critical_alarms_map" {
  default = {}
  type    = map(any)
}

variable "sns_warning_subscriptions_map" {
type = map(any)
default = {}
Expand Down Expand Up @@ -354,6 +339,11 @@ variable "cirrus_inputs" {
data_bucket = string
payload_bucket = string
log_level = string
deploy_alarms = bool
custom_alarms = object({
warning = map(any)
critical = map(any)
})
process = object({
sqs_timeout = number
sqs_max_receive_count = number
Expand Down Expand Up @@ -388,6 +378,11 @@ variable "cirrus_inputs" {
data_bucket = "cirrus-data-bucket-name"
payload_bucket = "cirrus-payload-bucket-name"
log_level = "INFO"
deploy_alarms = true
custom_alarms = {
warning = {}
critical = {}
}
process = {
sqs_timeout = 180
sqs_max_receive_count = 5
Expand Down Expand Up @@ -495,12 +490,6 @@ variable "deploy_log_archive" {
description = "Deploy FilmDrop Log Archive Bucket"
}

# Boolean feature flag; off unless the caller explicitly enables it.
variable "deploy_alarms" {
  description = "Deploy FilmDrop Alarms stack"
  default     = false
  type        = bool
}

variable "deploy_stac_server" {
type = bool
default = true
Expand Down
6 changes: 6 additions & 0 deletions modules/base_infra/sns/sns.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,10 @@ resource "aws_sns_topic_policy" "sns_topic_polcies" {

arn = aws_sns_topic.sns_topics[each.key].arn
policy = templatefile(lookup(each.value, "policy_file_path_name", local.default_sns_policy_file_path_name), { resource = aws_sns_topic.sns_topics[each.key].arn, account_id = data.aws_caller_identity.current.account_id })

lifecycle {
ignore_changes = [
policy
]
}
}
84 changes: 84 additions & 0 deletions modules/cirrus/base/db.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,87 @@ resource "aws_timestreamwrite_table" "cirrus_state_event_timestreamwrite_table"
memory_store_retention_period_in_hours = var.cirrus_timestream_memory_store_retention_period_in_hours
}
}

# Warning alarm on the AWS/DynamoDB SystemErrors metric for the Cirrus state
# table. Created only when var.deploy_alarms is true.
#
# NOTE(review): confirm that DynamoDB publishes SystemErrors with TableName as
# the sole dimension. If the metric is only emitted with additional dimensions
# (e.g. Operation), this filter matches no metric stream and, because missing
# data is treated as notBreaching, the alarm can never leave OK — TODO verify
# against the CloudWatch metric list for the table.
resource "aws_cloudwatch_metric_alarm" "cirrus_state_event_system_errors_warning_alarm" {
  count = var.deploy_alarms ? 1 : 0

  # Identity
  alarm_name        = "WARNING: ${var.cirrus_prefix}-state DynamoDB System Errors Warning Alarm"
  alarm_description = "${var.cirrus_prefix} Cirrus State DynamoDB System Errors Warning Alarm"

  # Metric being watched
  namespace   = "AWS/DynamoDB"
  metric_name = "SystemErrors"
  statistic   = "Sum"
  period      = 60

  dimensions = {
    TableName = aws_dynamodb_table.cirrus_state_dynamodb_table.name
  }

  # Breach condition: Sum >= 1 in a single 60-second period.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Notify the warning topic on both ALARM and OK transitions; nothing is sent
  # on INSUFFICIENT_DATA.
  alarm_actions             = [var.warning_sns_topic_arn]
  ok_actions                = [var.warning_sns_topic_arn]
  insufficient_data_actions = []
}

# Warning alarm on the AWS/DynamoDB UserErrors metric (requests answered with
# HTTP 400). Created only when var.deploy_alarms is true.
#
# DynamoDB publishes UserErrors as an aggregate for the current AWS account and
# region; it is not emitted with a TableName dimension. Filtering on TableName
# therefore selects a metric stream that never receives data, and because
# treat_missing_data is "notBreaching" the alarm would stay in OK forever.
# The dimension filter is deliberately omitted so the alarm can actually fire.
#
# Consequence: this alarm reflects user errors against *any* DynamoDB table in
# the account/region, not only the Cirrus state table. The alarm name and
# description are kept unchanged so existing alarms are updated in place rather
# than replaced.
resource "aws_cloudwatch_metric_alarm" "cirrus_state_user_errors_warning_alarm" {
  count = var.deploy_alarms ? 1 : 0

  # Identity
  alarm_name        = "WARNING: ${var.cirrus_prefix}-state DynamoDB User Errors Warning Alarm"
  alarm_description = "${var.cirrus_prefix} Cirrus State DynamoDB User Errors Warning Alarm"

  # Metric being watched (account/region level: no dimensions).
  namespace   = "AWS/DynamoDB"
  metric_name = "UserErrors"
  statistic   = "Sum"
  period      = 60

  # Breach condition: Sum >= 1 in a single 60-second period.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Notify the warning topic on both ALARM and OK transitions; nothing is sent
  # on INSUFFICIENT_DATA.
  alarm_actions             = [var.warning_sns_topic_arn]
  ok_actions                = [var.warning_sns_topic_arn]
  insufficient_data_actions = []
}

# Warning alarm on the AWS/Timestream SystemErrors metric for the Cirrus state
# event database. Created only when var.deploy_alarms is true.
#
# NOTE(review): confirm that Timestream publishes SystemErrors with
# DatabaseName as the sole dimension. If the metric is only emitted under a
# different dimension set (e.g. Operation), this filter matches nothing and,
# with missing data treated as notBreaching, the alarm can never leave OK —
# TODO verify against the CloudWatch metric list.
resource "aws_cloudwatch_metric_alarm" "cirrus_state_events_system_errors_warning_alarm" {
  count = var.deploy_alarms ? 1 : 0

  # Identity
  alarm_name        = "WARNING: ${var.cirrus_prefix}-state-events Timestream Events System Errors Warning Alarm"
  alarm_description = "${var.cirrus_prefix} Cirrus State Timestream Events System Errors Warning Alarm"

  # Metric being watched
  namespace   = "AWS/Timestream"
  metric_name = "SystemErrors"
  statistic   = "Sum"
  period      = 60

  dimensions = {
    DatabaseName = aws_timestreamwrite_database.cirrus_state_event_timestreamwrite_database.database_name
  }

  # Breach condition: Sum >= 1 in a single 60-second period.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Notify the warning topic on both ALARM and OK transitions; nothing is sent
  # on INSUFFICIENT_DATA.
  alarm_actions             = [var.warning_sns_topic_arn]
  ok_actions                = [var.warning_sns_topic_arn]
  insufficient_data_actions = []
}

# Warning alarm on the AWS/Timestream UserErrors metric for the Cirrus state
# event database. Created only when var.deploy_alarms is true.
#
# NOTE(review): confirm that Timestream publishes UserErrors with DatabaseName
# as the sole dimension. If the metric is only emitted under a different
# dimension set (e.g. Operation), this filter matches nothing and, with missing
# data treated as notBreaching, the alarm can never leave OK — TODO verify
# against the CloudWatch metric list.
resource "aws_cloudwatch_metric_alarm" "cirrus_state_events_user_errors_warning_alarm" {
  count = var.deploy_alarms ? 1 : 0

  # Identity
  alarm_name        = "WARNING: ${var.cirrus_prefix}-state-events Timestream Events User Errors Warning Alarm"
  alarm_description = "${var.cirrus_prefix} Cirrus State Timestream Events User Errors Warning Alarm"

  # Metric being watched
  namespace   = "AWS/Timestream"
  metric_name = "UserErrors"
  statistic   = "Sum"
  period      = 60

  dimensions = {
    DatabaseName = aws_timestreamwrite_database.cirrus_state_event_timestreamwrite_database.database_name
  }

  # Breach condition: Sum >= 1 in a single 60-second period.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Notify the warning topic on both ALARM and OK transitions; nothing is sent
  # on INSUFFICIENT_DATA.
  alarm_actions             = [var.warning_sns_topic_arn]
  ok_actions                = [var.warning_sns_topic_arn]
  insufficient_data_actions = []
}
11 changes: 11 additions & 0 deletions modules/cirrus/base/inputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,14 @@ variable "cirrus_payload_bucket" {
description = "Cirrus payload bucket"
type = string
}

# Required string input (no default is declared). Within this module it is used
# as the alarm/OK action target of the warning-level CloudWatch alarms.
variable "warning_sns_topic_arn" {
  type        = string
  description = "String with FilmDrop Warning SNS topic ARN"
}

# Boolean toggle, enabled by default. The alarm resources in this module use it
# as their `count` condition, so setting it to false creates none of them.
variable "deploy_alarms" {
  description = "Deploy Cirrus Alarms stack"
  default     = true
  type        = bool
}
42 changes: 42 additions & 0 deletions modules/cirrus/base/sns.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,45 @@ resource "aws_sns_topic" "cirrus_publish_sns_topic" {
# SNS topic named "<cirrus_prefix>-workflow-event".
# NOTE(review): presumably receives Cirrus workflow event notifications —
# confirm against the publishers/subscribers defined elsewhere.
resource "aws_sns_topic" "cirrus_workflow_event_sns_topic" {
name = "${var.cirrus_prefix}-workflow-event"
}

# Warning alarm on failed deliveries (AWS/SNS NumberOfNotificationsFailed) for
# the Cirrus publish topic. Created only when var.deploy_alarms is true.
resource "aws_cloudwatch_metric_alarm" "cirrus_publish_sns_topic_notifications_failed_warning_alarm" {
  count = var.deploy_alarms ? 1 : 0

  # Identity
  alarm_name        = "WARNING: ${var.cirrus_prefix}-publish SNS Topic Notifications Failed Warning Alarm"
  alarm_description = "${var.cirrus_prefix}-publish SNS Topic Notifications Failed Warning Alarm"

  # Metric being watched
  namespace   = "AWS/SNS"
  metric_name = "NumberOfNotificationsFailed"
  statistic   = "Sum"
  period      = 60

  dimensions = {
    TopicName = aws_sns_topic.cirrus_publish_sns_topic.name
  }

  # Breach condition: Sum >= 1 in a single 60-second period.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Notify the warning topic on both ALARM and OK transitions; nothing is sent
  # on INSUFFICIENT_DATA.
  alarm_actions             = [var.warning_sns_topic_arn]
  ok_actions                = [var.warning_sns_topic_arn]
  insufficient_data_actions = []
}

# Warning alarm on failed deliveries (AWS/SNS NumberOfNotificationsFailed) for
# the Cirrus workflow-event topic. Created only when var.deploy_alarms is true.
resource "aws_cloudwatch_metric_alarm" "cirrus_workflow_event_sns_topic_notifications_failed_warning_alarm" {
  count = var.deploy_alarms ? 1 : 0

  # Identity
  alarm_name        = "WARNING: ${var.cirrus_prefix}-workflow-event SNS Topic Notifications Failed Warning Alarm"
  alarm_description = "${var.cirrus_prefix}-workflow-event SNS Topic Notifications Failed Warning Alarm"

  # Metric being watched
  namespace   = "AWS/SNS"
  metric_name = "NumberOfNotificationsFailed"
  statistic   = "Sum"
  period      = 60

  dimensions = {
    TopicName = aws_sns_topic.cirrus_workflow_event_sns_topic.name
  }

  # Breach condition: Sum >= 1 in a single 60-second period.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Notify the warning topic on both ALARM and OK transitions; nothing is sent
  # on INSUFFICIENT_DATA.
  alarm_actions             = [var.warning_sns_topic_arn]
  ok_actions                = [var.warning_sns_topic_arn]
  insufficient_data_actions = []
}
21 changes: 21 additions & 0 deletions modules/cirrus/base/sqs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,24 @@ resource "aws_sqs_queue" "cirrus_process_dead_letter_sqs_queue" {
# SQS queue named "<cirrus_prefix>-update-state-dead-letter".
# NOTE(review): by its name this is the dead-letter queue for the update-state
# path — the redrive wiring is not visible here; confirm against its consumers.
resource "aws_sqs_queue" "cirrus_update_state_dead_letter_sqs_queue" {
name = "${var.cirrus_prefix}-update-state-dead-letter"
}

# Warning alarm that trips as soon as the update-state dead-letter queue holds
# at least one visible message (AWS/SQS ApproximateNumberOfMessagesVisible).
# Created only when var.deploy_alarms is true.
resource "aws_cloudwatch_metric_alarm" "cirrus_update_state_dead_letter_sqs_queue_warning_alarm" {
  count = var.deploy_alarms ? 1 : 0

  # Identity
  alarm_name        = "WARNING: ${var.cirrus_prefix}-update-state-dead-letter SQS DLQ Warning Alarm"
  alarm_description = "${var.cirrus_prefix}-update-state-dead-letter DLQ Warning Alarm"

  # Metric being watched
  namespace   = "AWS/SQS"
  metric_name = "ApproximateNumberOfMessagesVisible"
  statistic   = "Sum"
  period      = 60

  dimensions = {
    QueueName = aws_sqs_queue.cirrus_update_state_dead_letter_sqs_queue.name
  }

  # Breach condition: Sum >= 1 in a single 60-second period.
  comparison_operator = "GreaterThanOrEqualToThreshold"
  threshold           = 1
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Notify the warning topic on both ALARM and OK transitions; nothing is sent
  # on INSUFFICIENT_DATA.
  alarm_actions             = [var.warning_sns_topic_arn]
  ok_actions                = [var.warning_sns_topic_arn]
  insufficient_data_actions = []
}
2 changes: 2 additions & 0 deletions modules/cirrus/builtins.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ module "base-builtins" {
cirrus_timestream_memory_store_retention_period_in_hours = var.cirrus_timestream_memory_store_retention_period_in_hours
cirrus_data_bucket = var.cirrus_data_bucket
cirrus_payload_bucket = var.cirrus_payload_bucket
warning_sns_topic_arn = var.warning_sns_topic_arn
deploy_alarms = var.deploy_alarms
}
3 changes: 3 additions & 0 deletions modules/cirrus/functions.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,7 @@ module "functions" {
cirrus_process_sqs_queue_arn = module.base-builtins.cirrus_process_sqs_queue_arn
cirrus_process_sqs_queue_url = module.base-builtins.cirrus_process_sqs_queue_url
cirrus_update_state_dead_letter_sqs_queue_arn = module.base-builtins.cirrus_update_state_dead_letter_sqs_queue_arn
warning_sns_topic_arn = var.warning_sns_topic_arn
critical_sns_topic_arn = var.critical_sns_topic_arn
deploy_alarms = var.deploy_alarms
}
Loading

0 comments on commit f6472ff

Please sign in to comment.