diff --git a/README.md b/README.md
index fe75f618..2a4a012e 100644
--- a/README.md
+++ b/README.md
@@ -237,7 +237,6 @@ or by changing all infrastructure flags to false in your tfvars and performing a
 deploy_vpc = false
 deploy_vpc_search = false
 deploy_log_archive = false
-deploy_alarms = false
 deploy_stac_server = false
 deploy_analytics = false
 deploy_titiler = false
diff --git a/ci.tfvars b/ci.tfvars
index 9df66b24..0e9c322a 100644
--- a/ci.tfvars
+++ b/ci.tfvars
@@ -13,9 +13,6 @@ public_subnets_az_to_id_map = {}
 private_subnets_az_to_id_map = {}
 
 ##### ALARM VARIABLES ####
-sns_topics_map = {}
-cloudwatch_warning_alarms_map = {}
-cloudwatch_critical_alarms_map = {}
 sns_warning_subscriptions_map = {}
 sns_critical_subscriptions_map = {}
 
@@ -153,6 +150,11 @@ cirrus_inputs = {
   data_bucket = "cirrus-data-bucket-name"
   payload_bucket = "cirrus-payload-bucket-name"
   log_level = "DEBUG"
+  deploy_alarms = true
+  custom_alarms = {
+    warning = {}
+    critical = {}
+  }
   process = {
     sqs_timeout = 180
     sqs_max_receive_count = 5
@@ -218,7 +220,6 @@ cirrus_dashboard_inputs = {
 deploy_vpc = false
 deploy_vpc_search = true
 deploy_log_archive = true
-deploy_alarms = false
 deploy_stac_server_opensearch_serverless = true
 deploy_stac_server = true
 deploy_stac_server_outside_vpc = false
diff --git a/default.tfvars b/default.tfvars
index ef4cf5dc..100a0e09 100644
--- a/default.tfvars
+++ b/default.tfvars
@@ -15,9 +15,6 @@ public_subnets_az_to_id_map = {}
 private_subnets_az_to_id_map = {}
 
 ##### ALARM VARIABLES ####
-sns_topics_map = {}
-cloudwatch_warning_alarms_map = {}
-cloudwatch_critical_alarms_map = {}
 sns_warning_subscriptions_map = {}
 sns_critical_subscriptions_map = {}
 
@@ -157,6 +154,11 @@ cirrus_inputs = {
   data_bucket = "cirrus-data-bucket-name"
   payload_bucket = "cirrus-payload-bucket-name"
   log_level = "DEBUG"
+  deploy_alarms = true
+  custom_alarms = {
+    warning = {}
+    critical = {}
+  }
   process = {
     sqs_timeout = 180
     sqs_max_receive_count = 5
@@ -222,7 +224,6 @@ cirrus_dashboard_inputs = {
 deploy_vpc = false
 deploy_vpc_search = true
 deploy_log_archive = true
-deploy_alarms = false
 deploy_stac_server_opensearch_serverless = false
 deploy_stac_server = true
 deploy_stac_server_outside_vpc = false
diff --git a/filmdrop.tf b/filmdrop.tf
index 28c01779..29ba5f69 100644
--- a/filmdrop.tf
+++ b/filmdrop.tf
@@ -13,9 +13,6 @@ module "filmdrop" {
   public_subnets_az_to_id_map = var.public_subnets_az_to_id_map
   private_subnets_az_to_id_map = var.private_subnets_az_to_id_map
   security_group_id = var.security_group_id
-  sns_topics_map = var.sns_topics_map
-  cloudwatch_warning_alarms_map = var.cloudwatch_warning_alarms_map
-  cloudwatch_critical_alarms_map = var.cloudwatch_critical_alarms_map
   sns_warning_subscriptions_map = var.sns_warning_subscriptions_map
   sns_critical_subscriptions_map = var.sns_critical_subscriptions_map
   s3_access_log_bucket = var.s3_access_log_bucket
@@ -30,7 +27,6 @@ module "filmdrop" {
   deploy_vpc = var.deploy_vpc
   deploy_vpc_search = var.deploy_vpc_search
   deploy_log_archive = var.deploy_log_archive
-  deploy_alarms = var.deploy_alarms
   deploy_stac_server = var.deploy_stac_server
   deploy_stac_server_opensearch_serverless = var.deploy_stac_server_opensearch_serverless
   deploy_stac_server_outside_vpc = var.deploy_stac_server_outside_vpc
diff --git a/inputs.tf b/inputs.tf
index 6b96f40b..167439a3 100644
--- a/inputs.tf
+++ b/inputs.tf
@@ -46,21 +46,6 @@ variable "security_group_id" {
   default = ""
 }
 
-variable "sns_topics_map" {
-  type = map(any)
-  default = {}
-}
-
-variable "cloudwatch_warning_alarms_map" {
-  type = map(any)
-  default = {}
-}
-
-variable "cloudwatch_critical_alarms_map" {
-  type = map(any)
-  default = {}
-}
-
 variable "sns_warning_subscriptions_map" {
   type = map(any)
   default = {}
@@ -354,6 +339,11 @@ variable "cirrus_inputs" {
     data_bucket = string
     payload_bucket = string
     log_level = string
+    deploy_alarms = bool
+    custom_alarms = object({
+      warning = map(any)
+      critical = map(any)
+    })
     process = object({
       sqs_timeout = number
       sqs_max_receive_count = number
@@ -388,6 +378,11 @@ variable "cirrus_inputs" {
     data_bucket = "cirrus-data-bucket-name"
     payload_bucket = "cirrus-payload-bucket-name"
     log_level = "INFO"
+    deploy_alarms = true
+    custom_alarms = {
+      warning = {}
+      critical = {}
+    }
     process = {
       sqs_timeout = 180
       sqs_max_receive_count = 5
@@ -495,12 +490,6 @@ variable "deploy_log_archive" {
   description = "Deploy FilmDrop Log Archive Bucket"
 }
 
-variable "deploy_alarms" {
-  type = bool
-  default = false
-  description = "Deploy FilmDrop Alarms stack"
-}
-
 variable "deploy_stac_server" {
   type = bool
   default = true
diff --git a/modules/base_infra/sns/sns.tf b/modules/base_infra/sns/sns.tf
index 2a9c337b..ee3c0887 100644
--- a/modules/base_infra/sns/sns.tf
+++ b/modules/base_infra/sns/sns.tf
@@ -9,4 +9,10 @@ resource "aws_sns_topic_policy" "sns_topic_polcies" {
 
   arn = aws_sns_topic.sns_topics[each.key].arn
   policy = templatefile(lookup(each.value, "policy_file_path_name", local.default_sns_policy_file_path_name), { resource = aws_sns_topic.sns_topics[each.key].arn, account_id = data.aws_caller_identity.current.account_id })
+
+  lifecycle {
+    ignore_changes = [
+      policy
+    ]
+  }
 }
diff --git a/modules/cirrus/base/db.tf b/modules/cirrus/base/db.tf
index beaf1ca2..d4b6bfd9 100644
--- a/modules/cirrus/base/db.tf
+++ b/modules/cirrus/base/db.tf
@@ -52,3 +52,87 @@ resource "aws_timestreamwrite_table" "cirrus_state_event_timestreamwrite_table"
     memory_store_retention_period_in_hours = var.cirrus_timestream_memory_store_retention_period_in_hours
   }
 }
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_state_event_system_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-state DynamoDB System Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 1
+  metric_name               = "SystemErrors"
+  namespace                 = "AWS/DynamoDB"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 1
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix} Cirrus State DynamoDB System Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    TableName = aws_dynamodb_table.cirrus_state_dynamodb_table.name # NOTE(review): confirm SystemErrors is published per-TableName without an Operation dimension
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_state_user_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-state DynamoDB User Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 1
+  metric_name               = "UserErrors"
+  namespace                 = "AWS/DynamoDB"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 1
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix} Cirrus State DynamoDB User Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    TableName = aws_dynamodb_table.cirrus_state_dynamodb_table.name # NOTE(review): AWS docs describe UserErrors as account/region-wide; confirm a TableName dimension matches any datapoints
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_state_events_system_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-state-events Timestream Events System Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 1
+  metric_name               = "SystemErrors"
+  namespace                 = "AWS/Timestream"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 1
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix} Cirrus State Timestream Events System Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    DatabaseName = aws_timestreamwrite_database.cirrus_state_event_timestreamwrite_database.database_name # NOTE(review): confirm AWS/Timestream publishes SystemErrors with DatabaseName as the only dimension
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_state_events_user_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-state-events Timestream Events User Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 1
+  metric_name               = "UserErrors"
+  namespace                 = "AWS/Timestream"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 1
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix} Cirrus State Timestream Events User Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    DatabaseName = aws_timestreamwrite_database.cirrus_state_event_timestreamwrite_database.database_name # NOTE(review): confirm AWS/Timestream publishes UserErrors with DatabaseName as the only dimension
+  }
+}
diff --git a/modules/cirrus/base/inputs.tf b/modules/cirrus/base/inputs.tf
index e969b260..c8bbd9b0 100644
--- a/modules/cirrus/base/inputs.tf
+++ b/modules/cirrus/base/inputs.tf
@@ -36,3 +36,14 @@ variable "cirrus_payload_bucket" {
   description = "Cirrus payload bucket"
   type = string
 }
+
+variable "warning_sns_topic_arn" {
+  description = "String with FilmDrop Warning SNS topic ARN"
+  type        = string
+}
+
+variable "deploy_alarms" {
+  type        = bool
+  default     = true
+  description = "Deploy Cirrus Alarms stack"
+}
diff --git a/modules/cirrus/base/sns.tf b/modules/cirrus/base/sns.tf
index 75a3e40b..5f6fedae 100644
--- a/modules/cirrus/base/sns.tf
+++ b/modules/cirrus/base/sns.tf
@@ -5,3 +5,45 @@ resource "aws_sns_topic" "cirrus_publish_sns_topic" {
 resource "aws_sns_topic" "cirrus_workflow_event_sns_topic" {
   name = "${var.cirrus_prefix}-workflow-event"
 }
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_publish_sns_topic_notifications_failed_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-publish SNS Topic Notifications Failed Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 1
+  metric_name               = "NumberOfNotificationsFailed"
+  namespace                 = "AWS/SNS"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 1
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-publish SNS Topic Notifications Failed Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    TopicName = aws_sns_topic.cirrus_publish_sns_topic.name
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_workflow_event_sns_topic_notifications_failed_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-workflow-event SNS Topic Notifications Failed Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 1
+  metric_name               = "NumberOfNotificationsFailed"
+  namespace                 = "AWS/SNS"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 1
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-workflow-event SNS Topic Notifications Failed Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    TopicName = aws_sns_topic.cirrus_workflow_event_sns_topic.name
+  }
+}
diff --git a/modules/cirrus/base/sqs.tf b/modules/cirrus/base/sqs.tf
index abfd0994..79e2c544 100644
--- a/modules/cirrus/base/sqs.tf
+++ b/modules/cirrus/base/sqs.tf
@@ -15,3 +15,24 @@ resource "aws_sqs_queue" "cirrus_process_dead_letter_sqs_queue" {
 resource "aws_sqs_queue" "cirrus_update_state_dead_letter_sqs_queue" {
   name = "${var.cirrus_prefix}-update-state-dead-letter"
 }
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_update_state_dead_letter_sqs_queue_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-update-state-dead-letter SQS DLQ Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 1
+  metric_name               = "ApproximateNumberOfMessagesVisible"
+  namespace                 = "AWS/SQS"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 1
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-update-state-dead-letter DLQ Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    QueueName = aws_sqs_queue.cirrus_update_state_dead_letter_sqs_queue.name
+  }
+}
diff --git a/modules/cirrus/builtins.tf b/modules/cirrus/builtins.tf
index 519b67c7..68166389 100644
--- a/modules/cirrus/builtins.tf
+++ b/modules/cirrus/builtins.tf
@@ -8,4 +8,6 @@ module "base-builtins" {
   cirrus_timestream_memory_store_retention_period_in_hours = var.cirrus_timestream_memory_store_retention_period_in_hours
   cirrus_data_bucket = var.cirrus_data_bucket
   cirrus_payload_bucket = var.cirrus_payload_bucket
+  warning_sns_topic_arn = var.warning_sns_topic_arn
+  deploy_alarms = var.deploy_alarms
 }
diff --git a/modules/cirrus/functions.tf b/modules/cirrus/functions.tf
index 5c3ef643..11b84040 100644
--- a/modules/cirrus/functions.tf
+++ b/modules/cirrus/functions.tf
@@ -27,4 +27,7 @@ module "functions" {
   cirrus_process_sqs_queue_arn = module.base-builtins.cirrus_process_sqs_queue_arn
   cirrus_process_sqs_queue_url = module.base-builtins.cirrus_process_sqs_queue_url
   cirrus_update_state_dead_letter_sqs_queue_arn = module.base-builtins.cirrus_update_state_dead_letter_sqs_queue_arn
+  warning_sns_topic_arn = var.warning_sns_topic_arn
+  critical_sns_topic_arn = var.critical_sns_topic_arn
+  deploy_alarms = var.deploy_alarms
 }
diff --git a/modules/cirrus/functions/api.tf b/modules/cirrus/functions/api.tf
index 4271c867..6e9874f3 100644
--- a/modules/cirrus/functions/api.tf
+++ b/modules/cirrus/functions/api.tf
@@ -287,3 +287,87 @@ resource "aws_iam_role_policy_attachment" "cirrus_api_gw_base_policy" {
   role = aws_iam_role.cirrus_api_gw_role.name
   policy_arn = aws_iam_policy.cirrus_api_gw_policy.arn
 }
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_api_lambda_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-api Lambda Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 10
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-api Cirrus API Lambda Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_api.function_name
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_api_lambda_errors_critical_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "CRITICAL: ${var.cirrus_prefix}-api Lambda Errors Critical Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 100
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-api Cirrus API Lambda Errors Critical Alarm"
+  alarm_actions             = [var.critical_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_api.function_name
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_api_gw_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${aws_api_gateway_rest_api.cirrus_api_gateway.name} API Gateway 5XX Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "5XXError"
+  namespace                 = "AWS/ApiGateway"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 10
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${aws_api_gateway_rest_api.cirrus_api_gateway.name} Cirrus API Gateway 5XX Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    ApiName = aws_api_gateway_rest_api.cirrus_api_gateway.name # AWS/ApiGateway REST metrics are keyed by ApiName; FunctionName is a Lambda dimension
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_api_gw_errors_critical_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "CRITICAL: ${aws_api_gateway_rest_api.cirrus_api_gateway.name} API Gateway 5XX Errors Critical Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "5XXError"
+  namespace                 = "AWS/ApiGateway"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 100
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${aws_api_gateway_rest_api.cirrus_api_gateway.name} Cirrus API Gateway 5XX Errors Critical Alarm"
+  alarm_actions             = [var.critical_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    ApiName = aws_api_gateway_rest_api.cirrus_api_gateway.name # AWS/ApiGateway REST metrics are keyed by ApiName; FunctionName is a Lambda dimension
+  }
+}
diff --git a/modules/cirrus/functions/inputs.tf b/modules/cirrus/functions/inputs.tf
index 08e96b65..9cc0da7d 100644
--- a/modules/cirrus/functions/inputs.tf
+++ b/modules/cirrus/functions/inputs.tf
@@ -157,3 +157,19 @@ variable "cirrus_api_stage_description" {
   type = string
   default = ""
 }
+
+variable "warning_sns_topic_arn" {
+  description = "String with FilmDrop Warning SNS topic ARN"
+  type        = string
+}
+
+variable "critical_sns_topic_arn" {
+  description = "String with FilmDrop Critical SNS topic ARN"
+  type        = string
+}
+
+variable "deploy_alarms" {
+  type        = bool
+  default     = true
+  description = "Deploy Cirrus Alarms stack"
+}
diff --git a/modules/cirrus/functions/post-batch.tf b/modules/cirrus/functions/post-batch.tf
index 3c8740d0..2d2cff9d 100644
--- a/modules/cirrus/functions/post-batch.tf
+++ b/modules/cirrus/functions/post-batch.tf
@@ -98,3 +98,45 @@ resource "aws_lambda_function" "cirrus_post_batch" {
     subnet_ids = var.vpc_subnet_ids
   }
 }
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_post_batch_lambda_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-post-batch Lambda Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 10
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-post-batch Cirrus Post-Batch Lambda Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_post_batch.function_name
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_post_batch_lambda_errors_critical_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "CRITICAL: ${var.cirrus_prefix}-post-batch Lambda Errors Critical Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 100
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-post-batch Cirrus Post-Batch Lambda Errors Critical Alarm"
+  alarm_actions             = [var.critical_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_post_batch.function_name
+  }
+}
diff --git a/modules/cirrus/functions/pre-batch.tf b/modules/cirrus/functions/pre-batch.tf
index f32c391f..3ea1cb60 100644
--- a/modules/cirrus/functions/pre-batch.tf
+++ b/modules/cirrus/functions/pre-batch.tf
@@ -91,3 +91,45 @@ resource "aws_lambda_function" "cirrus_pre_batch" {
     subnet_ids = var.vpc_subnet_ids
   }
 }
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_pre_batch_lambda_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-pre-batch Lambda Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 10
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-pre-batch Cirrus Pre-Batch Lambda Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_pre_batch.function_name
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_pre_batch_lambda_errors_critical_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "CRITICAL: ${var.cirrus_prefix}-pre-batch Lambda Errors Critical Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 100
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-pre-batch Cirrus Pre-Batch Lambda Errors Critical Alarm"
+  alarm_actions             = [var.critical_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_pre_batch.function_name
+  }
+}
diff --git a/modules/cirrus/functions/process.tf b/modules/cirrus/functions/process.tf
index 7dee649c..a678736f 100644
--- a/modules/cirrus/functions/process.tf
+++ b/modules/cirrus/functions/process.tf
@@ -163,3 +163,45 @@ resource "aws_lambda_permission" "cirrus_process_sqs_lambda_permission" {
   principal = "sqs.amazonaws.com"
   source_arn = var.cirrus_process_sqs_queue_arn
 }
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_process_lambda_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-process Lambda Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 10
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-process Cirrus Process Lambda Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_process.function_name
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_process_lambda_errors_critical_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "CRITICAL: ${var.cirrus_prefix}-process Lambda Errors Critical Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 100
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-process Cirrus Process Lambda Errors Critical Alarm"
+  alarm_actions             = [var.critical_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_process.function_name
+  }
+}
diff --git a/modules/cirrus/functions/update-state.tf b/modules/cirrus/functions/update-state.tf
index 8c5e8c21..d4e63f3c 100644
--- a/modules/cirrus/functions/update-state.tf
+++ b/modules/cirrus/functions/update-state.tf
@@ -180,3 +180,45 @@ resource "aws_lambda_permission" "cirrus_update_state_event_bridge_lambda_permis
   principal = "events.amazonaws.com"
   source_arn = aws_cloudwatch_event_rule.cirrus_update_state_rule.arn
 }
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_update_state_lambda_errors_warning_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "WARNING: ${var.cirrus_prefix}-update-state Lambda Errors Warning Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 10
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-update-state Cirrus Update-State Lambda Errors Warning Alarm"
+  alarm_actions             = [var.warning_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_update_state.function_name
+  }
+}
+
+resource "aws_cloudwatch_metric_alarm" "cirrus_update_state_lambda_errors_critical_alarm" {
+  count                     = var.deploy_alarms ? 1 : 0
+  alarm_name                = "CRITICAL: ${var.cirrus_prefix}-update-state Lambda Errors Critical Alarm"
+  comparison_operator       = "GreaterThanOrEqualToThreshold"
+  evaluation_periods        = 5
+  metric_name               = "Errors"
+  namespace                 = "AWS/Lambda"
+  period                    = 60
+  statistic                 = "Sum"
+  threshold                 = 100
+  treat_missing_data        = "notBreaching"
+  alarm_description         = "${var.cirrus_prefix}-update-state Cirrus Update-State Lambda Errors Critical Alarm"
+  alarm_actions             = [var.critical_sns_topic_arn]
+  ok_actions                = [var.warning_sns_topic_arn]
+  insufficient_data_actions = []
+
+  dimensions = {
+    FunctionName = aws_lambda_function.cirrus_update_state.function_name
+  }
+}
diff --git a/modules/cirrus/inputs.tf b/modules/cirrus/inputs.tf
index 1512018d..c8bbc9d6 100644
--- a/modules/cirrus/inputs.tf
+++ b/modules/cirrus/inputs.tf
@@ -131,3 +131,31 @@ variable "vpc_security_group_ids" {
   description = "List of security groups in the FilmDrop vpc"
   type = list(string)
 }
+
+variable "warning_sns_topic_arn" {
+  description = "String with FilmDrop Warning SNS topic ARN"
+  type        = string
+}
+
+variable "critical_sns_topic_arn" {
+  description = "String with FilmDrop Critical SNS topic ARN"
+  type        = string
+}
+
+variable "deploy_alarms" {
+  type        = bool
+  default     = true
+  description = "Deploy Cirrus Alarms stack"
+}
+
+variable "custom_cloudwatch_warning_alarms_map" {
+  description = "Map with custom CloudWatch Warning Alarms"
+  type        = map(any)
+  default     = {}
+}
+
+variable "custom_cloudwatch_critical_alarms_map" {
+  description = "Map with custom CloudWatch Critical Alarms"
+  type        = map(any)
+  default     = {}
+}
diff --git a/outputs.tf b/outputs.tf
index 811622df..7f9939be 100644
--- a/outputs.tf
+++ b/outputs.tf
@@ -141,3 +141,11 @@ output "cirrus_data_bucket" {
 output "cirrus_payload_bucket" {
   value = module.filmdrop.cirrus_payload_bucket
 }
+
+output "warning_sns_topic_arn" {
+  value = module.filmdrop.warning_sns_topic_arn
+}
+
+output "critical_sns_topic_arn" {
+  value = module.filmdrop.critical_sns_topic_arn
+}
diff --git a/profiles/base/inputs.tf b/profiles/base/inputs.tf
index 4d36f2de..f10d08bd 100644
--- a/profiles/base/inputs.tf
+++ b/profiles/base/inputs.tf
@@ -46,21 +46,6 @@ variable "security_group_id" {
   default = ""
 }
 
-variable "sns_topics_map" {
-  type = map(any)
-  default = {}
-}
-
-variable "cloudwatch_warning_alarms_map" {
-  type = map(any)
-  default = {}
-}
-
-variable "cloudwatch_critical_alarms_map" {
-  type = map(any)
-  default = {}
-}
-
 variable "sns_warning_subscriptions_map" {
   type = map(any)
   default = {}
@@ -95,12 +80,6 @@ variable "deploy_log_archive" {
   description = "Deploy FilmDrop Log Archive Bucket"
 }
 
-variable "deploy_alarms" {
-  type = bool
-  default = false
-  description = "Deploy FilmDrop Alarms stack"
-}
-
 variable "deploy_vpc_search" {
   type = bool
   default = true
diff --git a/profiles/base/main.tf b/profiles/base/main.tf
index de0bf22d..0afdc0fd 100644
--- a/profiles/base/main.tf
+++ b/profiles/base/main.tf
@@ -22,44 +22,26 @@ module "filmdrop_vpc" {
 }
 
 module "sns_alarm_topics" {
-  count = var.deploy_alarms ? 1 : 0
   source = "../../modules/base_infra/sns"
 
-  sns_topics_map = var.sns_topics_map
-}
-
-module "base_warning_alarms" {
-  count = var.deploy_alarms ? 1 : 0
-  source = "../../modules/base_infra/alerts"
-
-  cloudwatch_alarms_map = var.cloudwatch_warning_alarms_map
-  alarm_actions_list = [module.sns_alarm_topics[0].sns_topic_arns["FilmDropWarning"]]
-  ok_actions_list = [module.sns_alarm_topics[0].sns_topic_arns["FilmDropWarning"]]
-}
-
-module "base_critical_alarms" {
-  count = var.deploy_alarms ? 1 : 0
-  source = "../../modules/base_infra/alerts"
-
-  cloudwatch_alarms_map = var.cloudwatch_critical_alarms_map
-  alarm_actions_list = [module.sns_alarm_topics[0].sns_topic_arns["FilmDropCritical"]]
-  ok_actions_list = [module.sns_alarm_topics[0].sns_topic_arns["FilmDropWarning"]]
+  sns_topics_map = {
+    "fd-${var.project_name}-${var.environment}-AlarmWarning" = {}
+    "fd-${var.project_name}-${var.environment}-AlarmCritical" = {}
+  }
 }
 
 module "sns_warning_subscriptions" {
-  count = var.deploy_alarms ? 1 : 0
   source = "../../modules/base_infra/sns_subscriptions"
 
   sns_topics_subscriptions_map = var.sns_warning_subscriptions_map
-  sns_topic_arn = module.sns_alarm_topics[0].sns_topic_arns["FilmDropWarning"]
+  sns_topic_arn = module.sns_alarm_topics.sns_topic_arns["fd-${var.project_name}-${var.environment}-AlarmWarning"] # sns_alarm_topics no longer sets count, so it is referenced without an index
 }
 
 module "sns_critical_subscriptions" {
-  count = var.deploy_alarms ? 1 : 0
   source = "../../modules/base_infra/sns_subscriptions"
 
   sns_topics_subscriptions_map = var.sns_critical_subscriptions_map
-  sns_topic_arn = module.sns_alarm_topics[0].sns_topic_arns["FilmDropCritical"]
+  sns_topic_arn = module.sns_alarm_topics.sns_topic_arns["fd-${var.project_name}-${var.environment}-AlarmCritical"] # sns_alarm_topics no longer sets count, so it is referenced without an index
 }
 
 module "fd_waf_acl" {
diff --git a/profiles/base/outputs.tf b/profiles/base/outputs.tf
index e89f3126..b3ba968e 100644
--- a/profiles/base/outputs.tf
+++ b/profiles/base/outputs.tf
@@ -47,3 +47,13 @@ output "web_acl_id" {
   description = "The id of the FilmDrop WAF ACL"
   value = var.deploy_waf_rule ? module.fd_waf_acl[0].web_acl_id : var.ext_web_acl_id
 }
+
+output "warning_sns_topic_arn" {
+  description = "The ARN of the FilmDrop Warning SNS Topic"
+  value       = module.sns_alarm_topics.sns_topic_arns["fd-${var.project_name}-${var.environment}-AlarmWarning"]
+}
+
+output "critical_sns_topic_arn" {
+  description = "The ARN of the FilmDrop Critical SNS Topic"
+  value       = module.sns_alarm_topics.sns_topic_arns["fd-${var.project_name}-${var.environment}-AlarmCritical"]
+}
diff --git a/profiles/cirrus/inputs.tf b/profiles/cirrus/inputs.tf
index 92902629..6218ba80 100644
--- a/profiles/cirrus/inputs.tf
+++ b/profiles/cirrus/inputs.tf
@@ -32,6 +32,11 @@ variable "cirrus_inputs" {
     data_bucket = string
     payload_bucket = string
     log_level = string
+    deploy_alarms = bool
+    custom_alarms = object({
+      warning = map(any)
+      critical = map(any)
+    })
     process = object({
       sqs_timeout = number
       sqs_max_receive_count = number
@@ -66,6 +71,11 @@ variable "cirrus_inputs" {
     data_bucket = "cirrus-data-bucket-name"
     payload_bucket = "cirrus-payload-bucket-name"
     log_level = "INFO"
+    deploy_alarms = true
+    custom_alarms = {
+      warning = {}
+      critical = {}
+    }
     process = {
       sqs_timeout = 180
       sqs_max_receive_count = 5
@@ -97,3 +107,13 @@ variable "cirrus_inputs" {
     }
   }
 }
+
+variable "warning_sns_topic_arn" {
+  description = "String with FilmDrop Warning SNS topic ARN"
+  type        = string
+}
+
+variable "critical_sns_topic_arn" {
+  description = "String with FilmDrop Critical SNS topic ARN"
+  type        = string
+}
diff --git a/profiles/cirrus/main.tf b/profiles/cirrus/main.tf
index 5bcb10ed..0b252126 100644
--- a/profiles/cirrus/main.tf
+++ b/profiles/cirrus/main.tf
@@ -23,4 +23,7 @@ module "cirrus" {
   cirrus_pre_batch_lambda_memory = var.cirrus_inputs.pre_batch_lambda.memory
   cirrus_post_batch_lambda_timeout = var.cirrus_inputs.post_batch_lambda.timeout
   cirrus_post_batch_lambda_memory = var.cirrus_inputs.post_batch_lambda.memory
+  warning_sns_topic_arn = var.warning_sns_topic_arn
+  critical_sns_topic_arn = var.critical_sns_topic_arn
+  deploy_alarms = var.cirrus_inputs.deploy_alarms
 }
diff --git a/profiles/console-ui/inputs.tf b/profiles/console-ui/inputs.tf
index 1baa6506..46e774f7 100644
--- a/profiles/console-ui/inputs.tf
+++ b/profiles/console-ui/inputs.tf
@@ -73,8 +73,8 @@ variable "console_ui_inputs" {
       }
     ]
     version = "v5.3.0"
-    filmdrop_ui_config_file = "./profiles/console-ui/default-config/config.dev.json"
-    filmdrop_ui_logo_file = "./profiles/console-ui/default-config/logo.png"
+    filmdrop_ui_config_file = "./default-config/config.dev.json"
+    filmdrop_ui_logo_file = "./default-config/logo.png"
     filmdrop_ui_logo = "bm9uZQo=" # Base64: 'none'
     auth_function = {
       cf_function_name = ""
diff --git a/profiles/core/inputs.tf b/profiles/core/inputs.tf
index 3de9931f..215a05a3 100644
--- a/profiles/core/inputs.tf
+++ b/profiles/core/inputs.tf
@@ -46,21 +46,6 @@ variable "security_group_id" {
   default = ""
 }
 
-variable "sns_topics_map" {
-  type = map(any)
-  default = {}
-}
-
-variable "cloudwatch_warning_alarms_map" {
-  type = map(any)
-  default = {}
-}
-
-variable "cloudwatch_critical_alarms_map" {
-  type = map(any)
-  default = {}
-}
-
 variable "sns_warning_subscriptions_map" {
   type = map(any)
   default = {}
@@ -332,8 +317,8 @@ variable "console_ui_inputs" {
       }
     ]
     version = "v5.3.0"
-    filmdrop_ui_config_file = "./profiles/console-ui/default-config/config.dev.json"
-    filmdrop_ui_logo_file = "./profiles/console-ui/default-config/logo.png"
+    filmdrop_ui_config_file = "../console-ui/default-config/config.dev.json"
+    filmdrop_ui_logo_file = "../console-ui/default-config/logo.png"
     filmdrop_ui_logo = "bm9uZQo=" # Base64: 'none'
     auth_function = {
       cf_function_name = ""
@@ -354,6 +339,11 @@ variable "cirrus_inputs" {
     data_bucket = string
     payload_bucket = string
     log_level = string
+    deploy_alarms = bool
+    custom_alarms = object({
+      warning = map(any)
+      critical = map(any)
+    })
     process = object({
       sqs_timeout = number
       sqs_max_receive_count = number
@@ -388,6 +378,11 @@ variable "cirrus_inputs" {
     data_bucket = "cirrus-data-bucket-name"
     payload_bucket = "cirrus-payload-bucket-name"
     log_level = "INFO"
+    deploy_alarms = true
+    custom_alarms = {
+      warning = {}
+      critical = {}
+    }
     process = {
       sqs_timeout = 180
       sqs_max_receive_count = 5
@@ -495,12 +490,6 @@ variable "deploy_log_archive" {
   description = "Deploy FilmDrop Log Archive Bucket"
 }
 
-variable "deploy_alarms" {
-  type = bool
-  default = false
-  description = "Deploy FilmDrop Alarms stack"
-}
-
 variable "deploy_stac_server" {
   type = bool
   default = true
diff --git a/profiles/core/main.tf b/profiles/core/main.tf
index f354e405..3f586d4e 100644
--- a/profiles/core/main.tf
+++ b/profiles/core/main.tf
@@ -7,7 +7,6 @@ module "base_infra" {
 
   deploy_vpc = var.deploy_vpc
   deploy_vpc_search = var.deploy_vpc_search
-  deploy_alarms = var.deploy_alarms
   deploy_log_archive = var.deploy_log_archive
   deploy_waf_rule = var.deploy_waf_rule
   ext_web_acl_id = var.ext_web_acl_id
@@ -17,12 +16,9 @@ module "base_infra" {
   project_name = var.project_name
   vpc_cidr = var.vpc_cidr
   vpc_id = var.vpc_id
-  sns_topics_map = var.sns_topics_map
   security_group_id = var.security_group_id
   private_subnets_az_to_id_map = var.private_subnets_az_to_id_map
   public_subnets_az_to_id_map = var.public_subnets_az_to_id_map
-  cloudwatch_warning_alarms_map = var.cloudwatch_warning_alarms_map
-  cloudwatch_critical_alarms_map = var.cloudwatch_critical_alarms_map
   sns_warning_subscriptions_map = var.sns_warning_subscriptions_map
   sns_critical_subscriptions_map = var.sns_critical_subscriptions_map
   s3_access_log_bucket = var.s3_access_log_bucket
@@ -128,11 +124,13 @@ module "cirrus" {
   count  = var.deploy_cirrus ? 1 : 0
   source = "../cirrus"
 
-  project_name       = var.project_name
-  environment        = var.environment
-  private_subnet_ids = module.base_infra.private_subnet_ids
-  security_group_id  = module.base_infra.security_group_id
-  cirrus_inputs      = var.cirrus_inputs
+  project_name           = var.project_name
+  environment            = var.environment
+  private_subnet_ids     = module.base_infra.private_subnet_ids
+  security_group_id      = module.base_infra.security_group_id
+  cirrus_inputs          = var.cirrus_inputs
+  warning_sns_topic_arn  = module.base_infra.warning_sns_topic_arn
+  critical_sns_topic_arn = module.base_infra.critical_sns_topic_arn
 }
 
 module "cirrus-dashboard" {
diff --git a/profiles/core/outputs.tf b/profiles/core/outputs.tf
index 8946b975..63390d92 100644
--- a/profiles/core/outputs.tf
+++ b/profiles/core/outputs.tf
@@ -141,3 +141,11 @@ output "cirrus_data_bucket" {
 output "cirrus_payload_bucket" {
   value = var.deploy_cirrus ? module.cirrus[0].cirrus_payload_bucket : ""
 }
+
+output "warning_sns_topic_arn" {
+  value = module.base_infra.warning_sns_topic_arn
+}
+
+output "critical_sns_topic_arn" {
+  value = module.base_infra.critical_sns_topic_arn
+}