From 587064f0d6b9c26952f4743e167af79cd969eaf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Thu, 5 Sep 2024 14:43:14 +0100 Subject: [PATCH 1/3] Remove catalogue DLQ alerts from Slack (#5777) --- .../src/metric_to_slack_alert.py | 45 ++++++------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py b/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py index e323f3d4..4410b9cd 100755 --- a/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py +++ b/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py @@ -98,9 +98,9 @@ def create_context_url(alarm_info): url_template = """https://logging.wellcomecollection.org/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:'{from_date}',to:'{to_date}'))&_a=(columns:!(log),filters:!(('$state':(store:appState),meta:(alias:!n,disabled:!f,index:'{index_pattern_id}',key:ecs_cluster,negate:!f,params:(query:{cluster_name}),type:phrase),query:(match_phrase:(ecs_cluster:{cluster_name})))),index:'{index_pattern_id}',interval:auto,query:(language:kuery,query:'not%20log:%22*HTTP%2F1.1%5C%22%20200*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20206*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20302*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20304*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20307*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20308*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20400*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20401*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20404*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20410*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20414*%22%20and%20not%20log:%22*HTTP%2F1.1%5C%22%20499*%22%20and%20not%20log:%22*GET%20%2Faccount%2Fapi%2Fusers%2Fme%20401*%22%20and%20not%20log:%22*GET%20%2Faccount%2Fapi%2Fauth%2Fme%20401*%22%20and%20not%20log:%22*%3C--%20GET%20%2Faccount%2Fapi%2Fauth%2Fme*%22%20and%20not%20log:%22*%2Fmanagement%2Fhealthcheck*%22%20and%20not%20log:%22*--%3E%20GET%20%2Faccount%2Fapi%2Fusers%2Fme%2Fitem-requests%20304*%22%20and%20not%20log:%22*-x-%20GET%20%2Faccount%2Fapi%2Fusers%2Fme%2Fitem-requests%20304*%22%20and%20not%20log:%22*--%3E%20GET%20%2Faccount%2Fapi%2Fauth%2Flogin%20302*%22%20and%20not%20log:%22-x-%3E%20GET%20%2Faccount%2Fapi%2Fauth%2Flogin%20302*%22%20and%20not%20log:%22*--%3E%20GET%20%2Faccount*%20200*%22%20and%20not%20log:%22*-x-%3E%20GET%20%2Faccount*%20200*%22%20and%20not%20log:%22%3C--%20GET%20%2Faccount%2Fapi%2Fusers%2Fme%2Fitem-requests%22%20and%20not%20log:%22%3C--%20GET%20%2Faccount%2Fapi%2Fauth%2Flogin%22'),sort:!(!('@timestamp',desc)))""" if alarm_info["name"] == "cloudfront_wc.org_error_5xx": - cluster_name = f"experience-frontend-prod" + cluster_name = "experience-frontend-prod" elif alarm_info["name"] == "cloudfront_stage.wc.org_error_5xx": - cluster_name = f"experience-frontend-stage" + cluster_name = "experience-frontend-stage" else: return @@ -133,29 +133,6 @@ def create_context_url(alarm_info): "label": "View logs in Kibana", } - if os.environ.get("CONTEXT_URL_TEMPLATE") == "platform-dlq-alerts" and alarm_info[ - "name" - ].startswith("catalogue-"): - # The alarm name will be something like: - # - # catalogue-2022-03-10_id_minter_input_dlq_not_empty - # - cluster_name = alarm_info["name"].split("_")[0] - service_name = alarm_info["name"].replace("_input_dlq_not_empty", "") - - url_template = """https://logging.wellcomecollection.org/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:'{from_date}',to:'{to_date}'))&_a=(columns:!(log),filters:!(('$state':(store:appState),meta:(alias:!n,disabled:!f,index:'{index_pattern_id}',key:ecs_cluster,negate:!f,params:(query:{cluster_name}),type:phrase),query:(match_phrase:(ecs_cluster:{cluster_name}))),('$state':(store:appState),meta:(alias:!n,disabled:!f,index:'{index_pattern_id}',key:service_name,negate:!f,params:(query:{service_name}),type:phrase),query:(match_phrase:(service_name:{service_name})))),index:'{index_pattern_id}',interval:auto,query:(language:kuery,query:''),sort:!(!('@timestamp',desc)))""" - - return { - "url": url_template.format( - cluster_name=cluster_name, - service_name=service_name, - to_date=kibana_to_date, - from_date=kibana_from_date, - index_pattern_id=index_pattern_id, - ), - "label": "View logs in Kibana", - } - def get_alarm_info(alarm): # This will be a message of the form: @@ -236,13 +213,8 @@ def is_alarm_count_very_big(alarm_count, environ): return superplural_threshold and alarm_count > int(superplural_threshold) -@log_on_error -def main(event, _ctxt=None): +def send_slack_notification(alarm_info): account = os.environ["ACCOUNT_NAME"] - - alarm = json.loads(event["Records"][0]["Sns"]["Message"]) - alarm_info = get_alarm_info(alarm) - webhook_url = get_secret_string(secret_id="monitoring/critical_slack_webhook") (icon_emoji, color) = get_alarm_level(alarm_info, os.environ) @@ -272,3 +244,14 @@ def main(event, _ctxt=None): urllib.request.urlopen(req) except HTTPError as err: raise Exception(f"{err} - {err.read()}") + + +@log_on_error +def main(event, _ctxt=None): + alarm = json.loads(event["Records"][0]["Sns"]["Message"]) + alarm_info = get_alarm_info(alarm) + + if os.environ.get("CONTEXT_URL_TEMPLATE") == "platform-dlq-alerts" and alarm_info["name"].startswith("catalogue-"): + return + + send_slack_notification(alarm_info) From 44d46e9299ba45ab066908c8df2bdff7572c8028 Mon Sep 17 00:00:00 2001 From: Buildkite on behalf of Wellcome Collection Date: Thu, 5 Sep 2024 13:43:31 +0000 Subject: [PATCH 2/3] Apply auto-formatting rules --- .../metric_to_slack_alert/src/metric_to_slack_alert.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py b/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py index 4410b9cd..642034ca 100755 --- a/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py +++ b/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py @@ -251,7 +251,9 @@ def main(event, _ctxt=None): alarm = json.loads(event["Records"][0]["Sns"]["Message"]) alarm_info = get_alarm_info(alarm) - if os.environ.get("CONTEXT_URL_TEMPLATE") == "platform-dlq-alerts" and alarm_info["name"].startswith("catalogue-"): + if os.environ.get("CONTEXT_URL_TEMPLATE") == "platform-dlq-alerts" and alarm_info[ + "name" + ].startswith("catalogue-"): return send_slack_notification(alarm_info) From 8bdb7980479b6467952cc02b7b88ea0f93890f48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0t=C4=9Bp=C3=A1n=20Brychta?= Date: Fri, 6 Sep 2024 14:56:01 +0100 Subject: [PATCH 3/3] Update metric_to_slack_alert.py --- .../src/metric_to_slack_alert.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py b/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py index 642034ca..4e05fd2f 100755 --- a/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py +++ b/monitoring/slack_alerts/metric_to_slack_alert/src/metric_to_slack_alert.py @@ -213,8 +213,13 @@ def is_alarm_count_very_big(alarm_count, environ): return superplural_threshold and alarm_count > int(superplural_threshold) -def send_slack_notification(alarm_info): +@log_on_error +def main(event, _ctxt=None): account = os.environ["ACCOUNT_NAME"] + + alarm = json.loads(event["Records"][0]["Sns"]["Message"]) + alarm_info = get_alarm_info(alarm) + webhook_url = get_secret_string(secret_id="monitoring/critical_slack_webhook") (icon_emoji, color) = get_alarm_level(alarm_info, os.environ) @@ -244,16 +249,3 @@ def send_slack_notification(alarm_info): urllib.request.urlopen(req) except HTTPError as err: raise Exception(f"{err} - {err.read()}") - - -@log_on_error -def main(event, _ctxt=None): - alarm = json.loads(event["Records"][0]["Sns"]["Message"]) - alarm_info = get_alarm_info(alarm) - - if os.environ.get("CONTEXT_URL_TEMPLATE") == "platform-dlq-alerts" and alarm_info[ - "name" - ].startswith("catalogue-"): - return - - send_slack_notification(alarm_info)