[AWS] 클라우드 와치 ECS 모니터링 웹훅 (파이썬)

고민사항

ECS Fargate를 사용하면서 모니터링을 하고 있는데, CPU와 Memory에 이상 상황이 생겼을 때 슬랙 웹훅으로 편리하게 알림을 받고 싶어요!

 

해결해볼까요!

슬랙봇을 만들지는 않고, Incoming Webhook을 사용해서 구현했음을 알려드립니다.

이러한 코드를 인터넷에서 찾았습니다. 파이썬을 잘 모르지만 조금 개조해서 사용해 볼 거예요!

import boto3
import json
import logging
import os

from base64 import b64decode
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

# Slack Incoming Webhook URL, read from the HOOK_URL environment variable.
# A missing variable raises KeyError as soon as the module is loaded.
HOOK_URL = os.environ['HOOK_URL']

# Root logger with its level set to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def lambda_handler(event, context):
    """Forward a CloudWatch alarm delivered through SNS to Slack.

    Decodes the JSON ``Message`` of the first SNS record in ``event`` and
    posts a one-line summary to the webhook at ``HOOK_URL``.

    Args:
        event: Lambda event. ``event['Records'][0]['Sns']['Message']`` must
            be a JSON string containing ``AlarmName``, ``NewStateValue``
            and ``NewStateReason``.
        context: Lambda context object (unused).

    Raises:
        KeyError, IndexError: If the event or the decoded message lacks an
            expected field.
        json.JSONDecodeError: If the SNS message is not valid JSON.

    Delivery failures (``HTTPError`` / ``URLError``) are logged, not raised.
    """
    logger.info("Event: %s", event)
    message = json.loads(event['Records'][0]['Sns']['Message'])
    logger.info("Message: %s", message)

    alarm_name = message['AlarmName']
    new_state = message['NewStateValue']
    reason = message['NewStateReason']

    slack_message = {
        'text': "%s state is now %s: %s" % (alarm_name, new_state, reason)
    }

    # Slack's webhook documentation asks for a JSON content type.
    req = Request(
        HOOK_URL,
        json.dumps(slack_message).encode('utf-8'),
        headers={'Content-Type': 'application/json'},
    )
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(req) as response:
            response.read()
        logger.info("Message posted")
    except HTTPError as e:
        logger.error("Request failed: %d %s", e.code, e.reason)
    except URLError as e:
        logger.error("Server connection failed: %s", e.reason)

테스트 코드 (Lambda 테스트 이벤트로 사용할 SNS 알림 샘플)

{
  "Records": [
    {
      "EventSource": "aws:sns",
      "EventVersion": "1.0",
      "EventSubscriptionArn": "arn:aws:sns:eu-west-1:000000000000:cloudwatch-alarms:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
      "Sns": {
        "Type": "Notification",
        "MessageId": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
        "TopicArn": "arn:aws:sns:eu-west-1:000000000000:cloudwatch-alarms",
        "Subject": "ALARM: \"Example alarm name\" in EU - Ireland",
        "Message": "{\"AlarmName\":\"Example alarm name\",\"AlarmDescription\":\"Example alarm description.\",\"AWSAccountId\":\"000000000000\",\"NewStateValue\":\"ALARM\",\"NewStateReason\":\"Threshold Crossed: 1 datapoint (10.0) was greater than or equal to the threshold (1.0).\",\"StateChangeTime\":\"2017-01-12T16:30:42.236+0000\",\"Region\":\"EU - Ireland\",\"OldStateValue\":\"OK\",\"Trigger\":{\"MetricName\":\"DeliveryErrors\",\"Namespace\":\"ExampleNamespace\",\"Statistic\":\"SUM\",\"Unit\":null,\"Dimensions\":[],\"Period\":300,\"EvaluationPeriods\":1,\"ComparisonOperator\":\"GreaterThanOrEqualToThreshold\",\"Threshold\":1.0}}",
        "Timestamp": "2017-01-12T16:30:42.318Z",
        "SignatureVersion": "1",
        "Signature": "Cg==",
        "SigningCertUrl": "https://sns.eu-west-1.amazonaws.com/SimpleNotificationService-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.pem",
        "UnsubscribeUrl": "https://sns.eu-west-1.amazonaws.com/?Action=Unsubscribe&SubscriptionArn=arn:aws:sns:eu-west-1:000000000000:cloudwatch-alarms:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
        "MessageAttributes": {}
      }
    }
  ]
}

중간 성과

import boto3
import json
import logging
import os

from base64 import b64decode
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

# Slack Incoming Webhook URL, read from the HOOK_URL environment variable.
# A missing variable raises KeyError as soon as the module is loaded.
HOOK_URL = os.environ['HOOK_URL']

# Root logger with its level set to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def lambda_handler(event, context):
    """Post a multi-line summary of a CloudWatch alarm to Slack.

    Decodes the JSON ``Message`` of the first SNS record in ``event``,
    builds a message with the alarm name, state, resource, cause and
    detailed reason, and posts it to the webhook at ``HOOK_URL``.

    Args:
        event: Lambda event. ``event['Records'][0]['Sns']['Message']`` must
            be a JSON string holding the alarm fields read below.
        context: Lambda context object (unused).

    Raises:
        KeyError, IndexError: If the event or the decoded message lacks an
            expected field.
        json.JSONDecodeError: If the SNS message is not valid JSON.

    Delivery failures (``HTTPError`` / ``URLError``) are logged, not raised.
    """
    logger.info("Event: %s", event)
    message = json.loads(event['Records'][0]['Sns']['Message'])
    logger.info("Message: %s", message)

    trigger = message['Trigger']
    alarm_name = message['AlarmName']
    new_state = message['NewStateValue']
    detail_reason = message['NewStateReason']
    # This interim version prints the whole Dimensions list for both the
    # resource name and the resource id.
    resource_id = trigger['Dimensions']
    resource_name = trigger['Dimensions']
    evaluation_periods = trigger['EvaluationPeriods']
    # Period divided by 60 gives minutes; multiplied by the number of
    # evaluation periods it is the total window shown in the cause line.
    minutes = trigger['Period'] / 60
    window_minutes = int(evaluation_periods * minutes)
    metric_name = trigger['MetricName']
    threshold = trigger['Threshold']

    # Map the comparison operator to its inequality sign. The operator is
    # spelled "LessThanOrEqualToThreshold"; anything unlisted becomes " ".
    comparison_symbols = {
        "GreaterThanThreshold": ">",
        "GreaterThanOrEqualToThreshold": ">=",
        "LessThanOrEqualToThreshold": "<=",
        "LessThanThreshold": "<",
    }
    compar_state = comparison_symbols.get(trigger['ComparisonOperator'], " ")

    # Cause line: "<minutes> 분 동안 <count> 회 <metric> <sign> <threshold>".
    cause = "%s 분 동안 %s 회 %s %s %s" % (
        window_minutes, evaluation_periods, metric_name, compar_state, threshold)

    slack_message = {
        'text': "*[%s]*\n*상태*\n%s\n*자원*\n(%s)%s\n*원인*\n%s\n*상세 내용*\n%s" % (
            alarm_name, new_state, resource_name, resource_id, cause, detail_reason)
    }
    # Slack's webhook documentation asks for a JSON content type.
    req = Request(
        HOOK_URL,
        json.dumps(slack_message).encode('utf-8'),
        headers={'Content-Type': 'application/json'},
    )
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(req) as response:
            response.read()
        logger.info("Message posted")
    except HTTPError as e:
        logger.error("Request failed: %d %s", e.code, e.reason)
    except URLError as e:
        logger.error("Server connection failed: %s", e.reason)

[CloudWatch-CPUUtilization-test-Alarm-over45]
상태
ALARM
자원
([{'value': 'simple-node', 'name': 'ServiceName'}, {'value': 'ecs-atuo', 'name': 'ClusterName'}])[{'value': 'simple-node', 'name': 'ServiceName'}, {'value': 'ecs-atuo', 'name': 'ClusterName'}]
원인
1 분 동안 1 회 CpuUtilized > 40.0
상세 내용
Threshold Crossed: 1 out of the last 1 datapoints [62.412968750000005 (26/01/23 08:45:00)] was greater than the threshold (40.0) (minimum 1 datapoint for OK -> ALARM transition).


마무리 코드

import boto3
import json
import logging
import os

from base64 import b64decode
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from datetime import date, datetime, timezone, timedelta

# Korea Standard Time as a fixed UTC+9 offset.
KST = timezone(timedelta(hours=9))

# The values below are computed once, when the module is loaded.
exp_day = date.today().isoformat()     # system-local date, "YYYY-MM-DD"
time_record = datetime.now(tz=KST)     # timezone-aware timestamp in KST
_day = time_record.date().isoformat()  # KST date, "YYYY-MM-DD"
_time = time_record.time().isoformat(timespec="seconds")  # KST "HH:MM:SS"

# Slack Incoming Webhook URL, read from the HOOK_URL environment variable.
# A missing variable raises KeyError as soon as the module is loaded.
HOOK_URL = os.environ['HOOK_URL']

# Root logger with its level set to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def lambda_handler(event, context):
    """Post a multi-line summary of a CloudWatch alarm to Slack.

    Decodes the JSON ``Message`` of the first SNS record in ``event``,
    builds a message with the alarm name, occurrence time (KST), state,
    first dimension, cause and detailed reason, and posts it to the
    webhook at ``HOOK_URL``.

    Args:
        event: Lambda event. ``event['Records'][0]['Sns']['Message']`` must
            be a JSON string holding the alarm fields read below.
        context: Lambda context object (unused).

    Raises:
        KeyError, IndexError: If the event or the decoded message lacks a
            required field (``Dimensions`` and ``StateChangeTime`` are
            optional).
        json.JSONDecodeError: If the SNS message is not valid JSON.

    Delivery failures (``HTTPError`` / ``URLError``) are logged, not raised.
    """
    logger.info("Event: %s", event)
    message = json.loads(event['Records'][0]['Sns']['Message'])
    logger.info("Message: %s", message)

    trigger = message['Trigger']
    alarm_name = message['AlarmName']
    new_state = message['NewStateValue']
    detail_reason = message['NewStateReason']

    # Only the first dimension is shown. An alarm may carry an empty
    # Dimensions list, so fall back to a placeholder instead of raising.
    dimensions = trigger.get('Dimensions') or []
    if dimensions:
        resource_name = dimensions[0]['name']
        resource_id = dimensions[0]['value']
    else:
        resource_name = resource_id = "N/A"

    evaluation_periods = trigger['EvaluationPeriods']
    # Period divided by 60 gives minutes; multiplied by the number of
    # evaluation periods it is the total window shown in the cause line.
    minutes = trigger['Period'] / 60
    window_minutes = int(evaluation_periods * minutes)
    metric_name = trigger['MetricName']
    threshold = trigger['Threshold']

    # Report when the alarm changed state, converted to KST. If the field is
    # missing or not in the expected format, use the current KST time.
    try:
        kst_time = datetime.strptime(
            message['StateChangeTime'], "%Y-%m-%dT%H:%M:%S.%f%z"
        ).astimezone(KST)
    except (KeyError, TypeError, ValueError):
        kst_time = datetime.now(KST)

    # Map the comparison operator to its inequality sign. The operator is
    # spelled "LessThanOrEqualToThreshold"; anything unlisted becomes " ".
    comparison_symbols = {
        "GreaterThanThreshold": ">",
        "GreaterThanOrEqualToThreshold": ">=",
        "LessThanOrEqualToThreshold": "<=",
        "LessThanThreshold": "<",
    }
    compar_state = comparison_symbols.get(trigger['ComparisonOperator'], " ")

    # Cause line: "<minutes> 분 동안 <count> 회 <metric> <sign> <threshold>".
    cause = "%s 분 동안 %s 회 %s %s %s" % (
        window_minutes, evaluation_periods, metric_name, compar_state, threshold)

    slack_message = {
        'text': "*[%s]*\n*발생시간*\n%s\n*상태*\n%s\n*리소스*\n(%s)%s\n*원인*\n%s\n*상세 지표*\n%s" % (
            alarm_name, kst_time, new_state, resource_name, resource_id, cause, detail_reason)
    }
    # Slack's webhook documentation asks for a JSON content type.
    req = Request(
        HOOK_URL,
        json.dumps(slack_message).encode('utf-8'),
        headers={'Content-Type': 'application/json'},
    )
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(req) as response:
            response.read()
        logger.info("Message posted")
    except HTTPError as e:
        logger.error("Request failed: %d %s", e.code, e.reason)
    except URLError as e:
        logger.error("Server connection failed: %s", e.reason)

웹훅 내용 변경

이런 모양으로 깔끔하게 슬랙에서 보고 싶습니다.
import boto3
import json
import logging
import os

from base64 import b64decode
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from datetime import date, datetime, timezone, timedelta

# Korea Standard Time as a fixed UTC+9 offset.
KST = timezone(timedelta(hours=9))

# The values below are computed once, when the module is loaded.
exp_day = date.today().isoformat()     # system-local date, "YYYY-MM-DD"
time_record = datetime.now(tz=KST)     # timezone-aware timestamp in KST
_day = time_record.date().isoformat()  # KST date, "YYYY-MM-DD"
_time = time_record.time().isoformat(timespec="seconds")  # KST "HH:MM:SS"

# Slack Incoming Webhook URL, read from the HOOK_URL environment variable.
# A missing variable raises KeyError as soon as the module is loaded.
HOOK_URL = os.environ['HOOK_URL']

# Root logger with its level set to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def lambda_handler(event, context):
    """Post a quote-formatted summary of a CloudWatch alarm to Slack.

    Decodes the JSON ``Message`` of the first SNS record in ``event``,
    builds a message with the alarm name, a dashboard link placeholder,
    occurrence time (KST), state, first dimension, cause and detailed
    reason, and posts it to the webhook at ``HOOK_URL``.

    Args:
        event: Lambda event. ``event['Records'][0]['Sns']['Message']`` must
            be a JSON string holding the alarm fields read below.
        context: Lambda context object (unused).

    Raises:
        KeyError, IndexError: If the event or the decoded message lacks a
            required field (``Dimensions`` and ``StateChangeTime`` are
            optional).
        json.JSONDecodeError: If the SNS message is not valid JSON.

    Delivery failures (``HTTPError`` / ``URLError``) are logged, not raised.
    """
    logger.info("Event: %s", event)
    message = json.loads(event['Records'][0]['Sns']['Message'])
    logger.info("Message: %s", message)

    trigger = message['Trigger']
    alarm_name = message['AlarmName']
    new_state = message['NewStateValue']
    detail_reason = message['NewStateReason']

    # Only the first dimension is shown. An alarm may carry an empty
    # Dimensions list, so fall back to a placeholder instead of raising.
    dimensions = trigger.get('Dimensions') or []
    if dimensions:
        resource_name = dimensions[0]['name']
        resource_id = dimensions[0]['value']
    else:
        resource_name = resource_id = "N/A"

    evaluation_periods = trigger['EvaluationPeriods']
    # Period divided by 60 gives minutes; multiplied by the number of
    # evaluation periods it is the total window shown in the cause line.
    minutes = trigger['Period'] / 60
    window_minutes = int(evaluation_periods * minutes)
    metric_name = trigger['MetricName']
    threshold = trigger['Threshold']

    # Report when the alarm changed state, converted to KST. If the field is
    # missing or not in the expected format, use the current KST time.
    try:
        kst_time = datetime.strptime(
            message['StateChangeTime'], "%Y-%m-%dT%H:%M:%S.%f%z"
        ).astimezone(KST)
    except (KeyError, TypeError, ValueError):
        kst_time = datetime.now(KST)

    # Map the comparison operator to its inequality sign. The operator is
    # spelled "LessThanOrEqualToThreshold"; anything unlisted becomes " ".
    comparison_symbols = {
        "GreaterThanThreshold": ">",
        "GreaterThanOrEqualToThreshold": ">=",
        "LessThanOrEqualToThreshold": "<=",
        "LessThanThreshold": "<",
    }
    compar_state = comparison_symbols.get(trigger['ComparisonOperator'], " ")

    # Cause line: "<minutes> 분 동안 <count> 회 <metric> <sign> <threshold>".
    cause = "%s 분 동안 %s 회 %s %s %s" % (
        window_minutes, evaluation_periods, metric_name, compar_state, threshold)

    slack_message = {
        'text': " \n *[%s]*\n> *Link* : <확인할 수 있는 대시보드의 링크> \n> *Time* : %s \n> *State* : %s\n> *Resource* : (%s)%s\n> *Reason* : %s\n> *Metric* : %s" % (
            alarm_name, kst_time, new_state, resource_name, resource_id, cause, detail_reason)
    }
    # Slack's webhook documentation asks for a JSON content type.
    req = Request(
        HOOK_URL,
        json.dumps(slack_message).encode('utf-8'),
        headers={'Content-Type': 'application/json'},
    )
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(req) as response:
            response.read()
        logger.info("Message posted")
    except HTTPError as e:
        logger.error("Request failed: %d %s", e.code, e.reason)
    except URLError as e:
        logger.error("Server connection failed: %s", e.reason)

이렇게 코드를 다듬어보았습니다.

예쁜 모양으로 잘 들어오는군요 ㅎㅎ

이 코드는 재사용성이 좋아서 설정만 바꿔서 CPU뿐만 아니라 Memory 사용량, 그리고 로드밸런서의 Unhealthy 카운트까지 적용해서 잘 사용하고 있습니다!

출처 도움:

https://longtermsad.tistory.com/49

 

AWS Monitoring[CloudWatch] - 1. CloudWatch Alarm을 Slack 연동하기

AWS에서는 Amazon CloudWatch Alarm(경보)를 사용하여 모니터링 할 수 있으며 Alarm(경보)를 설정하고 해당 내용에 대해서 알람을 받을 수 있습니다. https://docs.aws.amazon.com/ko_kr/AmazonCloudWatch/latest/monitoring/Ala

longtermsad.tistory.com

 

JUNE .

20'S LIFE IN SYDNEY and BUSAN

    이미지 맵

    DevOps Study/AWS 다른 글

    이전 글

    다음 글