Skip to content

Commit

Permalink
Merge pull request #115 from ericvaandering/test_new_pusher
Browse files Browse the repository at this point in the history
Common: create and use new context manager for prometheus metrics - fixes #119
  • Loading branch information
dchristidis authored Jan 11, 2024
2 parents d560481 + 528aaf3 commit d5f9921
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 50 deletions.
44 changes: 44 additions & 0 deletions cms/check_expected_total_number_of_files_per_rse
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python3
# Copyright European Organization for Nuclear Research (CERN) 2013
#
# Licensed under the Apache License, Version 2.0 (the "License");
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Authors:
# - Donata Mielaikaite, <[email protected]>, 2020
# - Eric Vaandering, <[email protected]>, 2022

"""
Probe to check the number of expected files per rse.
"""

import sys
import traceback

from rucio.db.sqla.session import BASE, get_session

from utils import common

PrometheusPusher = common.PrometheusPusher

# Exit statuses
OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

# Prefix placed in front of table names in the SQL below: "<schema>." when
# BASE.metadata carries a schema, otherwise an empty string.
schema = BASE.metadata.schema + '.' if BASE.metadata.schema else ''

if __name__ == "__main__":
    # Sum length*copies per RSE expression for rules that are not in state
    # 'O' and whose lock count differs from the DID length, then publish one
    # gauge value per RSE expression.
    try:
        query = (
            "SELECT rse_expression, sum(length*copies) as sum_length "
            "from {schema}DIDS join {schema}RULES on DIDS.name = RULES.name "
            "where state!='O' and (length !=locks_ok_cnt ) "
            "and length is not null and locks_ok_cnt is not null "
            "group by rse_expression"
        ).format(schema=schema)
        session = get_session()
        with PrometheusPusher() as manager:
            rows = session.execute(query).fetchall()
            for rse_expression, expected_files in rows:
                gauge = manager.gauge(name='judge.expected_number_of_files.{dst_rse}')
                gauge.labels(dst_rse=rse_expression).set(expected_files)
    except:
        # Any failure is reported as UNKNOWN after printing the traceback.
        print(traceback.format_exc())
        sys.exit(UNKNOWN)
    sys.exit(OK)
46 changes: 46 additions & 0 deletions cms/check_expiring_rules_per_rse
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python3
# Copyright European Organization for Nuclear Research (CERN) 2013
#
# Licensed under the Apache License, Version 2.0 (the "License");
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
#
# Authors:
# - Donata Mielaikaite, <[email protected]>, 2020
# - Eric Vaandering, <[email protected]>, 2022

"""
Probe to check the number of expiring rules.
"""
import sys
import traceback

from rucio.db.sqla.session import BASE, get_session

from utils import common

PrometheusPusher = common.PrometheusPusher

# Exit statuses
OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

# Prefix placed in front of table names in the SQL below: "<schema>." when
# BASE.metadata carries a schema, otherwise an empty string.
schema = BASE.metadata.schema + '.' if BASE.metadata.schema else ''

if __name__ == "__main__":
    # Count the rules that have an expiry date, grouped by RSE expression,
    # and publish one gauge value per RSE expression.
    try:
        query = (
            "SELECT rse_expression, count(*) from {schema}RULES "
            "where expires_at is not null group by rse_expression"
        ).format(schema=schema)
        session = get_session()
        with PrometheusPusher() as manager:
            rows = session.execute(query).fetchall()
            for rse_expression, rule_count in rows:
                gauge = manager.gauge(
                    name='judge.expiring_rules_number.{dst_rse}',
                    documentation='The number of expiring rules at an RSE',
                )
                gauge.labels(dst_rse=rse_expression).set(rule_count)
    except:
        # Any failure is reported as UNKNOWN after printing the traceback.
        print(traceback.format_exc())
        sys.exit(UNKNOWN)
    sys.exit(OK)
20 changes: 3 additions & 17 deletions cms/check_report_used_space
Original file line number Diff line number Diff line change
Expand Up @@ -25,33 +25,21 @@ Probe to check used space.
import sys
import traceback

from prometheus_client import CollectorRegistry, Gauge
from rucio.core.rse import list_rses, get_rse_usage, list_rse_attributes
from rucio.db.sqla import models
from rucio.db.sqla.session import get_session

from utils import common

probe_metrics = common.probe_metrics
PrometheusPusher = common.PrometheusPusher

# Exit statuses
OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

if __name__ == '__main__':
registry = CollectorRegistry()
labelnames = ['rse', 'country', 'rse_type', 'source']

try:
session = get_session()
with PrometheusPusher(registry, job_name='check_report_used_space') as prometheus_config:
extra_prom_labels = prometheus_config['labels']
labelnames.extend(extra_prom_labels.keys())

prefix: str = prometheus_config['prefix']
used_space_gauge = Gauge(prefix + 'report_used_space', 'Space used at an RSE from various sources',
labelnames=labelnames, registry=registry)

with PrometheusPusher() as manager:
for rse in list_rses():
sources = get_rse_usage(rse['id'])
attributes = list_rse_attributes(rse['id'])
Expand All @@ -61,10 +49,8 @@ if __name__ == '__main__':
for usage in sources:
source = usage['source']
prom_labels = {'rse': rse['rse'], 'country': country, 'rse_type': rse_type, 'source': source}
prom_labels.update(extra_prom_labels)
used_space_gauge.labels(**prom_labels).set(usage['used'])
(probe_metrics.gauge(name='judge.used_space_rucio.{rse}.{country}.{rse_type}.{source}',
documentation='Space used at an RSE from various sources')
(manager.gauge(name='report_used_space.{rse}.{country}.{rse_type}.{source}',
documentation='Space used at an RSE from various sources')
.labels(rse=rse['rse'], country=country, rse_type=rse_type, source=source)
.set(usage['used']))
print(rse['rse'], country, rse_type, source, usage['used'])
Expand Down
20 changes: 6 additions & 14 deletions common/check_expired_rules
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
# Authors:
# - Vincent Garonne, <[email protected]>, 2013
# - Thomas Beermann, <[email protected]>, 2019
# - Eric Vaandering, <[email protected]>, 2020-2022
# - Eric Vaandering, <[email protected]>, 2020-2023

"""
Probe to check the backlog of expired rules.
Expand All @@ -17,13 +17,9 @@ Probe to check the backlog of expired rules.
import sys
import traceback

from prometheus_client import CollectorRegistry, Gauge
from rucio.db.sqla.session import BASE, get_session

from utils import common

PrometheusPusher = common.PrometheusPusher
probe_metrics = common.probe_metrics
from utils.common import PrometheusPusher

# Exit statuses
OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3
Expand All @@ -35,20 +31,16 @@ else:

if __name__ == "__main__":
try:
registry = CollectorRegistry()
session = get_session()
with PrometheusPusher(registry, job_name='check_expired_rules') as prometheus_config:
prefix: str = prometheus_config['prefix']
with PrometheusPusher() as manager:
expired_rules = 'select count(1) from {schema}rules where expires_at < sys_extract_utc(localtimestamp)'.format(schema=schema)
result = session.execute(expired_rules).fetchone()[0]
probe_metrics.gauge('judge.expired_rules').set(result)
Gauge(prefix + 'judge_expired_rules', '', registry=registry).set(result)
manager.gauge(name='judge.expired_rules').set(result)

lifetimed_rules = 'select count(1) from {schema}rules where expires_at > sys_extract_utc(localtimestamp)'.format(schema=schema)
result = session.execute(lifetimed_rules).fetchone()[0]
print(result)
probe_metrics.gauge('judge.lifetimed_rules').set(result)
Gauge(prefix + 'judge_lifetimed_rules', '', registry=registry).set(result)

manager.gauge(name='judge.lifetimed_rules').set(result)
except:
print(traceback.format_exc())
sys.exit(UNKNOWN)
Expand Down
32 changes: 13 additions & 19 deletions common/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@
# - Eric Vaandering, <[email protected]>, 2022

import json
import sys
from typing import Tuple, Dict, List, Optional

from prometheus_client import push_to_gateway
from rucio.common.config import config_get
from rucio.core.monitor import MetricManager

probe_metrics = MetricManager(prefix='rucio.probes')
PROBES_PREFIX = 'rucio.probes'
probe_metrics = MetricManager(prefix=PROBES_PREFIX)


def get_prometheus_config() -> Tuple[List, str, Dict]:
prom_servers = config_get('monitor', 'prometheus_servers', raise_exception=False, default='')
Expand All @@ -41,25 +41,19 @@ class PrometheusPusher:
A context manager to abstract the business of configuring and pushing to prometheus
"""

def __init__(self, registry: object, job_name: Optional[str] = None):
self.registry = registry
if job_name:
self.job_name = job_name
else:
self.job_name = sys.argv[0]
self.servers, self.prefix, self.labels = get_prometheus_config()
def __init__(self, prefix: "Optional[str]" = PROBES_PREFIX, job_name: "Optional[str]" = None):
self.job_name = job_name
self.servers, _dummy, self.labels = get_prometheus_config()
self.prefix = prefix

self.manager = MetricManager(prefix=self.prefix, push_gateways=self.servers)

def __enter__(self) -> Dict:
def __enter__(self) -> "MetricManager":
"""
Give the caller everything it might need (prefix is all it does need)
Return the Rucio metrics manager
:return:
"""
config = {'servers': self.servers, 'prefix': self.prefix, 'labels': self.labels, 'job': self.job_name}
return config
return self.manager

def __exit__(self, exc_type, exc_value, exc_traceback):
for server in self.servers:
try:
push_to_gateway(server.strip(), job=self.job_name, registry=self.registry, grouping_key=self.labels)
except:
continue
self.manager.push_metrics_to_gw(job=self.job_name, grouping_key=self.labels)

0 comments on commit d5f9921

Please sign in to comment.