Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(backend): Demonstrate metrics #1488

Draft
wants to merge 4 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions antarest/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,36 @@ def from_dict(data: JSON) -> "ServerConfig":
)


@dataclass(frozen=True)
class PrometheusConfig:
    """
    Sub config object dedicated to prometheus metrics.

    Attributes:
        multiprocess: True when the application runs with several worker
            processes (e.g. under gunicorn), in which case the prometheus
            client must aggregate metrics across processes.
    """

    multiprocess: bool = False

    @classmethod
    def from_dict(cls, data: "JSON") -> "PrometheusConfig":
        """
        Builds the config from a dictionary.

        A missing "multiprocess" key falls back to the dataclass default
        (False) instead of raising KeyError.
        """
        return cls(multiprocess=bool(data.get("multiprocess", False)))


@dataclass(frozen=True)
class MetricsConfig:
    """
    Sub config object dedicated to metrics.

    Attributes:
        prometheus: prometheus-specific configuration, or None when
            prometheus metrics are disabled.
    """

    prometheus: Optional[PrometheusConfig] = None

    @classmethod
    def from_dict(cls, data: JSON) -> "MetricsConfig":
        """
        Builds the config from a dictionary. An absent "prometheus"
        section means prometheus metrics are disabled.
        """
        if "prometheus" in data:
            prometheus = PrometheusConfig.from_dict(data["prometheus"])
        else:
            prometheus = None
        return cls(prometheus=prometheus)


@dataclass(frozen=True)
class Config:
"""
Expand All @@ -383,6 +413,7 @@ class Config:
eventbus: EventBusConfig = EventBusConfig()
cache: CacheConfig = CacheConfig()
tasks: TaskConfig = TaskConfig()
metrics: MetricsConfig = MetricsConfig()
root_path: str = ""

@staticmethod
Expand Down Expand Up @@ -421,6 +452,9 @@ def from_dict(data: JSON, res: Optional[Path] = None) -> "Config":
server=ServerConfig.from_dict(data["server"])
if "server" in data
else ServerConfig(),
metrics=MetricsConfig.from_dict(data["metrics"])
if "metrics" in data
else MetricsConfig(),
)

@staticmethod
Expand Down
7 changes: 7 additions & 0 deletions antarest/core/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,10 @@ def __init__(self) -> None:
HTTPStatus.BAD_REQUEST,
"You cannot scan the default internal workspace",
)


class ConfigurationError(RuntimeError):
    """
    Raised when some configuration is invalid.
    """
91 changes: 91 additions & 0 deletions antarest/core/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import logging
import os
import time

import prometheus_client
from fastapi import FastAPI
from prometheus_client import (
CollectorRegistry,
Counter,
Histogram,
make_asgi_app,
)
from prometheus_client import multiprocess
from starlette.requests import Request

from antarest.core.config import Config
from antarest.core.exceptions import ConfigurationError

logger = logging.getLogger(__name__)


# The `PROMETHEUS_MULTIPROC_DIR` environment variable is used by the Python
# prometheus client library to locate the directory where per-process metric
# files are stored when running in a multi-process environment.
_PROMETHEUS_MULTIPROCESS_ENV_VAR = "PROMETHEUS_MULTIPROC_DIR"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a short comment to explain the purpose of this environment variable, for instance:

# The `PROMETHEUS_MULTIPROC_DIR` environment variable is used by
# the Python Prometheus client library to configure process-level metrics
# when running in a multi-process environment.



def _add_metrics_middleware(
    application: FastAPI, registry: CollectorRegistry, worker_id: str
) -> None:
    """
    Registers an HTTP middleware which reports, for every request,
    a request counter and a request duration histogram to prometheus.

    Args:
        application: the FastAPI application to instrument
        registry: the prometheus registry the metrics are attached to
        worker_id: identifier of the current worker process, used as a
            metric label to distinguish workers in multi-process setups
    """

    request_counter = Counter(
        "request_count",
        "App Request Count",
        ["worker_id", "method", "endpoint", "http_status"],
        registry=registry,
    )
    request_duration_histo = Histogram(
        "request_duration_seconds",
        "Request duration",
        ["worker_id", "method", "endpoint", "http_status"],
        registry=registry,
    )

    @application.middleware("http")
    async def add_metrics(request: Request, call_next):
        # perf_counter is monotonic: unlike time.time(), it cannot jump
        # backwards (e.g. on system clock adjustment) while timing a request.
        start_time = time.perf_counter()
        response = await call_next(request)
        process_time = time.perf_counter() - start_time

        # Prefer the route template (e.g. "/studies/{uuid}") over the raw URL
        # path, to avoid creating one metric series per concrete path value.
        if "route" in request.scope:
            request_path = (
                request.scope["root_path"] + request.scope["route"].path
            )
        else:
            request_path = request.url.path

        request_counter.labels(
            worker_id, request.method, request_path, response.status_code
        ).inc()
        request_duration_histo.labels(
            worker_id, request.method, request_path, response.status_code
        ).observe(process_time)
        return response


def add_metrics(application: FastAPI, config: Config) -> None:
    """
    If configured, adds "/metrics" endpoint to report metrics to prometheus.
    Also registers metrics for HTTP requests (count and duration).

    Args:
        application: the FastAPI application to instrument
        config: application config; metrics are only enabled when a
            prometheus section is present in the metrics config

    Raises:
        ConfigurationError: when multiprocess mode is enabled but one of
            the required environment variables is not defined.
    """
    prometheus_config = config.metrics.prometheus
    if not prometheus_config:
        return

    if prometheus_config.multiprocess:
        # In multi-process mode, the prometheus client aggregates the metric
        # files of all workers from the PROMETHEUS_MULTIPROC_DIR directory.
        if _PROMETHEUS_MULTIPROCESS_ENV_VAR not in os.environ:
            raise ConfigurationError(
                f"Environment variable {_PROMETHEUS_MULTIPROCESS_ENV_VAR} must be defined for use of prometheus in a multiprocess environment"
            )
        # Fail with a clear error instead of a bare KeyError when the
        # worker-id variable (set by the gunicorn post_fork hook) is missing.
        if "APP_WORKER_ID" not in os.environ:
            raise ConfigurationError(
                "Environment variable APP_WORKER_ID must be defined for use of prometheus in a multiprocess environment"
            )
        registry = CollectorRegistry()
        multiprocess.MultiProcessCollector(registry)
        worker_id = os.environ["APP_WORKER_ID"]
    else:
        registry = prometheus_client.REGISTRY
        worker_id = "0"

    metrics_app = make_asgi_app(registry=registry)
    application.mount("/metrics", metrics_app)

    _add_metrics_middleware(application, registry, worker_id)
9 changes: 8 additions & 1 deletion antarest/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from antarest.core.config import Config
from antarest.core.core_blueprint import create_utils_routes
from antarest.core.logging.utils import configure_logger, LoggingMiddleware
from antarest.core.metrics import add_metrics
from antarest.core.requests import RATE_LIMIT_CONFIG
from antarest.core.swagger import customize_openapi
from antarest.core.utils.utils import get_local_path
Expand Down Expand Up @@ -135,6 +136,8 @@ def fastapi_app(

application.add_middleware(LoggingMiddleware)

add_metrics(application, config)

if mount_front:
application.mount(
"/static",
Expand Down Expand Up @@ -273,7 +276,7 @@ def handle_all_exception(request: Request, exc: Exception) -> Any:
return application, services


if __name__ == "__main__":
def main():
(
config_file,
display_version,
Expand All @@ -297,3 +300,7 @@ def handle_all_exception(request: Request, exc: Exception) -> Any:
else:
services = SingletonServices(config_file, [module])
services.start()


if __name__ == "__main__":
main()
15 changes: 15 additions & 0 deletions conf/gunicorn.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import multiprocessing
import os
from prometheus_client import multiprocess

bind = "0.0.0.0:5000"

Expand All @@ -28,3 +29,17 @@
errorlog = "-"
accesslog = "-"
preload_app = False


def post_fork(server, worker):
    """
    Gunicorn hook executed in each worker process right after it is forked.

    Exposes a worker identifier to the application through the
    APP_WORKER_ID environment variable, for use as a metrics label.
    `worker.age` is used rather than the PID because it stays in a small,
    mostly stable range across application restarts.
    """
    worker_id = str(worker.age)
    os.environ["APP_WORKER_ID"] = worker_id
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I had a choice, I would prefer GUNICORN_WORKER_ID for that.

Why do you use worker.age instead of worker.id?

worker.age is a number that indicates how many requests the worker process has handled since it was started.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I understood, worker.age is a counter incremented every time gunicorn creates a new worker (for ex. when one dies).
Using the age here allows to have an almost stable set of worker IDs between application runs (after a restart of the app, for example, we still have IDs in the range [1-8] for 8 workers), to have mostly stable metrics labels.
I don't see any worker.id, we have the PID as an alternative, but it's not stable at all between runs.



def child_exit(server, worker):
    """
    Gunicorn hook executed in the master process after a worker exits.

    Notifies the prometheus client that this worker's process is dead so
    that its per-process metric files can be handled accordingly.
    """
    dead_pid = worker.pid
    multiprocess.mark_process_dead(dead_pid)
6 changes: 5 additions & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,13 @@ services:
volumes:
- ./resources/deploy/db:/var/lib/postgresql/data
command: [ "postgres", "-c", "log_statement=all", "-c", "log_destination=stderr" ]
ports:
- 5432:5432
redis:
image: redis:latest
container_name : redis
ports:
- 6379:6379
nginx:
image: nginx:latest
container_name: nginx
Expand All @@ -62,4 +66,4 @@ services:
volumes:
- ./resources/deploy/nginx.conf:/etc/nginx/conf.d/default.conf:ro
- ./webapp/build:/www
- ./resources/deploy/web.config.json:/www/config.json:ro
- ./resources/deploy/web.config.json:/www/config.json:ro
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ checksumdir~=1.2.0
click~=8.0.3
contextvars~=2.4
fastapi-jwt-auth~=0.5.0
fastapi[all]~=0.73.0
fastapi[all]~=0.74.0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: A version upgrade would be welcome to take advantage of the latest developments that simplify the update of the Swagger API documentation. But, we need to analyse the impacts...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok! This was the minimum upgrade to get the "route" information from the request

filelock~=3.4.2
gunicorn~=20.1.0
Jinja2~=3.0.3
Expand All @@ -18,6 +18,7 @@ MarkupSafe~=2.0.1
numpy~=1.22.1
pandas~=1.4.0
plyer~=2.0.0
prometheus-client~=0.16.0
psycopg2-binary==2.9.4
pydantic~=1.9.0
PyQt5~=5.15.6
Expand Down
6 changes: 6 additions & 0 deletions scripts/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ set -e
CURDIR=$(cd `dirname $0` && pwd)
BASEDIR=`dirname $CURDIR`

if [[ -v PROMETHEUS_MULTIPROC_DIR ]]; then
    # Create the directory before cleaning it: the script runs with `set -e`,
    # so removing files from a non-existent directory would abort the startup.
    # `rm -f` tolerates the absence of leftover *.db files from previous runs.
    mkdir -p "${PROMETHEUS_MULTIPROC_DIR}"
    rm -f "${PROMETHEUS_MULTIPROC_DIR}"/*.db
    echo "Concatenating metrics into ${PROMETHEUS_MULTIPROC_DIR}"
fi

if [ -z "$1" ] ; then
sh $CURDIR/pre-start.sh
gunicorn --config $BASEDIR/conf/gunicorn.py --worker-class=uvicorn.workers.UvicornWorker antarest.wsgi:app
Expand Down