From c7bc9e429e50978e40afd181dd27e7d280181410 Mon Sep 17 00:00:00 2001
From: David Straub
Date: Mon, 21 Oct 2024 19:49:37 +0200
Subject: [PATCH] Update chat docs & resource limits

---
 docs/install_setup/chat.md                     |  3 +
 docs/install_setup/cpu-limited.md              | 57 ++++++++++++++-----
 docs/user-guide/chat.md                        |  4 ++
 examples/caprover-one-click-app.yml            |  2 +-
 .../digitalocean-1click/docker-compose.yml     |  2 +-
 .../docker-compose-base/docker-compose.yml     |  2 +-
 .../docker-compose.yml                         |  2 +-
 mkdocs.yml                                     |  2 +-
 8 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/docs/install_setup/chat.md b/docs/install_setup/chat.md
index 18745f2..3dbbb2a 100644
--- a/docs/install_setup/chat.md
+++ b/docs/install_setup/chat.md
@@ -52,6 +52,9 @@ If the model is not present in the local cache, it will be downloaded when Gramp
 
 Please share learnings about different models with the community!
 
+!!! info
+    The sentence transformers library consumes a significant amount of memory, which might cause worker processes to be killed. As a rule of thumb, with semantic search enabled, each Gunicorn worker consumes around 200 MB of memory and each Celery worker around 500 MB even when idle, and up to 1 GB when computing embeddings. See [Limit CPU and memory usage](cpu-limited.md) for settings that limit memory usage. In addition, it is advisable to provision a sufficiently large swap partition to prevent OOM errors due to transient memory usage spikes.
+
 ## Setting up an LLM provider
 
 Communication with the LLM uses an OpenAI compatible API using the `openai-python` library. This allows using a locally deployed LLM via Ollama (see [Ollama OpenAI compatibility](https://ollama.com/blog/openai-compatibility)) or an API like OpenAI or Huggingface TGI. The LLM is configured via the configuration parameters `LLM_MODEL` and `LLM_BASE_URL`.
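+
+For illustration, a minimal sketch of pointing Gramps Web to a local Ollama instance from the Docker Compose file (the `ollama` host name and the model name are placeholders for your own setup; Ollama serves its OpenAI-compatible API under the `/v1` path):
+
+```yaml
+  grampsweb:
+    environment:
+      LLM_MODEL: "llama3"  # placeholder: use a model you have pulled in Ollama
+      LLM_BASE_URL: "http://ollama:11434/v1"  # Ollama's default port with its OpenAI-compatible path
+```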
diff --git a/docs/install_setup/cpu-limited.md b/docs/install_setup/cpu-limited.md
index 29967c1..c8e0c95 100644
--- a/docs/install_setup/cpu-limited.md
+++ b/docs/install_setup/cpu-limited.md
@@ -1,27 +1,58 @@
-# Limit CPU usage
+# Limit CPU and memory usage
 
-In order to avoid high CPU/RAM usage, it is possible to set the number of workers
-using the environment variable `GUNICORN_NUM_WORKERS`.
+In the recommended Docker-based setup, Gramps Web uses [Gunicorn](https://gunicorn.org/) to serve the
+backend and [Celery](https://docs.celeryq.dev) for background tasks. In both cases, several worker
+processes can be run in parallel, which makes the application more responsive from a user perspective.
+However, increasing the number of workers also increases the amount of RAM used (even when the application is idle),
+and allowing requests to be processed in parallel can lead to high CPU usage (in particular when many users
+are using the application simultaneously). Both Gunicorn and Celery allow limiting the number of parallel workers.
 
-Here, we will take a number of workers = 2. Adjust it according to your needs.
-It may be a good idea to check the CPU/Threads available before choosing the value:
+## Get information about your system
 
-> lscpu | grep CPU
+On Linux, you can check the number of cores available on your system with the following command:
 
-The easiest way is to declare the variable in the `docker-compose.yml` file,
-under the "environment". 
+```bash
+lscpu | grep CPU
+```
 
+To see how much memory and swap space you have available, use
+
+```bash
+free -h
 ```
-version: "3.7"
+
+## Limiting the number of Gunicorn workers
+
+The easiest way to set the number of Gunicorn workers when using the default Gramps Web
+docker image is to set the environment variable `GUNICORN_NUM_WORKERS`, e.g. by declaring it
+in the `docker-compose.yml` file
+under the `environment` key:
+
+```yaml
 services:
   grampsweb:
     environment:
       GUNICORN_NUM_WORKERS: 2
 ```
 
-Other ways are possible, for example by storing the variable in a file,
-and calling it in the startup command:
+See [the Gunicorn documentation](https://docs.gunicorn.org/en/stable/design.html#how-many-workers) to decide
+about the ideal number of workers.
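+
+For illustration, the Gunicorn documentation suggests `(2 x number of cores) + 1` as a rough starting
+point; on Linux you can compute this directly (assuming GNU coreutils' `nproc` is available):
+
+```bash
+# rough starting point suggested by the Gunicorn docs: (2 x cores) + 1
+echo $((2 * $(nproc) + 1))
+```
+
+On memory-constrained systems, fewer workers than this heuristic suggests may be preferable.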
+
+## Limiting the number of Celery workers
+
+To set the number of Celery workers, adjust the `--concurrency` option of the `celery` command in the Docker Compose file:
+
+```yaml
+  grampsweb_celery:
+    command: celery -A gramps_webapi.celery worker --loglevel=INFO --concurrency=2
+```
 
-> docker compose --env-file ./env up
+See [the Celery documentation](https://docs.celeryq.dev/en/stable/userguide/workers.html#concurrency) to decide
+about the ideal number of workers.
 
-In this case, the `env` file would contain a single line: GUNICORN_NUM_WORKERS=2
\ No newline at end of file
+!!! info
+    If the `concurrency` flag is omitted (which was the case in the Gramps Web documentation until v2.5.0), it
+    defaults to the number of CPU cores available on the system, which might consume a substantial amount of memory.
\ No newline at end of file
diff --git a/docs/user-guide/chat.md b/docs/user-guide/chat.md
index 0b9d712..fa47acd 100644
--- a/docs/user-guide/chat.md
+++ b/docs/user-guide/chat.md
@@ -1,5 +1,9 @@
 # Using AI chat
 
+!!! info
+    AI chat requires Gramps Web API version 2.5.0 or higher and Gramps Web version 24.10.0 or higher.
+
+
 The chat view in Gramps Web (if available in your installation) gives access to an AI assistant that can answer questions about your family tree.
 
 !!! warning
diff --git a/examples/caprover-one-click-app.yml b/examples/caprover-one-click-app.yml
index 845902e..a6cd474 100644
--- a/examples/caprover-one-click-app.yml
+++ b/examples/caprover-one-click-app.yml
@@ -49,7 +49,7 @@ services:
     notExposeAsWebApp: 'true'
     dockerfileLines:
       - FROM ghcr.io/gramps-project/grampsweb:$$cap_version
-      - CMD exec celery -A gramps_webapi.celery worker --loglevel=INFO
+      - CMD exec celery -A gramps_webapi.celery worker --loglevel=INFO --concurrency=2
 volumes:
   $$cap_appname-users:
   $$cap_appname-index:
diff --git a/examples/digitalocean-1click/docker-compose.yml b/examples/digitalocean-1click/docker-compose.yml
index 77dd068..b6d9894 100644
--- a/examples/digitalocean-1click/docker-compose.yml
+++ b/examples/digitalocean-1click/docker-compose.yml
@@ -36,7 +36,7 @@ services:
       VIRTUAL_HOST: ""
       LETSENCRYPT_HOST: ""
       LETSENCRYPT_EMAIL: ""
-    command: celery -A gramps_webapi.celery worker --loglevel=INFO
+    command: celery -A gramps_webapi.celery worker --loglevel=INFO --concurrency=2
 
   grampsweb_redis:
     image: docker.io/library/redis:7.2.4-alpine
diff --git a/examples/docker-compose-base/docker-compose.yml b/examples/docker-compose-base/docker-compose.yml
index 4e3afcc..53becd3 100644
--- a/examples/docker-compose-base/docker-compose.yml
+++ b/examples/docker-compose-base/docker-compose.yml
@@ -27,7 +27,7 @@ services:
     container_name: grampsweb_celery
     depends_on:
       - grampsweb_redis
-    command: celery -A gramps_webapi.celery worker --loglevel=INFO
+    command: celery -A gramps_webapi.celery worker --loglevel=INFO --concurrency=2
 
   grampsweb_redis:
     image: docker.io/library/redis:7.2.4-alpine
diff --git a/examples/docker-compose-letsencrypt/docker-compose.yml b/examples/docker-compose-letsencrypt/docker-compose.yml
index 62e7962..d3d1ba1 100644
--- a/examples/docker-compose-letsencrypt/docker-compose.yml
+++ b/examples/docker-compose-letsencrypt/docker-compose.yml
@@ -37,7 +37,7 @@ services:
       VIRTUAL_HOST: ""
       LETSENCRYPT_HOST: ""
       LETSENCRYPT_EMAIL: ""
-    command: celery -A gramps_webapi.celery worker --loglevel=INFO
+    command: celery -A gramps_webapi.celery worker --loglevel=INFO --concurrency=2
 
   grampsweb_redis:
     image: docker.io/library/redis:7.2.4-alpine
diff --git a/mkdocs.yml b/mkdocs.yml
index 304ad86..f056b34 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -18,7 +18,7 @@ nav:
       - Update: install_setup/update.md
       - Using PostgreSQL: install_setup/postgres.md
      - Hosting media on S3: install_setup/s3.md
-      - Limit CPU usage: install_setup/cpu-limited.md
+      - Limit CPU & memory usage: install_setup/cpu-limited.md
       - 2.0 upgrade guide: install_setup/v2.md
   - Administration:
       - Introduction: administration/admin.md