From 488534b03e545bb2199a2da9102c9ee773408778 Mon Sep 17 00:00:00 2001
From: Josh Bailey
Date: Sun, 29 Oct 2023 22:44:52 +0000
Subject: [PATCH] Common infrastructure for torchserve containers, wrap
 entrypoint.

---
 .github/workflows/docker-extras.yml |  4 ++--
 .github/workflows/docker-test.yml   |  4 +---
 README.md                           | 12 +++++++++++-
 docker/Dockerfile.cuda-torchserve   | 17 +++++------------
 docker/Dockerfile.torchserve        | 17 +++++------------
 docs/README-airt.md                 | 18 ++++++++----------
 orchestrator.yml                    | 16 ++++++++++++----
 specgram.yml                        |  4 ++--
 tests/test_torchserve.sh            |  8 +++-----
 torchserve-cuda.yml                 | 23 +++++++++++++++++++++++
 torchserve.yml                      | 16 ++++++++++++++++
 torchserve/config.properties        |  2 ++
 torchserve/custom_handler.py        |  2 ++
 torchserve/install-torchserve.sh    | 16 ++++++++++++++++
 torchserve/torchserve-entrypoint.sh |  2 ++
 worker.yml                          |  4 ++--
 16 files changed, 112 insertions(+), 53 deletions(-)
 create mode 100644 torchserve-cuda.yml
 create mode 100644 torchserve.yml
 create mode 100755 torchserve/install-torchserve.sh
 create mode 100755 torchserve/torchserve-entrypoint.sh

diff --git a/.github/workflows/docker-extras.yml b/.github/workflows/docker-extras.yml
index 76a56bea..3a5fee81 100644
--- a/.github/workflows/docker-extras.yml
+++ b/.github/workflows/docker-extras.yml
@@ -34,7 +34,7 @@ jobs:
       - name: Build and push platforms
         uses: docker/build-push-action@v5
         with:
-          context: docker
+          context: .
           file: docker/Dockerfile.torchserve
           platforms: linux/amd64,linux/arm64
           push: true
@@ -68,7 +68,7 @@ jobs:
       - name: Build and push platforms
         uses: docker/build-push-action@v5
         with:
-          context: docker
+          context: .
           file: docker/Dockerfile.cuda-torchserve
           platforms: linux/amd64
           push: true
diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml
index 8e7473eb..7f61bfac 100644
--- a/.github/workflows/docker-test.yml
+++ b/.github/workflows/docker-test.yml
@@ -7,9 +7,7 @@ jobs:
     - uses: actions/checkout@v4
     - name: docker build
       run: |
-        cd docker
-        docker build -f Dockerfile.torchserve . -t iqtlabs/gamutrf-torchserve:latest
-        cd ..
+        docker build -f docker/Dockerfile.torchserve . -t iqtlabs/gamutrf-torchserve:latest
         ./tests/test_torchserve.sh
   test-gamutrf-extra-images:
     runs-on: ubuntu-latest
diff --git a/README.md b/README.md
index 4586a80c..b2a3785f 100644
--- a/README.md
+++ b/README.md
@@ -175,12 +175,22 @@ Run ```echo 0 > /sys/module/usbcore/parameters/usbfs_memory_mb``` as root before
 
 ##### ```[ERROR] [USB] USB open failed: insufficient permissions```
 
-Ettus SDRs download firmware and switch USB identities when first powered up. Restart the affected container to work around this.
+Ettus SDRs download firmware and switch USB identities when first powered up. Restart the affected container to work around this (when run with Docker Compose, the restart happens automatically).
 
 ##### ```[ERROR] [UHD] An unexpected exception was caught in a task loop.The task loop will now exit, things may not work.boost: mutex lock failed in pthread_mutex_lock: Invalid argument```
 
 UHD driver arguments ```num_recv_frames``` or ```recv_frame_size``` may be too high. The defaults are defined as ETTUS_ARGS in [utils.py](gamutrf/utils.py). Try reducing one or both via ```--sdrargs```. For example, ```--sdrargs num_recv_frames=64,recv_frame_size=8200,type=b200```.
 
+##### ```[ERROR] [UHD] EnvironmentError: IOError: usb rx6 transfer status: LIBUSB_TRANSFER_OVERFLOW```
+
+Stop containers, and reset the Ettus as follows:
+
+```
+$ /usr/lib/uhd/utils/b2xx_fx3_utils -D
+$ /usr/lib/uhd/utils/b2xx_fx3_utils -U
+$ /usr/lib/uhd/utils/b2xx_fx3_utils -S
+```
+
 #### Scanner with Ettus SDR shows implausible low power at approx 100MHz intervals
 
 Ettus radios periodically need extra time to produce good data when being retuned rapidly by the scanner. Increasing the value of ```--db_clamp_floor``` will cause the scanner to discard windows after retuning (effectively waiting for the retune command to be executed and produce good data before proceeding).
diff --git a/docker/Dockerfile.cuda-torchserve b/docker/Dockerfile.cuda-torchserve
index 795bb800..3cc0166a 100644
--- a/docker/Dockerfile.cuda-torchserve
+++ b/docker/Dockerfile.cuda-torchserve
@@ -1,19 +1,12 @@
 FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive
 WORKDIR /root
-RUN apt-get update && \
-    apt-get install -y \
-        git \
-        python3-pip
-RUN pip config set global.no-cache-dir false && \
-    git clone https://github.com/pytorch/serve -b v0.9.0 && \
-    cd serve && \
-    python3 ./ts_scripts/install_dependencies.py --cuda cu118 --environment prod && \
-    pip3 install . && \
-    pip3 install -r examples/object_detector/yolo/yolov8/requirements.txt && \
-    cd .. && \
-    rm -rf serve
+COPY torchserve/install-torchserve.sh /torchserve/install-torchserve.sh
+RUN /torchserve/install-torchserve.sh --cuda cu118
 RUN /usr/local/bin/torchserve --help
+COPY torchserve/config.properties /torchserve/config.properties
+COPY torchserve/torchserve-entrypoint.sh /torchserve/torchserve-entrypoint.sh
+ENTRYPOINT ["/torchserve/torchserve-entrypoint.sh"]
 
 # see Dockerfile.torchserve for example, but use
 # docker run --gpus all -ti iqtlabs/gamutrf-cuda-torchserve:latest bash
diff --git a/docker/Dockerfile.torchserve b/docker/Dockerfile.torchserve
index 931a75a2..eb0609a2 100644
--- a/docker/Dockerfile.torchserve
+++ b/docker/Dockerfile.torchserve
@@ -1,16 +1,9 @@
 FROM ubuntu:22.04
 ENV DEBIAN_FRONTEND=noninteractive
 WORKDIR /root
-RUN apt-get update && \
-    apt-get install -y \
-        git \
-        python3-pip
-RUN pip config set global.no-cache-dir false && \
-    git clone https://github.com/pytorch/serve -b v0.8.2 && \
-    cd serve && \
-    python3 ./ts_scripts/install_dependencies.py --environment prod && \
-    pip3 install . && \
-    pip3 install -r examples/object_detector/yolo/yolov8/requirements.txt && \
-    cd .. && \
-    rm -rf serve
+COPY torchserve/install-torchserve.sh /torchserve/install-torchserve.sh
+RUN /torchserve/install-torchserve.sh
 RUN /usr/local/bin/torchserve --help
+COPY torchserve/config.properties /torchserve/config.properties
+COPY torchserve/torchserve-entrypoint.sh /torchserve/torchserve-entrypoint.sh
+ENTRYPOINT ["/torchserve/torchserve-entrypoint.sh"]
diff --git a/docs/README-airt.md b/docs/README-airt.md
index 0085b493..696e9826 100644
--- a/docs/README-airt.md
+++ b/docs/README-airt.md
@@ -132,24 +132,22 @@ On a non-AIRT machine that the AIRT can reach over the network, that has an nvid
 
 See https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
 
-# start torchserve
+# create model archive
 
-From gamutRF's source directory:
+From gamutRF's source directory, and having obtained mini2_snr.pt:
 
 ```
-$ mkdir /tmp/torchserve
-$ cp torchserve/config.properities /tmp/torchserve
-$ docker run --gpus all -p 8081:8081 -p 8080:8080 -v /tmp/torchserve:/torchserve -d iqtlabs/gamutrf-cuda-torchserve torchserve --start --model-store /torchserve --ts-config /torchserve/config.properties --ncs --foreground
+$ pip3 install torch-model-archiver
+$ mkdir /tmp/model_store
+$ torch-model-archiver --force --model-name mini2_snr --version 1.0 --serialized-file /PATH/TO/mini2_snr.pt --handler torchserve/custom_handler.py --export-path /tmp/model_store
 ```
 
-# create and register model
+# start torchserve
 
-From gamutRF's source directory, and having obtained mini2_snr.pt:
+From gamutRF's source directory (mini2_snr is the default model name in torchserve-cuda.yml):
 
 ```
-$ pip3 install torch-model-archiver
-$ torch-model-archiver --force --model-name mini2_snr --version 1.0 --serialized-file /PATH/TO/mini2_snr.pt --handler torchserve/custom_handler.py --export-path /tmp/torchserve
-$ curl -X POST "localhost:8081/models?model_name=mini2_snr&url=mini2_snr.mar&initial_workers=4&batch_size=2"
+$ VOL_PREFIX=/tmp/model_store docker compose -f orchestrator.yml -f torchserve-cuda.yml up -d torchserve
 ```
 
 Now, when starting the scanner, on the AIRT:
diff --git a/orchestrator.yml b/orchestrator.yml
index ea7395ba..79131d9f 100644
--- a/orchestrator.yml
+++ b/orchestrator.yml
@@ -41,13 +41,15 @@ services:
       - /dev/bus/usb:/dev/bus/usb
       - /dev/dri/renderD128:/dev/dri/renderD128
     # Uncomment when using Nvidia GPU (container toolkit etc must be installed)
-    #deploy:
-    #  resources:
+    # deploy:
+    #   resources:
     #     reservations:
     #       devices:
     #         - driver: nvidia
     #           count: 1
     #           capabilities: [gpu]
+    volumes:
+      - '${VOL_PREFIX}:/logs'
     command:
       - gamutrf-scan
       - --logaddr=0.0.0.0
@@ -57,9 +59,14 @@ services:
       - '--freq-end=${FREQ_END}'
       - --samp-rate=8.192e6
       - --nfft=256
-      - --sweep-sec=8
+      - --tune-dwell-ms=100
+      - --tune-step-fft=0
       - --db_clamp_floor=-150
       - --fft_batch_size=256
+      # - --inference_min_db=-50
+      # - --inference_model_name=mini2_snr
+      # - --inference_model_server=torchserve:8080
+      # - --inference_output_dir=/logs/inference
     healthcheck:
       test: [CMD, "/gamutrf/bin/scanhc.sh", "9000"]
       interval: 10s
@@ -68,7 +75,7 @@ services:
   sigfinder:
     restart: always
     image: iqtlabs/gamutrf:latest
-    shm_size: 128m 
+    shm_size: 128m
     privileged: true
     networks:
       - gamutrf
@@ -103,6 +110,7 @@ services:
       - --save_path=/logs
       - --port=9003
       - --detection_type=narrowband
+      - --n_detect=1
       - --width=12
       - --height=6
       - --min_freq=0
diff --git a/specgram.yml b/specgram.yml
index 1468d5d2..2915757c 100644
--- a/specgram.yml
+++ b/specgram.yml
@@ -1,5 +1,5 @@
-# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to /boot/firmware/cmdline.txt,
-# to fall back to cgroup v1.
+# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to
+# /boot/firmware/cmdline.txt, to fall back to cgroup v1.
 version: "3.3"
 networks:
   gamutrf:
diff --git a/tests/test_torchserve.sh b/tests/test_torchserve.sh
index 4aff7fc3..f702de8a 100755
--- a/tests/test_torchserve.sh
+++ b/tests/test_torchserve.sh
@@ -2,7 +2,7 @@
 
 set -e
 TMPDIR=/tmp
-sudo apt-get update && sudo apt-get install -y curl jq wget
+sudo apt-get update && sudo apt-get install -y jq wget
 sudo pip3 install torch-model-archiver
 cp torchserve/custom_handler.py $TMPDIR/
 cd $TMPDIR
@@ -11,9 +11,7 @@ wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt
 torch-model-archiver --force --model-name yolov8n --version 1.0 --serialized-file yolov8n.pt --handler custom_handler.py
 rm -rf model_store && mkdir model_store
 mv yolov8n.mar model_store/
-docker run -v $(pwd)/model_store:/model_store --net host -d iqtlabs/gamutrf-torchserve timeout 60s torchserve --start --model-store /model_store --ncs --foreground
-sleep 5
-curl -X POST "localhost:8081/models?model_name=yolov8n&url=yolov8n.mar&initial_workers=4&batch_size=2"
+docker run -v $(pwd)/model_store:/model_store --net host --entrypoint timeout -d iqtlabs/gamutrf-torchserve 60s /torchserve/torchserve-entrypoint.sh --models yolov8n=yolov8n.mar
 # TODO: use gamutRF test spectogram image
 wget https://github.com/pytorch/serve/raw/master/examples/object_detector/yolo/yolov8/persons.jpg
-curl http://127.0.0.1:8080/predictions/yolov8n -T persons.jpg | jq
+wget -q --retry-connrefused --retry-on-host-error --body-file=persons.jpg --method=PUT -O- --header='Content-Type: image/jpeg' http://127.0.0.1:8080/predictions/yolov8n | jq
diff --git a/torchserve-cuda.yml b/torchserve-cuda.yml
new file mode 100644
index 00000000..8d6e6043
--- /dev/null
+++ b/torchserve-cuda.yml
@@ -0,0 +1,23 @@
+version: "3.3"
+networks:
+  gamutrf:
+services:
+  torchserve:
+    restart: always
+    image: iqtlabs/gamutrf-cuda-torchserve:latest
+    networks:
+      - gamutrf
+    ports:
+      - '8080:8080'
+    volumes:
+      - '${VOL_PREFIX}:/model_store'
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    command:
+      - --models
+      - mini2_snr=mini2_snr.mar
diff --git a/torchserve.yml b/torchserve.yml
new file mode 100644
index 00000000..9092acb5
--- /dev/null
+++ b/torchserve.yml
@@ -0,0 +1,16 @@
+version: "3.3"
+networks:
+  gamutrf:
+services:
+  torchserve:
+    restart: always
+    image: iqtlabs/gamutrf-torchserve:latest
+    networks:
+      - gamutrf
+    ports:
+      - '8080:8080'
+    volumes:
+      - '${VOL_PREFIX}:/model_store'
+    command:
+      - --models
+      - mini2_snr=mini2_snr.mar
diff --git a/torchserve/config.properties b/torchserve/config.properties
index c0dd1d46..8390f10c 100644
--- a/torchserve/config.properties
+++ b/torchserve/config.properties
@@ -1,3 +1,5 @@
 inference_address=http://0.0.0.0:8080
 management_address=http://0.0.0.0:8081
 metrics_address=http://0.0.0.0:8082
+# batch_size=16
+# max_batch_delay=1000
diff --git a/torchserve/custom_handler.py b/torchserve/custom_handler.py
index c2d35f85..ab40dbd7 100644
--- a/torchserve/custom_handler.py
+++ b/torchserve/custom_handler.py
@@ -31,10 +31,12 @@ class Yolov8Handler(ObjectDetector):
     def initialize(self, context):
         if torch.cuda.is_available():
             self.device = torch.device("cuda")
+            print("Yolov8Handler: using cuda")
         elif XLA_AVAILABLE:
             self.device = xm.xla_device()
         else:
             self.device = torch.device("cpu")
+            print("Yolov8Handler: using cpu")
 
         properties = context.system_properties
         self.manifest = context.manifest
diff --git a/torchserve/install-torchserve.sh b/torchserve/install-torchserve.sh
new file mode 100755
index 00000000..99e8d3d9
--- /dev/null
+++ b/torchserve/install-torchserve.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -e
+apt-get update && \
+    apt-get install -y \
+        git \
+        python3-pip
+pip config set global.no-cache-dir false && \
+    git clone https://github.com/pytorch/serve -b v0.9.0 && \
+    cd serve && \
+    python3 ./ts_scripts/install_dependencies.py --environment prod "$@" && \
+    pip3 install . && \
+    pip3 install -r examples/object_detector/yolo/yolov8/requirements.txt && \
+    cd .. && \
+    rm -rf serve
+
diff --git a/torchserve/torchserve-entrypoint.sh b/torchserve/torchserve-entrypoint.sh
new file mode 100755
index 00000000..f14b3adb
--- /dev/null
+++ b/torchserve/torchserve-entrypoint.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec /usr/local/bin/torchserve --start --model-store /model_store --ts-config /torchserve/config.properties --ncs --foreground "$@"
diff --git a/worker.yml b/worker.yml
index 124e3335..d858b959 100644
--- a/worker.yml
+++ b/worker.yml
@@ -1,5 +1,5 @@
-# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to /boot/firmware/cmdline.txt,
-# to fall back to cgroup v1.
+# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to
+# /boot/firmware/cmdline.txt, to fall back to cgroup v1.
 version: "3.3"
 networks:
   gamutrf:
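
A quick end-to-end check of the wrapped entrypoint (a minimal sketch, not part of the patch; it assumes a mini2_snr.mar archive has already been created under /tmp/model_store as described in docs/README-airt.md):

```
$ VOL_PREFIX=/tmp/model_store docker compose -f torchserve.yml up -d torchserve
$ wget -q -O- http://127.0.0.1:8080/ping
```

TorchServe's ping endpoint should report {"status": "Healthy"} once startup completes. No separate registration call against the management API is needed: the compose files pass --models through the entrypoint, so the model is registered at startup.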