Common infrastructure for torchserve containers, wrap entrypoint.

IQTLabs · Oct 30, 2023 · 488534b · 488534b
1 parent 0dfccdd
commit 488534b
Show file tree

Hide file tree

Showing 16 changed files with 110 additions and 53 deletions.
diff --git a/.github/workflows/docker-extras.yml b/.github/workflows/docker-extras.yml
@@ -34,7 +34,7 @@ jobs:
     - name: Build and push platforms
       uses: docker/build-push-action@v5
       with:
-        context: docker
+        context: .
         file: docker/Dockerfile.torchserve
         platforms: linux/amd64,linux/arm64
         push: true
@@ -68,7 +68,7 @@ jobs:
     - name: Build and push platforms
       uses: docker/build-push-action@v5
       with:
-        context: docker
+        context: .
         file: docker/Dockerfile.cuda-torchserve
         platforms: linux/amd64
         push: true

diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml
@@ -7,9 +7,7 @@ jobs:
     - uses: actions/checkout@v4
     - name: docker build
       run: |
-        cd docker
-        docker build -f Dockerfile.torchserve . -t iqtlabs/gamutrf-torchserve:latest
-        cd ..
+        docker build -f docker/Dockerfile.torchserve . -t iqtlabs/gamutrf-torchserve:latest
         ./tests/test_torchserve.sh
   test-gamutrf-extra-images:
     runs-on: ubuntu-latest

diff --git a/README.md b/README.md
@@ -175,12 +175,22 @@ Run ```echo 0 > /sys/module/usbcore/parameters/usbfs_memory_mb``` as root before
 
 ##### ```[ERROR] [USB] USB open failed: insufficient permissions```
 
-Ettus SDRs download firmware and switch USB identities when first powered up. Restart the affected container to work around this.
+Ettus SDRs download firmware and switch USB identities when first powered up. Restart the affected container to work around this (if run with docker compose, restart will happen automatically).
 
 ##### ```[ERROR] [UHD] An unexpected exception was caught in a task loop.The task loop will now exit, things may not work.boost: mutex lock failed in pthread_mutex_lock: Invalid argument```
 
 UHD driver arguments ```num_recv_frames``` or ```recv_frame_size``` may be too high. The defaults are defined as ETTUS_ARGS in [utils.py](gamutrf/utils.py). Try reducing one or both via ```--sdrargs```. For example, ```--sdrargs num_recv_frames=64,recv_frame_size=8200,type=b200```.
 
+#### ```[ERROR] [UHD] EnvironmentError: IOError: usb rx6 transfer status: LIBUSB_TRANSFER_OVERFLOW```
+
+Stop containers, and reset the Ettus as follows:
+
+```
+$ /usr/lib/uhd/utils/b2xx_fx3_utils -D
+$ /usr/lib/uhd/utils/b2xx_fx3_utils -U
+$ /usr/lib/uhd/utils/b2xx_fx3_utils -S
+```
+
 #### Scanner with Ettus SDR shows implausible low power at approx 100MHz intervals
 
 Ettus radios periodically need extra time to produce good data when being retuned rapidly by the scanner. Increasing the value of ```--db_clamp_floor``` will cause the scanner to discard windows after retuning (effectively waiting for the retune command to be executed and produce good data before proceeding).

diff --git a/docker/Dockerfile.cuda-torchserve b/docker/Dockerfile.cuda-torchserve
@@ -1,19 +1,12 @@
 FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive
 WORKDIR /root
-RUN apt-get update && \
-    apt-get install -y \
-      git \
-      python3-pip
-RUN pip config set global.no-cache-dir false && \
-  git clone https://github.com/pytorch/serve -b v0.9.0 && \
-  cd serve && \
-  python3 ./ts_scripts/install_dependencies.py --cuda cu118 --environment prod && \
-  pip3 install . && \
-  pip3 install -r examples/object_detector/yolo/yolov8/requirements.txt && \
-  cd .. && \
-  rm -rf serve
+# Install torchserve with the shared installer script; "--cuda cu118" is forwarded
+# by that script to torchserve's install_dependencies.py.
+COPY torchserve/install-torchserve.sh /torchserve/install-torchserve.sh
+RUN /torchserve/install-torchserve.sh --cuda cu118
 RUN /usr/local/bin/torchserve --help
+# Ship the torchserve configuration and the wrapper script that starts the server.
+COPY torchserve/config.properties /torchserve/config.properties
+COPY torchserve/torchserve-entrypoint.sh /torchserve/torchserve-entrypoint.sh
+# The wrapper is the ENTRYPOINT, so arguments given to the container are handed to it.
+ENTRYPOINT ["/torchserve/torchserve-entrypoint.sh"]
 
 # see Dockerfile.torchserve for example, but use
 # docker run --gpus all -ti iqtlabs/gamutrf-cuda-torchserve:latest bash

diff --git a/docker/Dockerfile.torchserve b/docker/Dockerfile.torchserve
@@ -1,16 +1,9 @@
 FROM ubuntu:22.04
 ENV DEBIAN_FRONTEND=noninteractive
 WORKDIR /root
-RUN apt-get update && \
-    apt-get install -y \
-      git \
-      python3-pip
-RUN pip config set global.no-cache-dir false && \
-  git clone https://github.com/pytorch/serve -b v0.8.2 && \
-  cd serve && \
-  python3 ./ts_scripts/install_dependencies.py --environment prod && \
-  pip3 install . && \
-  pip3 install -r examples/object_detector/yolo/yolov8/requirements.txt && \
-  cd .. && \
-  rm -rf serve
+# Install torchserve with the installer script shared with Dockerfile.cuda-torchserve.
+COPY torchserve/install-torchserve.sh /torchserve/install-torchserve.sh
+RUN /torchserve/install-torchserve.sh
 RUN /usr/local/bin/torchserve --help
+# Ship the torchserve configuration and the wrapper script that starts the server.
+COPY torchserve/config.properties /torchserve/config.properties
+COPY torchserve/torchserve-entrypoint.sh /torchserve/torchserve-entrypoint.sh
+# ENTRYPOINT (not CMD), matching Dockerfile.cuda-torchserve: torchserve.yml supplies
+# "--models ..." as the compose `command`, which must be appended to the wrapper as
+# arguments rather than replace it as the program to execute.
+ENTRYPOINT ["/torchserve/torchserve-entrypoint.sh"]
diff --git a/docs/README-airt.md b/docs/README-airt.md
@@ -132,24 +132,22 @@ On a non-AIRT machine that the AIRT can reach over the network, that has an nvid
 
 See https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
 
-# start torchserve
+# create model archive
 
-From gamutRF's source directory:
+From gamutRF's source directory, and having obtained mini2_snr.pt:
 
 ```
-$ mkdir /tmp/torchserve
-$ cp torchserve/config.properities /tmp/torchserve
-$ docker run --gpus all -p 8081:8081 -p 8080:8080 -v /tmp/torchserve:/torchserve -d iqtlabs/gamutrf-cuda-torchserve torchserve --start --model-store /torchserve --ts-config /torchserve/config.properties --ncs --foreground
+$ pip3 install torch-model-archiver
+$ mkdir /tmp/model_store
+$ torch-model-archiver --force --model-name mini2_snr --version 1.0 --serialized-file /PATH/TO/mini2_snr.pt --handler torchserve/custom_handler.py --export-path /tmp/model_store
 ```
 
-# create and register model
+# start torchserve
 
-From gamutRF's source directory, and having obtained mini2_snr.pt:
+From gamutRF's source directory (mini2_snr is the default model name in torchserve-cuda.yml):
 
 ```
-$ pip3 install torch-model-archiver
-$ torch-model-archiver --force --model-name mini2_snr --version 1.0 --serialized-file /PATH/TO/mini2_snr.pt --handler torchserve/custom_handler.py --export-path /tmp/torchserve
-$ curl -X POST "localhost:8081/models?model_name=mini2_snr&url=mini2_snr.mar&initial_workers=4&batch_size=2"
+$ VOL_PREFIX=/tmp/model_store docker compose -f orchestrator.yml -f torchserve-cuda.yml up -d torchserve
 ```
 
 Now, when starting the scanner, on the AIRT:

diff --git a/orchestrator.yml b/orchestrator.yml
@@ -41,13 +41,15 @@ services:
       - /dev/bus/usb:/dev/bus/usb
       - /dev/dri/renderD128:/dev/dri/renderD128
     # Uncomment when using Nvidia GPU (container toolkit etc must be installed)
-    #deploy:
-    #  resources:
+    # deploy:
+    #   resources:
     #     reservations:
     #        devices:
     #          - driver: nvidia
     #            count: 1
     #            capabilities: [gpu]
+    volumes:
+      - '${VOL_PREFIX}:/logs'
     command:
       - gamutrf-scan
       - --logaddr=0.0.0.0
@@ -57,9 +59,14 @@ services:
       - '--freq-end=${FREQ_END}'
       - --samp-rate=8.192e6
       - --nfft=256
-      - --sweep-sec=8
+      - --tune-dwell-ms=100
+      - --tune-step-fft=0
       - --db_clamp_floor=-150
       - --fft_batch_size=256
+      # - --inference_min_db=-50
+      # - --inference_model_name=mini2_snr
+      # - --inference_model_server=torchserve:8080
+      # - --inference_output_dir=/logs/inference
     healthcheck:
       test: [CMD, "/gamutrf/bin/scanhc.sh", "9000"]
       interval: 10s
@@ -68,7 +75,7 @@ services:
   sigfinder:
     restart: always
     image: iqtlabs/gamutrf:latest
-    shm_size: 128m 
+    shm_size: 128m
     privileged: true
     networks:
       - gamutrf
@@ -103,6 +110,7 @@ services:
       - --save_path=/logs
       - --port=9003
       - --detection_type=narrowband
+      - --n_detect=1
       - --width=12
       - --height=6
       - --min_freq=0

diff --git a/specgram.yml b/specgram.yml
@@ -1,5 +1,5 @@
-# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to /boot/firmware/cmdline.txt,
-# to fall back to cgroup v1.
+# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to
+# /boot/firmware/cmdline.txt, to fall back to cgroup v1.
 version: "3.3"
 networks:
   gamutrf:

diff --git a/tests/test_torchserve.sh b/tests/test_torchserve.sh
@@ -2,7 +2,7 @@
 
 set -e
 TMPDIR=/tmp
-sudo apt-get update && sudo apt-get install -y curl jq wget
+sudo apt-get update && sudo apt-get install -y jq wget
 sudo pip3 install torch-model-archiver
 cp torchserve/custom_handler.py $TMPDIR/
 cd $TMPDIR
@@ -11,9 +11,7 @@ wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt
 torch-model-archiver --force --model-name yolov8n --version 1.0 --serialized-file yolov8n.pt --handler custom_handler.py
 rm -rf model_store && mkdir model_store
 mv yolov8n.mar model_store/
-docker run -v $(pwd)/model_store:/model_store --net host -d iqtlabs/gamutrf-torchserve timeout 60s torchserve --start --model-store /model_store --ncs --foreground
-sleep 5
-curl -X POST "localhost:8081/models?model_name=yolov8n&url=yolov8n.mar&initial_workers=4&batch_size=2"
+docker run -v $(pwd)/model_store:/model_store --net host --entrypoint timeout -d iqtlabs/gamutrf-torchserve 60s /torchserve/torchserve-entrypoint.sh --models yolov8n=yolov8n.mar
 # TODO: use gamutRF test spectrogram image
 wget https://github.com/pytorch/serve/raw/master/examples/object_detector/yolo/yolov8/persons.jpg
-curl http://127.0.0.1:8080/predictions/yolov8n -T persons.jpg | jq
+wget -q --retry-connrefused --retry-on-host-error --body-file=persons.jpg --method=PUT -O- --header='Content-Type: image/jpg' http://127.0.0.1:8080/predictions/yolov8n | jq
diff --git a/torchserve-cuda.yml b/torchserve-cuda.yml
@@ -0,0 +1,22 @@
+# Compose definition for the CUDA torchserve container; combine with
+# orchestrator.yml. VOL_PREFIX is mounted as the model store and must
+# contain the model archive named in `command`.
+version: "3.3"
+networks:
+  gamutrf:
+services:
+  torchserve:
+    restart: always
+    image: iqtlabs/gamutrf-cuda-torchserve:latest
+    networks:
+      - gamutrf
+    ports:
+      - '8080:8080'
+    volumes:
+      - '${VOL_PREFIX}:/model_store'
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    # One argument per list element, so the option and its value reach the
+    # image's entrypoint as separate words without relying on the shell to
+    # re-split a combined string.
+    command:
+      - --models
+      - mini2_snr=mini2_snr.mar
diff --git a/torchserve.yml b/torchserve.yml
@@ -0,0 +1,15 @@
+# Compose definition for the CPU torchserve container; combine with
+# orchestrator.yml. VOL_PREFIX is mounted as the model store and must
+# contain the model archive named in `command`.
+version: "3.3"
+networks:
+  gamutrf:
+services:
+  torchserve:
+    restart: always
+    image: iqtlabs/gamutrf-torchserve:latest
+    networks:
+      - gamutrf
+    ports:
+      - '8080:8080'
+    volumes:
+      - '${VOL_PREFIX}:/model_store'
+    # One argument per list element, so the option and its value are passed
+    # as separate words rather than as a single combined string.
+    command:
+      - --models
+      - mini2_snr=mini2_snr.mar
diff --git a/torchserve/config.properties b/torchserve/config.properties
@@ -1,3 +1,5 @@
 inference_address=http://0.0.0.0:8080
 management_address=http://0.0.0.0:8081
 metrics_address=http://0.0.0.0:8082
+# batch_size=16
+# max_batch_delay=1000
diff --git a/torchserve/custom_handler.py b/torchserve/custom_handler.py
@@ -31,10 +31,12 @@ class Yolov8Handler(ObjectDetector):
     def initialize(self, context):
         if torch.cuda.is_available():
             self.device = torch.device("cuda")
+            print("Yolov8Handler: using cuda")
         elif XLA_AVAILABLE:
             self.device = xm.xla_device()
         else:
             self.device = torch.device("cpu")
+            print("Yolov8Handler: using cpu")
 
         properties = context.system_properties
         self.manifest = context.manifest

diff --git a/torchserve/install-torchserve.sh b/torchserve/install-torchserve.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -e
+apt-get update && \
+  apt-get install -y \
+    git \
+    python3-pip
+pip config set global.no-cache-dir false && \
+  git clone https://github.com/pytorch/serve -b v0.9.0 && \
+  cd serve && \
+  python3 ./ts_scripts/install_dependencies.py --environment prod $* && \
+  pip3 install . && \
+  pip3 install -r examples/object_detector/yolo/yolov8/requirements.txt && \
+  cd .. && \
+  rm -rf serve
+
diff --git a/torchserve/torchserve-entrypoint.sh b/torchserve/torchserve-entrypoint.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec /usr/local/bin/torchserve --start --model-store /model_store --ts-config /torchserve/config.properties --ncs --foreground $*
diff --git a/worker.yml b/worker.yml
@@ -1,5 +1,5 @@
-# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to /boot/firmware/cmdline.txt,
-# to fall back to cgroup v1.
+# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to
+# /boot/firmware/cmdline.txt, to fall back to cgroup v1.
 version: "3.3"
 networks:
   gamutrf:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		#!/bin/sh
		exec /usr/local/bin/torchserve --start --model-store /model_store --ts-config /torchserve/config.properties --ncs --foreground $*