From 488534b03e545bb2199a2da9102c9ee773408778 Mon Sep 17 00:00:00 2001
From: Josh Bailey
Date: Sun, 29 Oct 2023 22:44:52 +0000
Subject: [PATCH] Common infrastructure for torchserve containers, wrap
 entrypoint.

---
 .github/workflows/docker-extras.yml |  4 ++--
 .github/workflows/docker-test.yml   |  4 +---
 README.md                           | 12 +++++++++++-
 docker/Dockerfile.cuda-torchserve   | 17 +++++------------
 docker/Dockerfile.torchserve        | 17 +++++------------
 docs/README-airt.md                 | 18 ++++++++----------
 orchestrator.yml                    | 16 ++++++++++++----
 specgram.yml                        |  4 ++--
 tests/test_torchserve.sh            |  8 +++-----
 torchserve-cuda.yml                 | 23 +++++++++++++++++++++++
 torchserve.yml                      | 16 ++++++++++++++++
 torchserve/config.properties        |  2 ++
 torchserve/custom_handler.py        |  2 ++
 torchserve/install-torchserve.sh    | 16 ++++++++++++++++
 torchserve/torchserve-entrypoint.sh |  2 ++
 worker.yml                          |  4 ++--
 16 files changed, 112 insertions(+), 53 deletions(-)
 create mode 100644 torchserve-cuda.yml
 create mode 100644 torchserve.yml
 create mode 100755 torchserve/install-torchserve.sh
 create mode 100755 torchserve/torchserve-entrypoint.sh

diff --git a/.github/workflows/docker-extras.yml b/.github/workflows/docker-extras.yml
index 76a56bea..3a5fee81 100644
--- a/.github/workflows/docker-extras.yml
+++ b/.github/workflows/docker-extras.yml
@@ -34,7 +34,7 @@ jobs:
       - name: Build and push platforms
         uses: docker/build-push-action@v5
         with:
-          context: docker
+          context: .
           file: docker/Dockerfile.torchserve
           platforms: linux/amd64,linux/arm64
           push: true
@@ -68,7 +68,7 @@ jobs:
       - name: Build and push platforms
         uses: docker/build-push-action@v5
         with:
-          context: docker
+          context: .
           file: docker/Dockerfile.cuda-torchserve
           platforms: linux/amd64
           push: true
diff --git a/.github/workflows/docker-test.yml b/.github/workflows/docker-test.yml
index 8e7473eb..7f61bfac 100644
--- a/.github/workflows/docker-test.yml
+++ b/.github/workflows/docker-test.yml
@@ -7,9 +7,7 @@ jobs:
     - uses: actions/checkout@v4
     - name: docker build
       run: |
-        cd docker
-        docker build -f Dockerfile.torchserve . -t iqtlabs/gamutrf-torchserve:latest
-        cd ..
+        docker build -f docker/Dockerfile.torchserve . -t iqtlabs/gamutrf-torchserve:latest
         ./tests/test_torchserve.sh
   test-gamutrf-extra-images:
     runs-on: ubuntu-latest
diff --git a/README.md b/README.md
index 4586a80c..b2a3785f 100644
--- a/README.md
+++ b/README.md
@@ -175,12 +175,22 @@ Run ```echo 0 > /sys/module/usbcore/parameters/usbfs_memory_mb``` as root before
 
 ##### ```[ERROR] [USB] USB open failed: insufficient permissions```
 
-Ettus SDRs download firmware and switch USB identities when first powered up. Restart the affected container to work around this.
+Ettus SDRs download firmware and switch USB identities when first powered up. Restart the affected container to work around this (when run with Docker Compose, the restart happens automatically).
 
 ##### ```[ERROR] [UHD] An unexpected exception was caught in a task loop.The task loop will now exit, things may not work.boost: mutex lock failed in pthread_mutex_lock: Invalid argument```
 
 UHD driver arguments ```num_recv_frames``` or ```recv_frame_size``` may be too high. The defaults are defined as ETTUS_ARGS in [utils.py](gamutrf/utils.py). Try reducing one or both via ```--sdrargs```. For example, ```--sdrargs num_recv_frames=64,recv_frame_size=8200,type=b200```.
 
+##### ```[ERROR] [UHD] EnvironmentError: IOError: usb rx6 transfer status: LIBUSB_TRANSFER_OVERFLOW```
+
+Stop containers, and reset the Ettus as follows:
+
+```
+$ /usr/lib/uhd/utils/b2xx_fx3_utils -D
+$ /usr/lib/uhd/utils/b2xx_fx3_utils -U
+$ /usr/lib/uhd/utils/b2xx_fx3_utils -S
+```
+
 #### Scanner with Ettus SDR shows implausible low power at approx 100MHz intervals
 
 Ettus radios periodically need extra time to produce good data when being retuned rapidly by the scanner. Increasing the value of ```--db_clamp_floor``` will cause the scanner to discard windows after retuning (effectively waiting for the retune command to be executed and produce good data before proceeding).
diff --git a/docker/Dockerfile.cuda-torchserve b/docker/Dockerfile.cuda-torchserve
index 795bb800..3cc0166a 100644
--- a/docker/Dockerfile.cuda-torchserve
+++ b/docker/Dockerfile.cuda-torchserve
@@ -1,19 +1,12 @@
 FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive
 WORKDIR /root
-RUN apt-get update && \
-    apt-get install -y \
-        git \
-        python3-pip
-RUN pip config set global.no-cache-dir false && \
-    git clone https://github.com/pytorch/serve -b v0.9.0 && \
-    cd serve && \
-    python3 ./ts_scripts/install_dependencies.py --cuda cu118 --environment prod && \
-    pip3 install . && \
-    pip3 install -r examples/object_detector/yolo/yolov8/requirements.txt && \
-    cd .. && \
-    rm -rf serve
+COPY torchserve/install-torchserve.sh /torchserve/install-torchserve.sh
+RUN /torchserve/install-torchserve.sh --cuda cu118
 RUN /usr/local/bin/torchserve --help
+COPY torchserve/config.properties /torchserve/config.properties
+COPY torchserve/torchserve-entrypoint.sh /torchserve/torchserve-entrypoint.sh
+ENTRYPOINT ["/torchserve/torchserve-entrypoint.sh"]
 
 # see Dockerfile.torchserve for example, but use
 # docker run --gpus all -ti iqtlabs/gamutrf-cuda-torchserve:latest bash
diff --git a/docker/Dockerfile.torchserve b/docker/Dockerfile.torchserve
index 931a75a2..eb0609a2 100644
--- a/docker/Dockerfile.torchserve
+++ b/docker/Dockerfile.torchserve
@@ -1,16 +1,9 @@
 FROM ubuntu:22.04
 ENV DEBIAN_FRONTEND=noninteractive
 WORKDIR /root
-RUN apt-get update && \
-    apt-get install -y \
-        git \
-        python3-pip
-RUN pip config set global.no-cache-dir false && \
-    git clone https://github.com/pytorch/serve -b v0.8.2 && \
-    cd serve && \
-    python3 ./ts_scripts/install_dependencies.py --environment prod && \
-    pip3 install . && \
-    pip3 install -r examples/object_detector/yolo/yolov8/requirements.txt && \
-    cd .. && \
-    rm -rf serve
+COPY torchserve/install-torchserve.sh /torchserve/install-torchserve.sh
+RUN /torchserve/install-torchserve.sh
 RUN /usr/local/bin/torchserve --help
+COPY torchserve/config.properties /torchserve/config.properties
+COPY torchserve/torchserve-entrypoint.sh /torchserve/torchserve-entrypoint.sh
+ENTRYPOINT ["/torchserve/torchserve-entrypoint.sh"]
diff --git a/docs/README-airt.md b/docs/README-airt.md
index 0085b493..696e9826 100644
--- a/docs/README-airt.md
+++ b/docs/README-airt.md
@@ -132,24 +132,22 @@ On a non-AIRT machine that the AIRT can reach over the network, that has an nvid
 
 See https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html
 
-# start torchserve
+# create model archive
 
-From gamutRF's source directory:
+From gamutRF's source directory, and having obtained mini2_snr.pt:
 
 ```
-$ mkdir /tmp/torchserve
-$ cp torchserve/config.properities /tmp/torchserve
-$ docker run --gpus all -p 8081:8081 -p 8080:8080 -v /tmp/torchserve:/torchserve -d iqtlabs/gamutrf-cuda-torchserve torchserve --start --model-store /torchserve --ts-config /torchserve/config.properties --ncs --foreground
+$ pip3 install torch-model-archiver
+$ mkdir /tmp/model_store
+$ torch-model-archiver --force --model-name mini2_snr --version 1.0 --serialized-file /PATH/TO/mini2_snr.pt --handler torchserve/custom_handler.py --export-path /tmp/model_store
 ```
 
-# create and register model
+# start torchserve
 
-From gamutRF's source directory, and having obtained mini2_snr.pt:
+From gamutRF's source directory (mini2_snr is the default model name in torchserve-cuda.yml):
 
 ```
-$ pip3 install torch-model-archiver
-$ torch-model-archiver --force --model-name mini2_snr --version 1.0 --serialized-file /PATH/TO/mini2_snr.pt --handler torchserve/custom_handler.py --export-path /tmp/torchserve
-$ curl -X POST "localhost:8081/models?model_name=mini2_snr&url=mini2_snr.mar&initial_workers=4&batch_size=2"
+$ VOL_PREFIX=/tmp/model_store docker compose -f orchestrator.yml -f torchserve-cuda.yml up -d torchserve
 ```
 
 Now, when starting the scanner, on the AIRT:
diff --git a/orchestrator.yml b/orchestrator.yml
index ea7395ba..79131d9f 100644
--- a/orchestrator.yml
+++ b/orchestrator.yml
@@ -41,13 +41,15 @@ services:
       - /dev/bus/usb:/dev/bus/usb
       - /dev/dri/renderD128:/dev/dri/renderD128
     # Uncomment when using Nvidia GPU (container toolkit etc must be installed)
-    #deploy:
-    #  resources:
+    # deploy:
+    #   resources:
     #     reservations:
     #       devices:
     #         - driver: nvidia
     #           count: 1
     #           capabilities: [gpu]
+    volumes:
+      - '${VOL_PREFIX}:/logs'
     command:
       - gamutrf-scan
       - --logaddr=0.0.0.0
@@ -57,9 +59,14 @@ services:
       - '--freq-end=${FREQ_END}'
       - --samp-rate=8.192e6
       - --nfft=256
-      - --sweep-sec=8
+      - --tune-dwell-ms=100
+      - --tune-step-fft=0
       - --db_clamp_floor=-150
       - --fft_batch_size=256
+      # - --inference_min_db=-50
+      # - --inference_model_name=mini2_snr
+      # - --inference_model_server=torchserve:8080
+      # - --inference_output_dir=/logs/inference
     healthcheck:
       test: [CMD, "/gamutrf/bin/scanhc.sh", "9000"]
       interval: 10s
@@ -68,7 +75,7 @@ services:
   sigfinder:
     restart: always
     image: iqtlabs/gamutrf:latest
-    shm_size: 128m 
+    shm_size: 128m
     privileged: true
     networks:
       - gamutrf
@@ -103,6 +110,7 @@ services:
       - --save_path=/logs
       - --port=9003
       - --detection_type=narrowband
+      - --n_detect=1
       - --width=12
       - --height=6
       - --min_freq=0
diff --git a/specgram.yml b/specgram.yml
index 1468d5d2..2915757c 100644
--- a/specgram.yml
+++ b/specgram.yml
@@ -1,5 +1,5 @@
-# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to /boot/firmware/cmdline.txt,
-# to fall back to cgroup v1.
+# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to
+# /boot/firmware/cmdline.txt, to fall back to cgroup v1.
 version: "3.3"
 networks:
   gamutrf:
diff --git a/tests/test_torchserve.sh b/tests/test_torchserve.sh
index 4aff7fc3..f702de8a 100755
--- a/tests/test_torchserve.sh
+++ b/tests/test_torchserve.sh
@@ -2,7 +2,7 @@
 
 set -e
 TMPDIR=/tmp
-sudo apt-get update && sudo apt-get install -y curl jq wget
+sudo apt-get update && sudo apt-get install -y jq wget
 sudo pip3 install torch-model-archiver
 cp torchserve/custom_handler.py $TMPDIR/
 cd $TMPDIR
@@ -11,9 +11,7 @@ wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt
 torch-model-archiver --force --model-name yolov8n --version 1.0 --serialized-file yolov8n.pt --handler custom_handler.py
 rm -rf model_store && mkdir model_store
 mv yolov8n.mar model_store/
-docker run -v $(pwd)/model_store:/model_store --net host -d iqtlabs/gamutrf-torchserve timeout 60s torchserve --start --model-store /model_store --ncs --foreground
-sleep 5
-curl -X POST "localhost:8081/models?model_name=yolov8n&url=yolov8n.mar&initial_workers=4&batch_size=2"
+docker run -v $(pwd)/model_store:/model_store --net host --entrypoint timeout -d iqtlabs/gamutrf-torchserve 60s /torchserve/torchserve-entrypoint.sh --models yolov8n=yolov8n.mar
 # TODO: use gamutRF test spectogram image
 wget https://github.com/pytorch/serve/raw/master/examples/object_detector/yolo/yolov8/persons.jpg
-curl http://127.0.0.1:8080/predictions/yolov8n -T persons.jpg | jq
+wget -q --retry-connrefused --retry-on-host-error --body-file=persons.jpg --method=PUT -O- --header='Content-Type: image/jpeg' http://127.0.0.1:8080/predictions/yolov8n | jq
diff --git a/torchserve-cuda.yml b/torchserve-cuda.yml
new file mode 100644
index 00000000..8d6e6043
--- /dev/null
+++ b/torchserve-cuda.yml
@@ -0,0 +1,23 @@
+version: "3.3"
+networks:
+  gamutrf:
+services:
+  torchserve:
+    restart: always
+    image: iqtlabs/gamutrf-cuda-torchserve:latest
+    networks:
+      - gamutrf
+    ports:
+      - '8080:8080'
+    volumes:
+      - '${VOL_PREFIX}:/model_store'
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+    command:
+      - --models
+      - mini2_snr=mini2_snr.mar
diff --git a/torchserve.yml b/torchserve.yml
new file mode 100644
index 00000000..9092acb5
--- /dev/null
+++ b/torchserve.yml
@@ -0,0 +1,16 @@
+version: "3.3"
+networks:
+  gamutrf:
+services:
+  torchserve:
+    restart: always
+    image: iqtlabs/gamutrf-torchserve:latest
+    networks:
+      - gamutrf
+    ports:
+      - '8080:8080'
+    volumes:
+      - '${VOL_PREFIX}:/model_store'
+    command:
+      - --models
+      - mini2_snr=mini2_snr.mar
diff --git a/torchserve/config.properties b/torchserve/config.properties
index c0dd1d46..8390f10c 100644
--- a/torchserve/config.properties
+++ b/torchserve/config.properties
@@ -1,3 +1,5 @@
 inference_address=http://0.0.0.0:8080
 management_address=http://0.0.0.0:8081
 metrics_address=http://0.0.0.0:8082
+# batch_size=16
+# max_batch_delay=1000
diff --git a/torchserve/custom_handler.py b/torchserve/custom_handler.py
index c2d35f85..ab40dbd7 100644
--- a/torchserve/custom_handler.py
+++ b/torchserve/custom_handler.py
@@ -31,10 +31,12 @@ class Yolov8Handler(ObjectDetector):
     def initialize(self, context):
         if torch.cuda.is_available():
             self.device = torch.device("cuda")
+            print("Yolov8Handler: using cuda")
         elif XLA_AVAILABLE:
             self.device = xm.xla_device()
         else:
             self.device = torch.device("cpu")
+            print("Yolov8Handler: using cpu")
 
         properties = context.system_properties
         self.manifest = context.manifest
diff --git a/torchserve/install-torchserve.sh b/torchserve/install-torchserve.sh
new file mode 100755
index 00000000..99e8d3d9
--- /dev/null
+++ b/torchserve/install-torchserve.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+set -e
+apt-get update && \
+    apt-get install -y \
+        git \
+        python3-pip
+pip config set global.no-cache-dir false && \
+    git clone https://github.com/pytorch/serve -b v0.9.0 && \
+    cd serve && \
+    python3 ./ts_scripts/install_dependencies.py --environment prod "$@" && \
+    pip3 install . && \
+    pip3 install -r examples/object_detector/yolo/yolov8/requirements.txt && \
+    cd .. && \
+    rm -rf serve
+
diff --git a/torchserve/torchserve-entrypoint.sh b/torchserve/torchserve-entrypoint.sh
new file mode 100755
index 00000000..f14b3adb
--- /dev/null
+++ b/torchserve/torchserve-entrypoint.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+exec /usr/local/bin/torchserve --start --model-store /model_store --ts-config /torchserve/config.properties --ncs --foreground "$@"
diff --git a/worker.yml b/worker.yml
index 124e3335..d858b959 100644
--- a/worker.yml
+++ b/worker.yml
@@ -1,5 +1,5 @@
-# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to /boot/firmware/cmdline.txt,
-# to fall back to cgroup v1.
+# On Pi4/Ubuntu, also requires systemd.unified_cgroup_hierarchy=0 added to
+# /boot/firmware/cmdline.txt, to fall back to cgroup v1.
 version: "3.3"
 networks:
   gamutrf:
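
A quick end-to-end check of the wrapped entrypoint (a minimal sketch, not part of the patch; it assumes a mini2_snr.mar archive has already been created under /tmp/model_store as described in docs/README-airt.md):

```
$ VOL_PREFIX=/tmp/model_store docker compose -f torchserve.yml up -d torchserve
$ wget -q -O- http://127.0.0.1:8080/ping
```

TorchServe's ping endpoint should report {"status": "Healthy"} once startup completes. No separate registration call against the management API is needed: the compose files pass --models through the entrypoint, so the model is registered at startup.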