diff --git a/dockerfiles/worker.Dockerfile b/dockerfiles/worker.Dockerfile
index aaae7e87..35976677 100644
--- a/dockerfiles/worker.Dockerfile
+++ b/dockerfiles/worker.Dockerfile
@@ -17,48 +17,46 @@
 # under the License.
 #
 
-FROM nvidia/cuda:9.0-base-ubuntu16.04
+FROM ubuntu:16.04
 
-RUN apt-get update && apt-get -y upgrade
-
-# `tensorflow-gpu` dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    build-essential \
-    cuda-command-line-tools-9-0 \
-    cuda-cublas-9-0 \
-    cuda-cufft-9-0 \
-    cuda-curand-9-0 \
-    cuda-cusolver-9-0 \
-    cuda-cusparse-9-0 \
-    libcudnn7=7.2.1.38-1+cuda9.0 \
-    libnccl2=2.2.13-1+cuda9.0 \
-    libfreetype6-dev \
-    libhdf5-serial-dev \
-    libpng12-dev \
-    libzmq3-dev \
-    pkg-config \
-    software-properties-common \
-    unzip \
-    && \
+RUN apt-get update && apt-get -y upgrade && \
+    apt-get install -y vim && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN apt-get update && \
-    apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
-    apt-get update && \
-    apt-get install libnvinfer4=4.1.2-1+cuda9.0
+
+# update and install dependencies
+RUN apt-get update && \
+    apt-get install -y \
+    software-properties-common \
+    wget \
+    && add-apt-repository -y ppa:ubuntu-toolchain-r/test \
+    && apt-get update \
+    && apt-get install -y \
+    make \
+    git \
+    curl \
+    vim \
+    vim-gnome \
+    && apt-get install -y cmake=3.5.1-1ubuntu3 \
+    && apt-get install -y \
+    gcc-4.9 g++-4.9 gcc-4.9-base \
+    gcc-4.8 g++-4.8 gcc-4.8-base \
+    gcc-4.7 g++-4.7 gcc-4.7-base \
+    gcc-4.6 g++-4.6 gcc-4.6-base \
+    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-4.9 100 \
+    && update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-4.9 100
 
 # Install conda with pip and python 3.6
 ARG CONDA_ENVIORNMENT
-RUN apt-get -y install curl bzip2 \
-    && curl -sSL https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh \
-    && bash /tmp/miniconda.sh -bfp /usr/local \
-    && rm -rf /tmp/miniconda.sh \
-    && conda create -y --name $CONDA_ENVIORNMENT python=3.6 \
-    && conda clean --all --yes
+RUN apt-get update --fix-missing && apt-get -y upgrade && \
+    apt-get install -y curl bzip2 && \
+    curl -sSL https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh && \
+    bash /tmp/miniconda.sh -bfp /usr/local && \
+    rm -rf /tmp/miniconda.sh && \
+    conda create -y --name $CONDA_ENVIORNMENT python=3.6 && \
+    conda clean --all --yes
 ENV PATH /usr/local/envs/$CONDA_ENVIORNMENT/bin:$PATH
-
 RUN pip install --upgrade pip
 ENV PYTHONUNBUFFERED 1
 
@@ -68,22 +66,24 @@ WORKDIR $DOCKER_WORKDIR_PATH
 ENV PYTHONPATH $DOCKER_WORKDIR_PATH
 
 # Install python dependencies
-RUN mkdir ~/.pip
-#COPY ./pip.conf /root/.pip/pip.conf
-COPY singa_auto/requirements.txt singa_auto/requirements.txt
+COPY singa_auto/ singa_auto/
+
+RUN mkdir -p /root/.config/pip/
+COPY ./.config/pip/pip.conf /root/.config/pip/pip.conf
+
+COPY ./backup_lib/torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl /root/torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl
+RUN pip install /root/torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl
+COPY ./backup_lib/opencv_python-4.4.0.46-cp36-cp36m-manylinux2014_x86_64.whl /root/opencv_python-4.4.0.46-cp36-cp36m-manylinux2014_x86_64.whl
+RUN pip install /root/opencv_python-4.4.0.46-cp36-cp36m-manylinux2014_x86_64.whl
+
 RUN pip install -r singa_auto/requirements.txt
-COPY singa_auto/utils/requirements.txt singa_auto/utils/requirements.txt
 RUN pip install -r singa_auto/utils/requirements.txt
-COPY singa_auto/meta_store/requirements.txt singa_auto/meta_store/requirements.txt
 RUN pip install -r singa_auto/meta_store/requirements.txt
-COPY singa_auto/redis/requirements.txt singa_auto/redis/requirements.txt
 RUN pip install -r singa_auto/redis/requirements.txt
-COPY singa_auto/kafka/requirements.txt singa_auto/kafka/requirements.txt
 RUN pip install -r singa_auto/kafka/requirements.txt
-COPY singa_auto/advisor/requirements.txt singa_auto/advisor/requirements.txt
 RUN pip install -r singa_auto/advisor/requirements.txt
+RUN pip install -r singa_auto/worker/requirements.txt
 
-COPY singa_auto/ singa_auto/
 COPY scripts/ scripts/
 RUN mkdir data/
diff --git a/dockerfiles/worker_cu100.Dockerfile b/dockerfiles/worker_cu100.Dockerfile
new file mode 100644
index 00000000..08d65947
--- /dev/null
+++ b/dockerfiles/worker_cu100.Dockerfile
@@ -0,0 +1,110 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+FROM nvidia/cuda:10.0-base-ubuntu16.04
+
+RUN apt-get update && apt-get -y upgrade && \
+    apt-get install -y vim && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# `tensorflow-gpu` dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends --allow-unauthenticated\
+    build-essential \
+    cuda-command-line-tools-10-0 \
+    cuda-cublas-dev-10-0 \
+    cuda-cudart-dev-10-0 \
+    cuda-cufft-dev-10-0 \
+    cuda-curand-dev-10-0 \
+    cuda-cusolver-dev-10-0 \
+    cuda-cusparse-dev-10-0 \
+    libcudnn7=7.5.1.10-1+cuda10.0 \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libnccl-dev=2.4.7-1+cuda10.0 \
+    libnccl2=2.4.7-1+cuda10.0 \
+    libpng-dev \
+    libgl1-mesa-glx \
+    libsm6 \
+    libxrender1 \
+    libzmq3-dev \
+    pkg-config \
+    software-properties-common \
+    unzip \
+    lsb-core \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && \
+    apt-get install nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 && \
+    apt-get update && \
+    apt-get install -y --no-install-recommends \
+    libnvinfer5=5.1.5-1+cuda10.0 \
+    libnvinfer6=6.0.1-1+cuda10.0 \
+    libnvinfer7=7.0.0-1+cuda10.0 \
+    libnvinfer-dev=5.1.5-1+cuda10.0 \
+    libnvinfer-dev=6.0.1-1+cuda10.0 \
+    libnvinfer-dev=7.0.0-1+cuda10.0 \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+
+# install cuda/bin
+# RUN mkdir -p /usr/local/cuda-10.1/bin
+# COPY /usr/local/cuda-10.1/bin/ /usr/local/cuda-10.1/bin/
+
+# Install conda with pip and python 3.6
+ARG CONDA_ENVIORNMENT
+RUN apt-get update --fix-missing && apt-get -y upgrade && \
+    apt-get install -y curl bzip2 && \
+    curl -sSL https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh && \
+    bash /tmp/miniconda.sh -bfp /usr/local && \
+    rm -rf /tmp/miniconda.sh && \
+    conda create -y --name $CONDA_ENVIORNMENT python=3.6 && \
+    conda clean --all --yes
+ENV PATH /usr/local/envs/$CONDA_ENVIORNMENT/bin:$PATH
+
+RUN pip install --upgrade pip
+ENV PYTHONUNBUFFERED 1
+
+ARG DOCKER_WORKDIR_PATH
+RUN mkdir -p $DOCKER_WORKDIR_PATH
+WORKDIR $DOCKER_WORKDIR_PATH
+ENV PYTHONPATH $DOCKER_WORKDIR_PATH
+
+# Install python dependencies
+COPY singa_auto/ singa_auto/
+
+RUN mkdir -p /root/.config/pip/
+COPY ./.config/pip/pip.conf /root/.config/pip/pip.conf
+
+RUN pip install -r singa_auto/requirements.txt
+RUN pip install -r singa_auto/utils/requirements.txt
+RUN pip install -r singa_auto/meta_store/requirements.txt
+RUN pip install -r singa_auto/redis/requirements.txt
+RUN pip install -r singa_auto/kafka/requirements.txt
+RUN pip install -r singa_auto/advisor/requirements.txt
+
+COPY scripts/ scripts/
+RUN mkdir data/
+
+CMD ["python", "scripts/start_worker.py"]
diff --git a/dockerfiles/worker_cu101.Dockerfile b/dockerfiles/worker_cu101.Dockerfile
new file mode 100644
index 00000000..af161da0
--- /dev/null
+++ b/dockerfiles/worker_cu101.Dockerfile
@@ -0,0 +1,117 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+FROM nvidia/cuda:10.1-base-ubuntu16.04
+
+RUN apt-get update && apt-get -y upgrade && \
+    apt-get install -y vim && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# `tensorflow-gpu` dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    cuda-command-line-tools-10-1 \
+    cuda-cufft-10-1 \
+    cuda-curand-10-1 \
+    cuda-cusolver-10-1 \
+    cuda-cusparse-10-1 \
+    libcublas10=10.2.3.254-1 \
+    libcublas-dev=10.2.3.254-1 \
+    libcudnn7=7.6.4.38-1+cuda10.1 \
+    libcudnn7-dev=7.6.4.38-1+cuda10.1 \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    libgl1-mesa-glx \
+    libsm6 \
+    libxrender1 \
+    libzmq3-dev \
+    pkg-config \
+    software-properties-common \
+    unzip \
+    lsb-core \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# cuda-10.1 package install cublas in cuda-10.2
+# call ldconfig to link them
+RUN cp -r /usr/local/cuda-10.2/* /usr/local/cuda-10.1/ && \
+    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/ && \
+    ldconfig /etc/ld.so.conf.d
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    libnvinfer5=5.1.5-1+cuda10.1 \
+    libnvinfer6=6.0.1-1+cuda10.1 \
+    libnvinfer-dev=5.1.5-1+cuda10.1 \
+    libnvinfer-dev=6.0.1-1+cuda10.1 \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+
+# install cuda/bin
+# RUN mkdir -p /usr/local/cuda-10.1/bin
+# COPY /usr/local/cuda-10.1/bin/ /usr/local/cuda-10.1/bin/
+
+# Install conda with pip and python 3.6
+ARG CONDA_ENVIORNMENT
+RUN apt-get update --fix-missing && apt-get -y upgrade && \
+    apt-get install -y curl bzip2 && \
+    curl -sSL https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh && \
+    bash /tmp/miniconda.sh -bfp /usr/local && \
+    rm -rf /tmp/miniconda.sh && \
+    conda create -y --name $CONDA_ENVIORNMENT python=3.6 && \
+    conda clean --all --yes
+ENV PATH /usr/local/envs/$CONDA_ENVIORNMENT/bin:$PATH
+
+RUN pip install --upgrade pip
+ENV PYTHONUNBUFFERED 1
+
+ARG DOCKER_WORKDIR_PATH
+RUN mkdir -p $DOCKER_WORKDIR_PATH
+WORKDIR $DOCKER_WORKDIR_PATH
+ENV PYTHONPATH $DOCKER_WORKDIR_PATH
+
+# Install python dependencies
+COPY singa_auto/ singa_auto/
+
+RUN mkdir -p /root/.config/pip/
+COPY ./.config/pip/pip.conf /root/.config/pip/pip.conf
+
+COPY ./backup_lib/torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl /root/torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl
+RUN pip install /root/torch-1.6.0-cp36-cp36m-manylinux1_x86_64.whl
+COPY ./backup_lib/opencv_python-4.4.0.46-cp36-cp36m-manylinux2014_x86_64.whl /root/opencv_python-4.4.0.46-cp36-cp36m-manylinux2014_x86_64.whl
+RUN pip install /root/opencv_python-4.4.0.46-cp36-cp36m-manylinux2014_x86_64.whl
+
+RUN pip install -r singa_auto/requirements.txt
+RUN pip install -r singa_auto/utils/requirements.txt
+RUN pip install -r singa_auto/meta_store/requirements.txt
+RUN pip install -r singa_auto/redis/requirements.txt
+RUN pip install -r singa_auto/kafka/requirements.txt
+RUN pip install -r singa_auto/advisor/requirements.txt
+RUN pip install -r singa_auto/worker/requirements.txt
+
+COPY scripts/ scripts/
+RUN mkdir data/
+
+CMD ["python", "scripts/start_worker.py"]
diff --git a/dockerfiles/worker_cu110.Dockerfile b/dockerfiles/worker_cu110.Dockerfile
new file mode 100644
index 00000000..d86d2cc7
--- /dev/null
+++ b/dockerfiles/worker_cu110.Dockerfile
@@ -0,0 +1,123 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+FROM nvidia/cuda:11.0-base-ubuntu16.04
+
+RUN apt-get update && apt-get -y upgrade && \
+    apt-get install -y vim && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# `tensorflow-gpu` dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    cuda-command-line-tools-11-0 \
+    cuda-cudart-11-0 \
+    cuda-cudart-dev-11-0 \
+    # cuda-cufft-11-0 \
+    # cuda-curand-11-0 \
+    # cuda-cusolver-11-0 \
+    # cuda-cusparse-11-0 \
+    # libcublas-11-0==11.2.0.252-1 \
+    # libcublas-dev-11-0==11.2.0.252-1 \
+    # libcudnn8==8.0.4.30-1+cuda11.0 \
+    # libcudnn8-dev==8.0.4.30-1+cuda11.0 \
+    libcufft-11-0 \
+    libcufft-dev-11-0 \
+    libcurand-11-0 \
+    libcurand-dev-11-0 \
+    libcusolver-11-0 \
+    libcusolver-dev-11-0 \
+    libcusparse-11-0 \
+    libcusparse-dev-11-0 \
+    libcublas-11-0 \
+    libcublas-dev-11-0 \
+    libcudnn8 \
+    libcudnn8-dev \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    libgl1-mesa-glx \
+    libsm6 \
+    libxrender1 \
+    libzmq3-dev \
+    pkg-config \
+    software-properties-common \
+    unzip \
+    lsb-core \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# # cuda-10.1 package install cublas in cuda-10.2
+# # call ldconfig to link them
+# RUN cp -r /usr/local/cuda-10.2/* /usr/local/cuda-10.1/ && \
+#     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/ && \
+#     ldconfig /etc/ld.so.conf.d
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    libnvinfer7=7.1.3-1+cuda11.0 \
+    libnvinfer-dev=7.1.3-1+cuda11.0 \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+
+# install cuda/bin
+# RUN mkdir -p /usr/local/cuda-10.1/bin
+# COPY /usr/local/cuda-10.1/bin/ /usr/local/cuda-10.1/bin/
+
+# Install conda with pip and python 3.6
+ARG CONDA_ENVIORNMENT
+RUN apt-get update --fix-missing && apt-get -y upgrade && \
+    apt-get install -y curl bzip2 && \
+    curl -sSL https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh && \
+    bash /tmp/miniconda.sh -bfp /usr/local && \
+    rm -rf /tmp/miniconda.sh && \
+    conda create -y --name $CONDA_ENVIORNMENT python=3.6 && \
+    conda clean --all --yes
+ENV PATH /usr/local/envs/$CONDA_ENVIORNMENT/bin:$PATH
+
+RUN pip install --upgrade pip
+ENV PYTHONUNBUFFERED 1
+
+ARG DOCKER_WORKDIR_PATH
+RUN mkdir -p $DOCKER_WORKDIR_PATH
+WORKDIR $DOCKER_WORKDIR_PATH
+ENV PYTHONPATH $DOCKER_WORKDIR_PATH
+
+# Install python dependencies
+COPY singa_auto/ singa_auto/
+
+RUN mkdir -p /root/.config/pip/
+COPY ./.config/pip/pip.conf /root/.config/pip/pip.conf
+
+RUN pip install -r singa_auto/requirements.txt
+RUN pip install -r singa_auto/utils/requirements.txt
+RUN pip install -r singa_auto/meta_store/requirements.txt
+RUN pip install -r singa_auto/redis/requirements.txt
+RUN pip install -r singa_auto/kafka/requirements.txt
+RUN pip install -r singa_auto/advisor/requirements.txt
+
+COPY scripts/ scripts/
+RUN mkdir data/
+
+CMD ["python", "scripts/start_worker.py"]
diff --git a/dockerfiles/worker_cu90.Dockerfile b/dockerfiles/worker_cu90.Dockerfile
new file mode 100644
index 00000000..7a079432
--- /dev/null
+++ b/dockerfiles/worker_cu90.Dockerfile
@@ -0,0 +1,90 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+FROM nvidia/cuda:9.0-base-ubuntu16.04
+
+RUN apt-get update && apt-get -y upgrade
+
+# `tensorflow-gpu` dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    cuda-command-line-tools-9-0 \
+    cuda-cublas-9-0 \
+    cuda-cufft-9-0 \
+    cuda-curand-9-0 \
+    cuda-cusolver-9-0 \
+    cuda-cusparse-9-0 \
+    libcudnn7=7.2.1.38-1+cuda9.0 \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libnccl2=2.2.13-1+cuda9.0 \
+    libpng12-dev \
+    libgl1-mesa-glx \
+    libsm6 \
+    libxrender1 \
+    libzmq3-dev \
+    pkg-config \
+    software-properties-common \
+    unzip \
+    && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && \
+    apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
+    apt-get update && \
+    apt-get install libnvinfer4=4.1.2-1+cuda9.0
+
+# Install conda with pip and python 3.6
+ARG CONDA_ENVIORNMENT
+RUN apt-get update --fix-missing && apt-get -y upgrade && \
+    apt-get install -y curl bzip2 && \
+    curl -sSL https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o /tmp/miniconda.sh && \
+    bash /tmp/miniconda.sh -bfp /usr/local && \
+    rm -rf /tmp/miniconda.sh && \
+    conda create -y --name $CONDA_ENVIORNMENT python=3.6 && \
+    conda clean --all --yes
+ENV PATH /usr/local/envs/$CONDA_ENVIORNMENT/bin:$PATH
+
+RUN pip install --upgrade pip
+ENV PYTHONUNBUFFERED 1
+
+ARG DOCKER_WORKDIR_PATH
+RUN mkdir -p $DOCKER_WORKDIR_PATH
+WORKDIR $DOCKER_WORKDIR_PATH
+ENV PYTHONPATH $DOCKER_WORKDIR_PATH
+
+# Install python dependencies
+COPY singa_auto/ singa_auto/
+
+RUN mkdir -p /root/.config/pip/
+COPY ./.config/pip/pip.conf /root/.config/pip/pip.conf
+
+RUN pip install -r singa_auto/requirements.txt
+RUN pip install -r singa_auto/utils/requirements.txt
+RUN pip install -r singa_auto/meta_store/requirements.txt
+RUN pip install -r singa_auto/redis/requirements.txt
+RUN pip install -r singa_auto/kafka/requirements.txt
+RUN pip install -r singa_auto/advisor/requirements.txt
+
+COPY scripts/ scripts/
+RUN mkdir data/
+
+CMD ["python", "scripts/start_worker.py"]
diff --git a/examples/data/image_segmentaion/2007_000862.jpg b/examples/data/image_segmentaion/2007_000862.jpg
new file mode 100644
index 00000000..d2eb8f81
Binary files /dev/null and b/examples/data/image_segmentaion/2007_000862.jpg differ
diff --git a/examples/data/image_segmentaion/2007_001397.jpg b/examples/data/image_segmentaion/2007_001397.jpg
new file mode 100644
index 00000000..8fbf68f8
Binary files /dev/null and b/examples/data/image_segmentaion/2007_001397.jpg differ
diff --git a/examples/data/image_segmentaion/Persian_120.jpg b/examples/data/image_segmentaion/Persian_120.jpg
new file mode 100644
index 00000000..90097e16
Binary files /dev/null and b/examples/data/image_segmentaion/Persian_120.jpg differ
diff --git a/examples/data/image_segmentaion/pomeranian_159.jpg b/examples/data/image_segmentaion/pomeranian_159.jpg
new file mode 100644
index 00000000..48f501af
Binary files /dev/null and b/examples/data/image_segmentaion/pomeranian_159.jpg differ
diff --git a/examples/data/object_detection/cat.jpg b/examples/data/object_detection/cat.jpg
new
file mode 100644 index 00000000..d9331e09 Binary files /dev/null and b/examples/data/object_detection/cat.jpg differ diff --git a/examples/models/image_object_detection/SaYolo.py b/examples/models/image_object_detection/SaYolo.py new file mode 100644 index 00000000..8378ba11 --- /dev/null +++ b/examples/models/image_object_detection/SaYolo.py @@ -0,0 +1,664 @@ +import os +os.environ['CUDA_VISIBLE_DEVICES'] = "0" + +import sys +sys.path.append(os.getcwd()) + +import base64 +import copy +import cv2 +import io +import json +import logging +import numpy as np +import random +import tempfile +import torch +import torchvision +import zipfile + +import PIL + +from PIL import Image +from PIL import ImageFile +ImageFile.LOAD_TRUNCATED_IMAGES = True + +from terminaltables import AsciiTable +from torch.optim import lr_scheduler +from torch.utils.data import DataLoader +from torchvision.transforms import transforms +from typing import List + +# Singa-auto Dependency +from singa_auto.darknet.model import DarkNet +from singa_auto.darknet.utils import ap_per_class +from singa_auto.darknet.utils import get_batch_statistics +from singa_auto.darknet.utils import non_max_suppression +from singa_auto.darknet.utils import pad_to_square +from singa_auto.darknet.utils import rescale_boxes +from singa_auto.darknet.utils import resize +from singa_auto.darknet.utils import weights_init_normal +from singa_auto.darknet.utils import xywh2xyxy +from singa_auto.datasets.image_detection_dataset import YoloDataset +from singa_auto.datasets.image_detection_dataset import fetch_from_train_set +from singa_auto.datasets.image_detection_dataset import split_dataset +from singa_auto.model.dev import test_model_class +from singa_auto.model.knob import FixedKnob +# from singa_auto.model.model import BaseModel +from singa_auto.model.object_detection import ObjtDetModel +from singa_auto.model.utils import utils + + +logger = logging.getLogger(__name__) + + +class SaYolo(ObjtDetModel): + """ + implements a yolo + """ + def __init__(self, **knobs): + super().__init__(**knobs) + self._knobs = knobs + + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + print("using device", self.device) + logger.info("using device", self.device) + + self.model = None + self.dataset_name = None + self.gradient_accumulations = 2 + + # default is cat , only one class + self.filter_classes = ['cat'] + + # # make sure results folder exists + # if os.path.exists(r"./results/"): + # import shutil + # shutil.rmtree(r"./results/") + # os.makedirs(r"./results/") + + @staticmethod + def get_knob_config(): + return { + "conf_thresh": FixedKnob(0.1), + "lr": FixedKnob(0.01), + "model_def": FixedKnob("./singa_auto/darknet/yolov3-tiny.cfg"), + "nms_thresh": FixedKnob(0.2), + "pretrained_weights": FixedKnob("./singa_auto/darknet/darknet53.conv.74"), + } + + def is_predict_valid(self, box_info, class_info, image_size): + """ + make sure predicted result is valid, ie coordinates and labels are correct + """ + if box_info[6] - 1 in range(len(class_info)) and min(box_info[0:4]) >= 0 and max(box_info[0:4]) < image_size: + return True + else: + return False + + def __collate_fn(self, batch): + return tuple(zip(*batch)) + + def train(self, dataset_path, **kwargs): + logger.info("Training params: {}".format(json.dumps(kwargs))) + + # num_classes = len(self._knobs.get("filter_classes")) + num_epoch = kwargs["num_epoch"] if "num_epoch" in kwargs else 2 + batch_size = kwargs["batch_size"] if "batch_size" in kwargs else 8 + + if 
"filter_classes" in kwargs: + self.filter_classes = kwargs["filter_classes"] + + print("{} in train.".format(self.filter_classes)) + logger.info("{} in train.".format(self.filter_classes)) + + # root_path = r"/home/taomingyang/dataset/coco_mini_cat/" + + # load + dataset_zipfile = zipfile.ZipFile(dataset_path, 'r') + train_folder = tempfile.TemporaryDirectory() + dataset_zipfile.extractall(path=train_folder.name) + root_path = train_folder.name + print("root_path: {}".format(root_path)) + logger.info("root_path: {}".format(root_path)) + + print("prepare dataset") + logger.info("prepare dataset") + if os.path.isdir(os.path.join(root_path, "image")): + print("split train/val subsets...") + logger.info("split train/val subsets...") + split_dataset(root_path) + elif os.path.isdir(os.path.join(root_path, "train")): + if not os.path.exists(os.path.join(root_path, "val")): + logger.info("fetch val from train") + fetch_from_train_set(root_path) + else: + print("unsupported dataset format!") + logger.info("unsupported dataset format!") + return None + + image_train = os.path.join(root_path, "train", "image") + image_val = os.path.join(root_path, "val", "image") + annotation_train = os.path.join(root_path, "train", "annotation") + annotation_val = os.path.join(root_path, "val", "annotation") + + # Get dataloader + dataset_train = YoloDataset( + image_train, + annotation_train, + is_single_json_file=False, + filter_classes=self.filter_classes, + is_train=True, + augment=True, + multiscale=True + ) + # Get dataloader + dataset_test = YoloDataset( + image_val, + annotation_val, + is_single_json_file=False, + filter_classes=self.filter_classes, + is_train=False, + augment=False, + multiscale=False + ) + + print("Training the model YOLO using {}".format(self.device)) + logger.info("Training the model YOLO using {}".format(self.device)) + + # define training and validation data loaders + data_loader_train = torch.utils.data.DataLoader( + dataset_train, batch_size=batch_size, shuffle=True, collate_fn=dataset_train.collate_fn + ) + + data_loader_test = torch.utils.data.DataLoader( + dataset_test, batch_size=batch_size, shuffle=False, collate_fn=dataset_test.collate_fn + ) + + # get the model using our helper function + self.model = DarkNet(config_path=self._knobs.get("model_def")).to(self.device) + self.model.apply(weights_init_normal) + + # pretrained weights + if self._knobs.get("pretrained_weights"): + pretrained_weights_path = self._knobs.get("pretrained_weights") + if pretrained_weights_path.endswith(".pth"): + if os.path.exists(pretrained_weights_path): + self.model.load_state_dict(torch.load(pretrained_weights_path, map_location="cpu")) + logger.info("using pretrained_weights {}".format(pretrained_weights_path)) + else: + logger.warning("pretrained_weights {} not exists.".format(pretrained_weights_path)) + else: + if not os.path.exists(pretrained_weights_path): + import wget + os.makedirs(os.path.dirname(pretrained_weights_path), exist_ok=True) + pretrained_weights_path = wget.download(r"https://pjreddie.com/media/files/darknet53.conv.74", out=os.path.dirname(pretrained_weights_path)) + self.model.load_darknet_weights(pretrained_weights_path) + logger.info("using pretrained_weights {}".format(pretrained_weights_path)) + + # # move model to the right device + # self.model.to(self.device) + + # construct an optimizer + optimizer = torch.optim.Adam( + self.model.parameters(), + lr=self._knobs.get("lr"), + ) + + torch.manual_seed(1) + + for epoch in range(num_epoch): + # train for one epoch, printing 
every 10 iterations + loss_value = self._train_one_epoch(self.model, optimizer, data_loader_train, epoch) + + print("loss is {}".format(loss_value)) + logger.info("loss is {}".format(loss_value)) + + if loss_value is None: + break + + # update the learning rate + # lr_scheduler.step() + + print("begin to evalute after epoch: {}".format(epoch)) + logger.info("begin to evalute after epoch: {}".format(epoch)) + precision, recall, AP, f1, ap_class = self._evaluate(data_loader_test) + print("Average Precisions:") + logger.info("Average Precisions:") + for i, c in enumerate(ap_class): + info_str = "\t+ Class \"{}\" ({}) - AP: {:.5f}".format(c, dataset_test.coco.cats[dataset_test.label_to_cat[c]]['name'], AP[i]) + print(info_str) + logger.info(info_str) + print("mAP: {:.9f}".format(AP.mean())) + logger.info("mAP: {:.9f}".format(AP.mean())) + + def _train_one_epoch(self, model, optimizer, data_loader, epoch): + model.train() + + # lr_scheduler = None + # if epoch == 0: + # warmup_factor = 1. / 1000 + # warmup_iters = min(1000, len(data_loader) - 1) + # + # lr_scdheduler = self.__warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) + + logger.info("On Epoch {}, begin to train".format(epoch)) + # loss_value = 0 + + metrics = [ + "grid_size", + "loss", + "x", + "y", + "w", + "h", + "conf", + "cls", + "cls_acc", + "recall50", + "recall75", + "precision", + "conf_obj", + "conf_noobj", + ] + + for batch_i, (_, images, targets) in enumerate(data_loader): + logger.info("\t batch {}/{}".format(batch_i, len(data_loader))) + batches_done = len(data_loader) * epoch + batch_i + + images = images.to(self.device) + targets = targets.to(self.device) + + loss, outputs = model(images, targets) + + if not np.math.isfinite(loss): + logger.info("Loss is {}, stopping training".format(loss)) + return None + + loss.backward() + + if batches_done % self.gradient_accumulations: + optimizer.step() + optimizer.zero_grad() + + # log_str = "\n---- [Epoch %d, Batch %d/%d] ----\n" % (epoch, batch_i, len(data_loader)) + # + # metric_table = [["Metrics", *[f"YOLO Layer {i}" for i in range(len(self.model.yolo_layers))]]] + # + # # Log metrics at each YOLO layer + # for i, metric in enumerate(metrics): + # formats = {m: "%.6f" for m in metrics} + # formats["grid_size"] = "%2d" + # formats["cls_acc"] = "%.2f%%" + # row_metrics = [formats[metric] % yolo.metrics.get(metric, 0) for yolo in self.model.yolo_layers] + # metric_table += [[metric, *row_metrics]] + # + # # # Tensorboard logging + # # tensorboard_log = [] + # # for j, yolo in enumerate(model.yolo_layers): + # # for name, metric in yolo.metrics.items(): + # # if name != "grid_size": + # # tensorboard_log += [(f"{name}_{j+1}", metric)] + # # tensorboard_log += [("loss", loss.item())] + # # summary_logger.list_of_scalars_summary(tensorboard_log, batches_done) + # + # log_str += AsciiTable(metric_table).table + # log_str += f"\nTotal loss {loss.item()}" + # print(log_str) + # logger.info(log_str) + # + # if lr_scheduler is not None: + # lr_scheduler.step() + + model.seen += images.size(0) + + return loss.item() + + def dump_parameters(self): + """ + dump parameters to local file + """ + params = {} + with tempfile.NamedTemporaryFile() as tmp: + # Save whole model to temp h5 file + torch.save(self.model.state_dict(), tmp.name) + # Read from temp h5 file & encode it to base64 string + with open(tmp.name, 'rb') as f: + weight_base64 = f.read() + params['weight_base64'] = base64.b64encode(weight_base64).decode('utf-8') + params["module_cfg"] = json.dumps(self.model.model_cfg) + 
return params + + def load_parameters(self, params): + """ + load parameters from local file + """ + + logger.info("load parameters") + weight_base64 = params['weight_base64'] + self.module_cfg = json.loads(params["module_cfg"]) + + weight_base64_bytes = base64.b64decode(weight_base64.encode('utf-8')) + + state_dict = torch.load(io.BytesIO(weight_base64_bytes), map_location=self.device) + self.model = DarkNet(model_cfg=self.module_cfg).to(self.device) + self.model.load_state_dict(state_dict) + # self.model.cuda() + + def evaluate(self, dataset_path, **kwargs): + print(kwargs) + + # root_path = r"/home/taomingyang/dataset/coco_mini_cat/" + + # load + dataset_zipfile = zipfile.ZipFile(dataset_path, 'r') + evaluate_folder = tempfile.TemporaryDirectory() + dataset_zipfile.extractall(path=evaluate_folder.name) + root_path = evaluate_folder.name + print(root_path) + logger.info("root_path: {}".format(root_path)) + + print("prepare dataset") + if os.path.isdir(os.path.join(root_path, "image")): + print("split train/val subsets...") + logger.info("split train/val subsets...") + split_dataset(root_path) + elif os.path.isdir(os.path.join(root_path, "train")): + if not os.path.exists(os.path.join(root_path, "val")): + fetch_from_train_set(root_path) + logger.info("fetch val from train") + else: + print("unsupported dataset format!") + logger.info("unsupported dataset format!") + return None + + image_val = os.path.join(root_path, "val", "image") + annotation_val = os.path.join(root_path, "val", "annotation") + + dataset_valid = YoloDataset( + image_val, + annotation_val, + is_single_json_file=False, + filter_classes=self.filter_classes, + is_train=False, + ) + data_loader_valid = torch.utils.data.DataLoader( + dataset_valid, + batch_size=1, + shuffle=False, + collate_fn=dataset_valid.collate_fn + ) + + logger.info("dataset prepared") + + # perform an evaluate + precision, recall, AP, f1, ap_class = self._evaluate(data_loader_valid) + + return np.mean(precision) + + @torch.no_grad() + def _evaluate(self, data_loader): + self.model.eval() + + labels = [] + sample_metrics = [] # List of tuples (TP, confs, pred) + + for batch_i, (names, images, targets) in enumerate(data_loader): + # Extract labels + labels += targets[:, 1].tolist() + # Rescale target + targets[:, 2:] = xywh2xyxy(targets[:, 2:]) + targets[:, 2:] *= 416 + + images = images.to(self.device) + + with torch.no_grad(): + outputs = self.model(images) + outputs = non_max_suppression(outputs, conf_thresh=self._knobs.get("conf_thresh"), nms_thresh=self._knobs.get("nms_thresh")) + + for name, image, output in zip(names, images, outputs): + tmp = image.cpu().detach().permute((1, 2, 0)).mul(255).clamp(0, 255).numpy() + tmp = cv2.cvtColor(tmp, cv2.COLOR_RGB2BGR) + if output is not None: + for rect_info in output: + coord = rect_info.cpu().numpy() + if self.is_predict_valid(coord, self.filter_classes, image.size(-1)): + cv2.rectangle(tmp, (coord[0], coord[1]), (coord[2], coord[3]), (0, 255, 0), 3) + + cv2.imwrite('./results/{}'.format(os.path.basename(name)), tmp) + + sample_metrics += get_batch_statistics(outputs, targets, iou_thresh=0.5) + + # return score, evaluate_res_str + if 0 == len(sample_metrics): + ap_class = np.array(list(set(labels)), dtype=np.int32) + precision = recall = AP = f1 = np.array([0 for x in ap_class], dtype=np.float64) + else: + # Concatenate sample statistics + true_positives, pred_scores, pred_labels = [np.concatenate(x, 0) for x in list(zip(*sample_metrics))] + precision, recall, AP, f1, ap_class = 
ap_per_class(true_positives, pred_scores, pred_labels, labels) + + return precision, recall, AP, f1, ap_class + + def predict(self, queries: List[PIL.Image.Image]) -> List[dict]: + """ + predict with trained model + """ + + os.makedirs("./results/", exist_ok=True) + + result = list() + + for img in queries: + img_res = dict() + + if isinstance(img, List): + print(len(img)) + img = np.array(img[0]) + img_data = Image.fromarray(np.uint8(img)) + elif isinstance(img, np.ndarray): + img_data = Image.fromarray(img) + else: + img_data = img + + # get prediction + res = self.__get_prediction(img_data) + if res is None: + img_with_box = img_with_segmentation = img_data + boxes, pred_cls = None, None + else: + boxes, pred_cls = res + img_data = np.asarray(img_data).astype(np.uint8) + img_with_box = self.__get_bounding_box(img_data, boxes, pred_cls) + + # the response format is only used to show on origin web ui + img_res['explanations'] = {} + # img_res['explanations']['lime_img'] = self.__convert_img_to_str(img_with_box) + # img_res['explanations']['box_info'] = boxes + # img_res['explanations']['classes'] = pred_cls + + img_res['explanations']['box_info'] = [] + + if boxes is not None and pred_cls is not None and len(boxes) == len(pred_cls): + for box_coord, class_name in zip(boxes, pred_cls): + img_res['explanations']['box_info'].append({ + "coord": box_coord, + "class_name": class_name, + }) + img_res['mc_dropout'] = [] + + result.append(img_res) + + return result + + def __warmup_lr_scheduler(self, optimizer, warmup_iters, warmup_factor): + def f(x): + if x >= warmup_iters: + return 1 + alpha = float(x) / warmup_iters + return warmup_factor * (1 - alpha) + alpha + + return torch.optim.lr_scheduler.LambdaLR(optimizer, f) + + def __get_prediction(self, img): + self.model.eval() + + img = torchvision.transforms.ToTensor()(img) + # Handle images with less than three channels + if len(img.shape) != 3: + img = img.unsqueeze(0) + img = img.expand((3, img.shape[1], img.shape[2])) + elif len(img.shape) == 3 and img.shape[0] == 1: + img = img.expand((3, img.shape[1], img.shape[2])) + + ori_size = img.shape[-2:] + + # Pad to square resolution + img, pad = pad_to_square(img, 0) + img = torch.unsqueeze(resize(img, 416), 0) + img = img.to(self.device) + pred = self.model(img) + pred = non_max_suppression(pred, conf_thresh=self._knobs.get("conf_thresh"), nms_thresh=self._knobs.get("nms_thresh")) + pred_class = [] + pred_boxes = [] + if pred[0] is None: + return None + + box_info = rescale_boxes(pred[0], 416, ori_size) + num_box = box_info.size()[0] + + # get predicted info + for rect_info in box_info: + coord = rect_info.cpu().numpy() + if self.is_predict_valid(coord, self.filter_classes, img.size(-1)): + pred_class.append(self.filter_classes[np.int(coord[6])-1]) + pred_boxes.append((np.int(coord[0]), np.int(coord[1]), np.int(coord[2]), np.int(coord[3]))) + + if len(pred_boxes) == 0: + return None + else: + return pred_boxes, pred_class + + def __get_bounding_box(self, img, boxes, pred_cls, rect_th=3, text_size=1, text_th=3): + """ + draw the bounding box on img + """ + + img = copy.deepcopy(img) + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + for i in range(len(boxes)): + cv2.rectangle(img, (boxes[i][0], boxes[i][1]), (boxes[i][2], boxes[i][3]), (0, 255, 0), rect_th) + cv2.putText(img, pred_cls[i], (boxes[i][0], boxes[i][1]), cv2.FONT_HERSHEY_SIMPLEX, text_size, (0, 255, 0), + thickness=text_th) + + cv2.imwrite("./results/{:04d}.png".format(random.randint(0, 9999)), img) + return img + + def 
__get_segmentation(self, img, masks): + """ + draw the segmentation box on img + """ + def random_colour_masks(image): + """ + for display the prediction image + """ + colours = [ + [0, 255, 0], + [0, 0, 255], + [255, 0, 0], + [0, 255, 255], + [255, 255, 0], + [255, 0, 255], + [80, 70, 180], + [250, 80, 190], + [245, 145, 50], + [70, 150, 250], + [50, 190, 190] + ] + r = np.zeros_like(image).astype(np.uint8) + g = np.zeros_like(image).astype(np.uint8) + b = np.zeros_like(image).astype(np.uint8) + r[image == 1], g[image == 1], b[image == 1] = colours[random.randrange(0, 10)] + coloured_mask = np.stack([r, g, b], axis=2) + return coloured_mask + + img = copy.deepcopy(img) + for i in range(len(masks)): + rgb_mask = random_colour_masks(masks[i]) + img = cv2.addWeighted(img, 1, rgb_mask, 0.5, 0) + return img + + def __convert_img_to_str(self, arr): + im = Image.fromarray(arr.astype("uint8")) + rawBytes = io.BytesIO() + encoding = 'utf-8' + im.save(rawBytes, "PNG") + rawBytes.seek(0) + return base64.b64encode(rawBytes.read()).decode(encoding) + + def __get_iou_types(self, model): + model_without_ddp = model + if isinstance(model, torch.nn.parallel.DistributedDataParallel): + model_without_ddp = model.module + iou_types = ["bbox"] + if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN): + iou_types.append("segm") + if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN): + iou_types.append("keypoints") + return iou_types + + +if __name__ == "__main__": + import argparse + from singa_auto.model.dev import test_model_class + + parser = argparse.ArgumentParser() + parser.add_argument( + '--train_path', + type=str, + default='/home/taomingyang/dataset/package/coco_mini.zip', + help='Path to train dataset' + ) + parser.add_argument( + '--val_path', + type=str, + default='/home/taomingyang/dataset/package/coco_mini.zip', + help='Path to validation dataset' + ) + parser.add_argument( + '--query_path', + type=str, + default='./examples/data/object_detection/cat.jpg', + help='Path(s) to query image(s), delimited by commas' + ) + + (args, _) = parser.parse_known_args() + + queries = utils.dataset.load_images(args.query_path.split(',')) + test_model_class( + model_file_path=__file__, + model_class='SaYolo', + task='OBJECT_DETECTION', + dependencies={ + "opencv-python": "4.4.0.46", + "terminaltables": "3.1.0", + "torch": "1.4.0", + "torchvision": "0.5.0", + }, + train_dataset_path=args.train_path, + val_dataset_path=args.val_path, + train_args={ + "batch_size": 8, + "model_def": "./singa_auto/darknet/yolov3-tiny.cfg", + "filter_classes": ['cat'], + "num_epoch": 1, + "pretrained_weights": "./singa_auto/darknet/darknet53.conv.74", + }, + queries=queries + ) + + """ + Test the model out of singa-auto platform + python -c "import torch;print(torch.cuda.is_available())" + """ + diff --git a/examples/models/image_object_detection/food_detection/FoodlgNet.py b/examples/models/image_object_detection/food_detection/FoodlgNet.py new file mode 100644 index 00000000..0441d54e --- /dev/null +++ b/examples/models/image_object_detection/food_detection/FoodlgNet.py @@ -0,0 +1,1339 @@ +import os +from os.path import join +os.environ['CUDA_VISIBLE_DEVICES'] = "0" + +import sys +sys.path.append(os.getcwd()) + +import time +import json +import base64 +import random +import logging +import zipfile +import tempfile +import datetime +import requests +from PIL import Image +from io import BytesIO +from typing import List +from collections import OrderedDict + +from singa_auto.model import 
ObjtDetModel, FixedKnob +from singa_auto.model.dev import test_model_class +from singa_auto.model.utils import dataset + +import torch +import numpy as np +from torch import nn +from torch.utils.data import Dataset, DataLoader +from torchvision import transforms +import torch.nn.functional as F + +logger = logging.getLogger(__name__) + +class FoodlgNet(ObjtDetModel): + + @staticmethod + def get_knob_config(): + return { + 'learning_rate': FixedKnob(1e-10), + 'momentum': FixedKnob(0.7), + 'epoch': FixedKnob(0), + 'batch_size': FixedKnob(4) + } + + def __init__(self, **knobs): + # load model parameters and configurations + self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + # image preprocessing function + self.cls_transform = transforms.Compose([ + transforms.Resize((299,299), interpolation=2), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + + # initialize other parameters + self.lr = knobs.get('learning_rate') + self.momentum = knobs.get('momentum') + self.epoch = knobs.get('epoch') + self.batch_size = knobs.get('batch_size') + + def _initialize_model(self, paths=None): + # two networks, yolo detection network (to train) and resnext classification network (fixed) + + if paths is None: + # here you need to set an extra file server to provide config/pretrained weights files + # the codes below will download files from file server. + # During prediction, this process is not required any more. + dst_folder = tempfile.TemporaryDirectory() + dst_folder_path = dst_folder.name + # object_names_path = load_url(save_path=dst_folder_path, url='http://192.168.100.203:8000/FoodlgNet/classes.names') + model_config_path = load_url(save_path=dst_folder_path, url='http://192.168.100.203:8000/FoodlgNet/foodlg_yolo.cfg') + pretrain_model_path = load_url(save_path=dst_folder_path, url='http://192.168.100.203:8000/FoodlgNet/yolov3_ckpt_SGD_94.pth') + + classify_names_path = load_url(save_path=dst_folder_path, url='http://192.168.100.203:8000/FoodlgNet/food783.name') + classify_model_path = load_url(save_path=dst_folder_path, url='http://192.168.100.203:8000/FoodlgNet/resnext101_ckpt.pth') + + else: + model_config_path = paths['model_config_path'] + pretrain_model_path = paths['pretrain_model_path'] + classify_model_path = paths['classify_model_path'] + classify_names_path = paths['classify_names_path'] + + + # initiate detection model + self.conf_thres = 0.5 + self.nms_thres = 0.4 + self.img_size = 416 + with open(model_config_path, encoding = 'utf-8') as f: + self.model_config_path = f.readlines() + + det_model = Darknet(model_config_path, img_size=self.img_size) + det_model.load_state_dict(torch.load(pretrain_model_path, map_location='cpu')) + det_model = det_model.to(self.device) + det_model.eval() + self.det_model = det_model + self.det_classes = ['food'] + + # initiate classification model + self.num_classes = 783 + self.clf_classes = load_classes(classify_names_path) + + from torchvision.models.resnet import Bottleneck, ResNet + clf_model = ResNet(block=Bottleneck, layers=[3, 4, 23, 3], groups=32, width_per_group=16) + clf_model.fc = nn.Linear(2048, self.num_classes) + + ckpt = torch.load(classify_model_path, map_location='cpu') + ckpt = OrderedDict({k.replace('module.', ''): v for k, v in ckpt.items()}) + clf_model.load_state_dict(ckpt) + clf_model = clf_model.to(self.device) + clf_model.eval() + self.clf_model = clf_model + + + def train(self, dataset_path, shared_params=None, **train_args): + # fine-tune yolov3 
model for detectection part + self._initialize_model() + + # load and process data + dataset_folder = load_zip(dataset_path) + # split dataset and then save to txt files + train_path, valid_path = split_dataset_save(dataset_folder, ratio=0.9) + # load data to torch dataloader + dataset = ListDataset(train_path, augment=True, multiscale=True) + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=self.batch_size, + shuffle=False, + num_workers=8, + pin_memory=True, + collate_fn=dataset.collate_fn, + ) + # other settings for training + optimizer = torch.optim.SGD(self.det_model.parameters(), lr=self.lr, momentum=self.momentum) + + metrics = [ + "grid_size", + "loss", + "x", + "y", + "w", + "h", + "conf", + "cls", + "cls_acc", + "recall50", + "recall75", + "precision", + "conf_obj", + "conf_noobj", + ] + + # start training + start_epoch = 0 + for epoch in range(start_epoch, self.epoch): + self.det_model.train() + start_time = time.time() + for batch_i, (_, imgs, targets) in enumerate(dataloader): + + batches_done = len(dataloader) * epoch + batch_i + + imgs = imgs.to(self.device) + targets = targets.to(self.device) + + loss, outputs = self.det_model(imgs, targets) + loss.backward() + + if batches_done % 2: + # Accumulates gradient before each step + optimizer.step() + optimizer.zero_grad() + + # ---------------- + # Log progress + # ---------------- + + log_str = "\n---- [Epoch %d/%d, Batch %d/%d] ----\n" % (epoch, self.epoch, batch_i, len(dataloader)) + metric_table = [["Metrics", *[f"YOLO Layer {i}" for i in range(len(self.det_model.yolo_layers))]]] + + # Log metrics at each YOLO layer + for i, metric in enumerate(metrics): + formats = {m: "%.6f" for m in metrics} + formats["grid_size"] = "%2d" + formats["cls_acc"] = "%.2f%%" + row_metrics = [formats[metric] % yolo.metrics.get(metric, 0) for yolo in self.det_model.yolo_layers] + metric_table += [[metric, *row_metrics]] + + if batch_i%50==0: + log_str += toAscii(metric_table) + log_str += f"\nTotal loss {loss.item()}" + + # Determine approximate time left for epoch + epoch_batches_left = len(dataloader) - (batch_i + 1) + time_left = datetime.timedelta(seconds=epoch_batches_left * (time.time() - start_time) / (batch_i + 1)) + log_str += f"\n---- ETA {time_left}" + + print(log_str) + + self.det_model.seen += imgs.size(0) + + print("\n---- Evaluating Model ----") + # Evaluate the model on the validation set + precision, recall, AP, f1, ap_class = _evaluate( + self.det_model, + path=valid_path, + iou_thres=0.5, + conf_thres=self.conf_thres, + nms_thres=self.nms_thres, + img_size=self.img_size, + batch_size=self.batch_size, + device=self.device + ) + evaluation_metrics = [ + ("val_precision", precision.mean()), + ("val_recall", recall.mean()), + ("val_mAP", AP.mean()), + ("val_f1", f1.mean()), + ] + + # Print class APs and mAP + ap_table = [["Index", "Class name", "AP"]] + for i, c in enumerate(ap_class): + ap_table += [[c, self.det_classes[c], "%.5f" % AP[i]]] + # print(AsciiTable(ap_table).table) + print(toAscii(ap_table)) + print(f"---- mAP {AP.mean()}") + + + def dump_parameters(self): + params = {} + # get models weights + det_model_weights = self.det_model.state_dict() + clf_model_weights = self.clf_model.state_dict() + + # convert model weights and configs to json format + params['det_model_weights'] = serialize_state_dict(det_model_weights) + params['clf_model_weights'] = serialize_state_dict(clf_model_weights) + params['det_model_cfg'] = json.dumps(self.model_config_path) + params['clf_classes'] = 
json.dumps(self.clf_classes) + + return params + + def evaluate(self, dataset_path, **kargs): + # load and process data + dataset_folder = load_zip(dataset_path) + # split dataset and then save to txt files + train_path, valid_path = split_dataset_save(dataset_folder, ratio=0.9) + # evaluate process + precision, recall, AP, f1, ap_class = _evaluate( + self.det_model, + path=valid_path, + iou_thres=0.5, + conf_thres=self.conf_thres, + nms_thres=self.nms_thres, + img_size=self.img_size, + batch_size=self.batch_size, + device=self.device + ) + + return f1[0] + + def load_parameters(self, params): + paths = {} + # prepare tmp file paths + dst_folder = tempfile.TemporaryDirectory().name + os.mkdir(dst_folder) + + model_config_path = os.path.join(dst_folder,'det_config.pth') + pretrain_model_path = os.path.join(dst_folder, 'det_model.pth') + classify_model_path = os.path.join(dst_folder, 'clf_model.names') + classify_names_path = os.path.join(dst_folder, 'clf_class.cfg') + + # convert params to python object and save to tmp paths + det_model_weights = deserialize_state_dict(params['det_model_weights']) + clf_model_weights = deserialize_state_dict(params['clf_model_weights']) + + torch.save(det_model_weights, pretrain_model_path) + torch.save(clf_model_weights, classify_model_path) + + with open(model_config_path,'w', encoding = 'utf-8') as f: + f.writelines(json.loads(params['det_model_cfg'])) + + with open(classify_names_path,'w', encoding = 'utf-8') as f: + for name in json.loads(params['clf_classes']): + f.write(name + '\n') + + # record these paths + paths['model_config_path'] = model_config_path + paths['pretrain_model_path'] = pretrain_model_path + paths['classify_model_path'] = classify_model_path + paths['classify_names_path'] = classify_names_path + + # initiate model + self._initialize_model(paths=paths) + + + def predict(self, queries): + logger.info(f'the length of queries is {len(queries)}') + # queries is a list of PIL.Image object + ########## + # yolo part + input_imgs = [] + widths = [] + heights = [] + + queries = unifyImageType(queries) + for img in queries: + w, h = img.size + widths.append(w) + heights.append(h) + + img = transforms.ToTensor()(img) # Extract image as PyTorch tensor + img, _ = pad_to_square(img, 0) # Pad to square resolution + img = resize(img, self.img_size) # Resize + input_imgs.append(img.unsqueeze(0)) + + input_imgs = torch.cat(input_imgs) + input_imgs = input_imgs.to(self.device) + + + # Get detections + with torch.no_grad(): + detections = self.det_model(input_imgs) + detections = non_max_suppression(detections, self.conf_thres, self.nms_thres) + + cls_results = [] + for det_res, width, height, img in zip(detections, widths, heights, queries): + + if det_res is None: # no food detected, skip irv2 part + cls_results.append({ + 'explanations': { + 'box_info': [] + }, + 'raw_preds': [], + 'mc_dropout': [], # not used + }) + continue + + ########## + # irv2 part + # pass each detection to classification model + + cropped_imgs = [] + bbox_values = [] + confs = [] + predictions = [] + + det_res = rescale_boxes(det_res, self.img_size, (height, width)) + for x1, y1, x2, y2, conf, cls_conf, cls_pred in det_res: + + x1, y1, x2, y2 = list(map(lambda x: x.tolist(), (x1, y1, x2, y2))) + cropped = img.crop((x1, y1, x2, y2)) # (left, upper, right, lower) + + cropped_imgs.append(cropped) + bbox_values.append([x1, y1, x2, y2]) + confs.append(conf.cpu().item()) + + cropped_imgs = [self.cls_transform(img) for img in cropped_imgs] + test_dataloader = 
DataLoader(cropped_imgs, batch_size=self.batch_size, shuffle=False) + + for batch in test_dataloader: + + batch = batch.to(self.device) + # parallelly batch prediction + with torch.no_grad(): + prediction = self.clf_model(batch) + + predictions.extend(p.cpu().numpy() for p in prediction) + + # post processing to make result compatible with different front ends + result = { + 'explanations':{ + 'box_info': [] + }, + 'raw_preds': [], + 'mc_dropout': [], # not used + } + for idx in range(len(predictions)): + class_id = np.argsort(predictions[idx])[::-1][:1] + str_class = ' '.join(self.clf_classes[i] for i in class_id) + + jbox = {} + jbox['label_id'] = str(class_id[0]) + jbox['label'] = str(str_class) + jbox['probability'] = confs[idx] + + x1, y1, x2, y2 = bbox_values[idx] + jbox['detection_box'] = [ + max(0, y1 / height), + max(0, x1 / width), + min(1, y2 / height), + min(1, x2 / width) + ] + + exp_box = {} + exp_box['coord'] = [int(x1), int(y1), int(x2), int(y2)] + exp_box['class_name'] = str(str_class) + + result['explanations']['box_info'].append(exp_box) + result['raw_preds'].append(jbox) + + cls_results.append(result) + + logger.info(f'Predict result: {cls_results}') + return cls_results + +def unifyImageType(imgs): + # to check if the image is PIL.Image or numpy.ndarray + # and convert all to PIL.Image + results = [] + for img in imgs: + if isinstance(img, List): + # used for accepting image from forkcloud frontend + img = np.uint8(np.array(img[0])) + results.append(Image.fromarray(img)) + elif isinstance(img, np.ndarray): + results.append(Image.fromarray(img)) + else: + results.append(img) + return results + + +def load_zip(zip_path): + logger.info(zip_path) + # extract uploaded zipfile + if not os.path.exists(zip_path): + raise FileNotFoundError(f'zip file {zip_path} does not exist') + + dst_folder = tempfile.TemporaryDirectory().name + zip_data = zipfile.ZipFile(zip_path, 'r') + zip_data.extractall(path=dst_folder) + return dst_folder + +def load_url(save_path, url): + # download file and save + res = requests.get(url, timeout=300) + filename = join(save_path, url.split('/')[-1]) + with open(filename, 'wb') as f: + f.write(res.content) + return filename + +def serialize_state_dict(state_dict): + with tempfile.NamedTemporaryFile() as tmp: + torch.save(state_dict, tmp.name) + with open(tmp.name, 'rb') as f: + weights = f.read() + + return base64.b64encode(weights).decode('utf-8') + +def deserialize_state_dict(b64bytes): + b64bytes = base64.b64decode(b64bytes.encode('utf-8')) + state_dict = torch.load(BytesIO(b64bytes), map_location='cpu') + + return state_dict + +def split_dataset_save(dataset_folder, ratio = 0.9): + image_paths = os.listdir(join(dataset_folder, 'images')) + image_paths = [join(dataset_folder, 'images', path) for path in image_paths] + + # split dataset + train_size = round(len(image_paths) * ratio) + valid_size = len(image_paths) - train_size + random.seed(42) + random_idx = random.sample(range(len(image_paths)), k=train_size) + random_idx = sorted(random_idx) + + train_set = [] + valid_set = [] + idx_pointer = 0 + for i, path in enumerate(image_paths): + if i == random_idx[idx_pointer]: + train_set.append(path) + idx_pointer += 1 + else: + valid_set.append(path) + + train_file_path = join(dataset_folder, 'train.txt') + valid_file_path = join(dataset_folder, 'valid.txt') + with open(train_file_path, 'w') as f: + for line in train_set: + f.write(line + '\n') + + with open(valid_file_path, 'w') as f: + for line in valid_set: + f.write(line + '\n') + + return 
train_file_path, valid_file_path + + +def toAscii(data_list): + # convert evaluate result list to string + res = '' + for line in data_list: + line = [str(l) for l in line] + line = ','.join(line) + '\n' + res += line + return res + +def _evaluate(model, path, iou_thres, conf_thres, nms_thres, img_size, batch_size, device): + model.eval() + + # Get dataloader + dataset = ListDataset(path, img_size=img_size, augment=False, multiscale=False) + dataloader = torch.utils.data.DataLoader( + dataset, batch_size=batch_size, shuffle=False, num_workers=1, collate_fn=dataset.collate_fn + ) + + Tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor + + labels = [] + sample_metrics = [] # List of tuples (TP, confs, pred) + for batch_i, (_, imgs, targets) in enumerate(dataloader): + + # Extract labels + labels += targets[:, 1].tolist() + # Rescale target + targets[:, 2:] = xywh2xyxy(targets[:, 2:]) + targets[:, 2:] *= img_size + + + with torch.no_grad(): + imgs = imgs.to(device) + outputs = model(imgs) + outputs = non_max_suppression(outputs, conf_thres=conf_thres, nms_thres=nms_thres) + + sample_metrics += get_batch_statistics(outputs, targets, iou_threshold=iou_thres) + + # Concatenate sample statistics + true_positives, pred_scores, pred_labels = [np.concatenate(x, 0) for x in list(zip(*sample_metrics))] + precision, recall, AP, f1, ap_class = ap_per_class(true_positives, pred_scores, pred_labels, labels) + + return precision, recall, AP, f1, ap_class + + + + +############################# +# YOLOv3 part + +def to_cpu(tensor): + return tensor.detach().cpu() + +def load_classes(path): + """ + Loads class labels at 'path' + """ + fp = open(path, "r", encoding = 'utf-8') + names = fp.read().split("\n")[:-1] + return names + +def parse_model_config(path): + """Parses the yolo-v3 layer configuration file and returns module definitions""" + file = open(path, 'r', encoding = 'utf-8') + lines = file.read().split('\n') + lines = [x for x in lines if x and not x.startswith('#')] + lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces + module_defs = [] + for line in lines: + if line.startswith('['): # This marks the start of a new block + module_defs.append({}) + module_defs[-1]['type'] = line[1:-1].rstrip() + if module_defs[-1]['type'] == 'convolutional': + module_defs[-1]['batch_normalize'] = 0 + else: + key, value = line.split("=") + value = value.strip() + module_defs[-1][key.rstrip()] = value.strip() + + return module_defs + +def pad_to_square(img, pad_value): + c, h, w = img.shape + dim_diff = np.abs(h - w) + # (upper / left) padding and (lower / right) padding + pad1, pad2 = dim_diff // 2, dim_diff - dim_diff // 2 + # Determine padding + pad = (0, 0, pad1, pad2) if h <= w else (pad1, pad2, 0, 0) + # Add padding + img = F.pad(img, pad, "constant", value=pad_value) + + return img, pad + +def resize(image, size): + image = F.interpolate(image.unsqueeze(0), size=size, mode="nearest").squeeze(0) + return image + +def horisontal_flip(images, targets): + images = torch.flip(images, [-1]) + targets[:, 2] = 1 - targets[:, 2] + return images, targets + +def ap_per_class(tp, conf, pred_cls, target_cls): + """ Compute the average precision, given the recall and precision curves. + Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (list). + conf: Objectness value from 0-1 (list). + pred_cls: Predicted object classes (list). + target_cls: True object classes (list). 
+ # Returns + The average precision as computed in py-faster-rcnn. + """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(target_cls) + + # Create Precision-Recall curve and compute AP for each class + ap, p, r = [], [], [] + for c in unique_classes: + i = pred_cls == c + n_gt = (target_cls == c).sum() # Number of ground truth objects + n_p = i.sum() # Number of predicted objects + + if n_p == 0 and n_gt == 0: + continue + elif n_p == 0 or n_gt == 0: + ap.append(0) + r.append(0) + p.append(0) + else: + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum() + tpc = (tp[i]).cumsum() + + # Recall + recall_curve = tpc / (n_gt + 1e-16) + r.append(recall_curve[-1]) + + # Precision + precision_curve = tpc / (tpc + fpc) + p.append(precision_curve[-1]) + + # AP from recall-precision curve + ap.append(compute_ap(recall_curve, precision_curve)) + + # Compute F1 score (harmonic mean of precision and recall) + p, r, ap = np.array(p), np.array(r), np.array(ap) + f1 = 2 * p * r / (p + r + 1e-16) + + return p, r, ap, f1, unique_classes.astype("int32") + +def compute_ap(recall, precision): + """ Compute the average precision, given the recall and precision curves. + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. + """ + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.0], recall, [1.0])) + mpre = np.concatenate(([0.0], precision, [0.0])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + +def get_batch_statistics(outputs, targets, iou_threshold): + """ Compute true positives, predicted scores and predicted labels per sample """ + batch_metrics = [] + for sample_i in range(len(outputs)): + + if outputs[sample_i] is None: + continue + + output = outputs[sample_i] + pred_boxes = output[:, :4] + pred_scores = output[:, 4] + pred_labels = output[:, -1] + + true_positives = np.zeros(pred_boxes.shape[0]) + + annotations = targets[targets[:, 0] == sample_i][:, 1:] + target_labels = annotations[:, 0] if len(annotations) else [] + if len(annotations): + detected_boxes = [] + target_boxes = annotations[:, 1:] + + for pred_i, (pred_box, pred_label) in enumerate(zip(pred_boxes, pred_labels)): + + # If targets are found break + if len(detected_boxes) == len(annotations): + break + + # Ignore if label is not one of the target labels + if pred_label not in target_labels: + continue + + iou, box_index = bbox_iou(pred_box.unsqueeze(0), target_boxes).max(0) + if iou >= iou_threshold and box_index not in detected_boxes: + true_positives[pred_i] = 1 + detected_boxes += [box_index] + batch_metrics.append([true_positives, pred_scores, pred_labels]) + return batch_metrics + +def bbox_wh_iou(wh1, wh2): + wh2 = wh2.t() + w1, h1 = wh1[0], wh1[1] + w2, h2 = wh2[0], wh2[1] + inter_area = torch.min(w1, w2) * torch.min(h1, h2) + union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area + return inter_area / union_area + +def bbox_iou(box1, box2, x1y1x2y2=True): + """ + Returns the IoU of two bounding 
boxes + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # get the corrdinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1, b2_x1) + inter_rect_y1 = torch.max(b1_y1, b2_y1) + inter_rect_x2 = torch.min(b1_x2, b2_x2) + inter_rect_y2 = torch.min(b1_y2, b2_y2) + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * torch.clamp( + inter_rect_y2 - inter_rect_y1 + 1, min=0 + ) + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + +def weights_init_normal(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + torch.nn.init.normal_(m.weight.data, 0.0, 0.02) + elif classname.find("BatchNorm2d") != -1: + torch.nn.init.normal_(m.weight.data, 1.0, 0.02) + torch.nn.init.constant_(m.bias.data, 0.0) + +def rescale_boxes(boxes, current_dim, original_shape): + """ Rescales bounding boxes to the original shape """ + orig_h, orig_w = original_shape + # The amount of padding that was added + pad_x = max(orig_h - orig_w, 0) * (current_dim / max(original_shape)) + pad_y = max(orig_w - orig_h, 0) * (current_dim / max(original_shape)) + # Image height and width after padding is removed + unpad_h = current_dim - pad_y + unpad_w = current_dim - pad_x + # Rescale bounding boxes to dimension of original image + boxes[:, 0] = ((boxes[:, 0] - pad_x // 2) / unpad_w) * orig_w + boxes[:, 1] = ((boxes[:, 1] - pad_y // 2) / unpad_h) * orig_h + boxes[:, 2] = ((boxes[:, 2] - pad_x // 2) / unpad_w) * orig_w + boxes[:, 3] = ((boxes[:, 3] - pad_y // 2) / unpad_h) * orig_h + return boxes + +def xywh2xyxy(x): + y = x.new(x.shape) + y[..., 0] = x[..., 0] - x[..., 2] / 2 + y[..., 1] = x[..., 1] - x[..., 3] / 2 + y[..., 2] = x[..., 0] + x[..., 2] / 2 + y[..., 3] = x[..., 1] + x[..., 3] / 2 + return y + +def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4): + """ + Removes detections with lower object confidence score than 'conf_thres' and performs + Non-Maximum Suppression to further filter detections. 
+ Returns detections with shape: + (x1, y1, x2, y2, object_conf, class_score, class_pred) + """ + + # From (center x, center y, width, height) to (x1, y1, x2, y2) + prediction[..., :4] = xywh2xyxy(prediction[..., :4]) + output = [None for _ in range(len(prediction))] + for image_i, image_pred in enumerate(prediction): + # Filter out confidence scores below threshold + image_pred = image_pred[image_pred[:, 4] >= conf_thres] + # If none are remaining => process next image + if not image_pred.size(0): + continue + # Object confidence times class confidence + score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0] + # Sort by it + image_pred = image_pred[(-score).argsort()] + class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True) + detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1) + # Perform non-maximum suppression + keep_boxes = [] + while detections.size(0): + large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres + label_match = detections[0, -1] == detections[:, -1] + # Indices of boxes with lower confidence scores, large IOUs and matching labels + invalid = large_overlap & label_match + weights = detections[invalid, 4:5] + # Merge overlapping bboxes by order of confidence + detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum() + keep_boxes += [detections[0]] + detections = detections[~invalid] + if keep_boxes: + output[image_i] = torch.stack(keep_boxes) + + return output + +def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres): + + # ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor + ByteTensor = torch.cuda.BoolTensor if pred_boxes.is_cuda else torch.BoolTensor + FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor + + nB = pred_boxes.size(0) + nA = pred_boxes.size(1) + nC = pred_cls.size(-1) + nG = pred_boxes.size(2) + + # Output tensors + obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0) + noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1) + class_mask = FloatTensor(nB, nA, nG, nG).fill_(0) + iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0) + tx = FloatTensor(nB, nA, nG, nG).fill_(0) + ty = FloatTensor(nB, nA, nG, nG).fill_(0) + tw = FloatTensor(nB, nA, nG, nG).fill_(0) + th = FloatTensor(nB, nA, nG, nG).fill_(0) + tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0) + + + # # note: solver multi gpu problem + # target = target[target.sum(dim=1) != 0] + + # Convert to position relative to box + target_boxes = target[:, 2:6] * nG + gxy = target_boxes[:, :2] + gwh = target_boxes[:, 2:] + # Get anchors with best iou + ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors]) + best_ious, best_n = ious.max(0) + # Separate target values + b, target_labels = target[:, :2].long().t() + gx, gy = gxy.t() + gw, gh = gwh.t() + gi, gj = gxy.long().t() + + # prevent index out of boundary + gi = gi.clamp(0, nG - 1) + gj = gj.clamp(0, nG - 1) + + # Set masks + obj_mask[b, best_n, gj, gi] = 1 + noobj_mask[b, best_n, gj, gi] = 0 + + # Set noobj mask to zero where iou exceeds ignore threshold + for i, anchor_ious in enumerate(ious.t()): + noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0 + + # Coordinates + tx[b, best_n, gj, gi] = gx - gx.floor() + ty[b, best_n, gj, gi] = gy - gy.floor() + # Width and height + tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16) + th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16) + # One-hot encoding of label + tcls[b, best_n, gj, gi, 
target_labels] = 1 + # Compute label correctness and iou at best anchor + class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float() + iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False) + + tconf = obj_mask.float() + return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf + +def create_modules(module_defs): + """ + Constructs module list of layer blocks from module configuration in module_defs + """ + hyperparams = module_defs.pop(0) + output_filters = [int(hyperparams["channels"])] + module_list = nn.ModuleList() + for module_i, module_def in enumerate(module_defs): + modules = nn.Sequential() + + if module_def["type"] == "convolutional": + bn = int(module_def["batch_normalize"]) + filters = int(module_def["filters"]) + kernel_size = int(module_def["size"]) + pad = (kernel_size - 1) // 2 + modules.add_module( + f"conv_{module_i}", + nn.Conv2d( + in_channels=output_filters[-1], + out_channels=filters, + kernel_size=kernel_size, + stride=int(module_def["stride"]), + padding=pad, + bias=not bn, + ), + ) + if bn: + modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5)) + if module_def["activation"] == "leaky": + modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1)) + + elif module_def["type"] == "maxpool": + kernel_size = int(module_def["size"]) + stride = int(module_def["stride"]) + if kernel_size == 2 and stride == 1: + modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1))) + maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2)) + modules.add_module(f"maxpool_{module_i}", maxpool) + + elif module_def["type"] == "upsample": + upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest") + modules.add_module(f"upsample_{module_i}", upsample) + + elif module_def["type"] == "route": + layers = [int(x) for x in module_def["layers"].split(",")] + filters = sum([output_filters[1:][i] for i in layers]) + modules.add_module(f"route_{module_i}", EmptyLayer()) + + elif module_def["type"] == "shortcut": + filters = output_filters[1:][int(module_def["from"])] + modules.add_module(f"shortcut_{module_i}", EmptyLayer()) + + elif module_def["type"] == "yolo": + anchor_idxs = [int(x) for x in module_def["mask"].split(",")] + # Extract anchors + anchors = [int(x) for x in module_def["anchors"].split(",")] + anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] + anchors = [anchors[i] for i in anchor_idxs] + num_classes = int(module_def["classes"]) + img_size = int(hyperparams["height"]) + # Define detection layer + yolo_layer = YOLOLayer(anchors, num_classes, img_size) + modules.add_module(f"yolo_{module_i}", yolo_layer) + # Register module list and number of output filters + module_list.append(modules) + output_filters.append(filters) + + return hyperparams, module_list + + +class Upsample(nn.Module): + """ nn.Upsample is deprecated """ + + def __init__(self, scale_factor, mode="nearest"): + super(Upsample, self).__init__() + self.scale_factor = scale_factor + self.mode = mode + + def forward(self, x): + x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode) + return x + + +class EmptyLayer(nn.Module): + """Placeholder for 'route' and 'shortcut' layers""" + + def __init__(self): + super(EmptyLayer, self).__init__() + + +class YOLOLayer(nn.Module): + """Detection layer""" + + def __init__(self, anchors, num_classes, img_dim=416): + 
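+        # Box decoding applied later in forward(): for an anchor at grid cell (cx, cy),
+        #   bx = sigmoid(tx) + cx,      by = sigmoid(ty) + cy,
+        #   bw = anchor_w * exp(tw),    bh = anchor_h * exp(th),
+        # all expressed in grid units and then multiplied by the stride to map back to pixels.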
super(YOLOLayer, self).__init__() + self.anchors = anchors + self.num_anchors = len(anchors) + self.num_classes = num_classes + self.ignore_thres = 0.5 + self.mse_loss = nn.MSELoss() + self.bce_loss = nn.BCELoss() + self.obj_scale = 1 + self.noobj_scale = 100 + self.metrics = {} + self.img_dim = img_dim + self.grid_size = 0 # grid size + + def compute_grid_offsets(self, grid_size, cuda=True): + self.grid_size = grid_size + g = self.grid_size + FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor + self.stride = self.img_dim / self.grid_size + # Calculate offsets for each grid + self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor) + self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor) + self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors]) + self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1)) + self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1)) + + def forward(self, x, targets=None, img_dim=None): + + # Tensors for cuda support + FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor + LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor + ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor + + self.img_dim = img_dim + num_samples = x.size(0) + grid_size = x.size(2) + + prediction = ( + x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size) + .permute(0, 1, 3, 4, 2) + .contiguous() + ) + + # Get outputs + x = torch.sigmoid(prediction[..., 0]) # Center x + y = torch.sigmoid(prediction[..., 1]) # Center y + w = prediction[..., 2] # Width + h = prediction[..., 3] # Height + pred_conf = torch.sigmoid(prediction[..., 4]) # Conf + pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. + + # If grid size does not match current we compute new offsets + if grid_size != self.grid_size: + self.compute_grid_offsets(grid_size, cuda=x.is_cuda) + + # Add offset and scale with anchors + pred_boxes = FloatTensor(prediction[..., :4].shape) + pred_boxes[..., 0] = x.data + self.grid_x + pred_boxes[..., 1] = y.data + self.grid_y + pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w + pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h + + output = torch.cat( + ( + pred_boxes.view(num_samples, -1, 4) * self.stride, + pred_conf.view(num_samples, -1, 1), + pred_cls.view(num_samples, -1, self.num_classes), + ), + -1, + ) + + if targets is None: + return output, 0 + else: + iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets( + pred_boxes=pred_boxes, + pred_cls=pred_cls, + target=targets, + anchors=self.scaled_anchors, + ignore_thres=self.ignore_thres, + ) + + # Loss : Mask outputs to ignore non-existing objects (except with conf. 
loss) + loss_x = self.mse_loss(x[obj_mask], tx[obj_mask]) + loss_y = self.mse_loss(y[obj_mask], ty[obj_mask]) + loss_w = self.mse_loss(w[obj_mask], tw[obj_mask]) + loss_h = self.mse_loss(h[obj_mask], th[obj_mask]) + loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask]) + loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask]) + loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj + loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask]) + total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + + # Metrics + cls_acc = 100 * class_mask[obj_mask].mean() + conf_obj = pred_conf[obj_mask].mean() + conf_noobj = pred_conf[noobj_mask].mean() + conf50 = (pred_conf > 0.5).float() + iou50 = (iou_scores > 0.5).float() + iou75 = (iou_scores > 0.75).float() + detected_mask = conf50 * class_mask * tconf + precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16) + recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16) + recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16) + + self.metrics = { + "loss": to_cpu(total_loss).item(), + "x": to_cpu(loss_x).item(), + "y": to_cpu(loss_y).item(), + "w": to_cpu(loss_w).item(), + "h": to_cpu(loss_h).item(), + "conf": to_cpu(loss_conf).item(), + "cls": to_cpu(loss_cls).item(), + "cls_acc": to_cpu(cls_acc).item(), + "recall50": to_cpu(recall50).item(), + "recall75": to_cpu(recall75).item(), + "precision": to_cpu(precision).item(), + "conf_obj": to_cpu(conf_obj).item(), + "conf_noobj": to_cpu(conf_noobj).item(), + "grid_size": grid_size, + } + + return output, total_loss + + +class Darknet(nn.Module): + """YOLOv3 object detection model""" + + def __init__(self, config_path, img_size=416): + super(Darknet, self).__init__() + self.module_defs = parse_model_config(config_path) + self.hyperparams, self.module_list = create_modules(self.module_defs) + self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")] + self.img_size = img_size + self.seen = 0 + self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32) + + def forward(self, x, targets=None): + img_dim = x.shape[2] + loss = 0 + layer_outputs, yolo_outputs = [], [] + for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + if module_def["type"] in ["convolutional", "upsample", "maxpool"]: + x = module(x) + elif module_def["type"] == "route": + x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1) + elif module_def["type"] == "shortcut": + layer_i = int(module_def["from"]) + x = layer_outputs[-1] + layer_outputs[layer_i] + elif module_def["type"] == "yolo": + x, layer_loss = module[0](x, targets, img_dim) + loss += layer_loss + yolo_outputs.append(x) + layer_outputs.append(x) + yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1)) + return yolo_outputs if targets is None else (loss, yolo_outputs) + + def load_darknet_weights(self, weights_path): + """Parses and loads the weights stored in 'weights_path'""" + + # Open the weights file + with open(weights_path, "rb") as f: + header = np.fromfile(f, dtype=np.int32, count=5) # First five are header values + self.header_info = header # Needed to write header when saving weights + self.seen = header[3] # number of images seen during training + weights = np.fromfile(f, dtype=np.float32) # The rest are weights + + # Establish cutoff for loading backbone weights + cutoff = None + if "darknet53.conv.74" in weights_path: + cutoff = 75 + + ptr = 0 + 
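+        # Layout of the darknet weight file, as consumed by the loop below:
+        #   - 5 x int32 header (header[3] = number of images seen during training),
+        #   - then one flat float32 array holding, for every convolutional block in
+        #     module_defs order:
+        #       * with batch_normalize: BN bias, BN weight, BN running mean,
+        #         BN running variance, then the conv weights;
+        #       * without batch_normalize: conv bias, then the conv weights.
+        # `ptr` walks through the flat array and each slice is reshaped with
+        # view_as() to match the corresponding module parameter.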
for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + if i == cutoff: + break + if module_def["type"] == "convolutional": + conv_layer = module[0] + if module_def["batch_normalize"]: + # Load BN bias, weights, running mean and running variance + bn_layer = module[1] + num_b = bn_layer.bias.numel() # Number of biases + # Bias + bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias) + bn_layer.bias.data.copy_(bn_b) + ptr += num_b + # Weight + bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight) + bn_layer.weight.data.copy_(bn_w) + ptr += num_b + # Running Mean + bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean) + bn_layer.running_mean.data.copy_(bn_rm) + ptr += num_b + # Running Var + bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var) + bn_layer.running_var.data.copy_(bn_rv) + ptr += num_b + else: + # Load conv. bias + num_b = conv_layer.bias.numel() + conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias) + conv_layer.bias.data.copy_(conv_b) + ptr += num_b + # Load conv. weights + num_w = conv_layer.weight.numel() + conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight) + conv_layer.weight.data.copy_(conv_w) + ptr += num_w + + def save_darknet_weights(self, path, cutoff=-1): + """ + @:param path - path of the new weights file + @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) + """ + fp = open(path, "wb") + self.header_info[3] = self.seen + self.header_info.tofile(fp) + + # Iterate through layers + for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): + if module_def["type"] == "convolutional": + conv_layer = module[0] + # If batch norm, load bn first + if module_def["batch_normalize"]: + bn_layer = module[1] + bn_layer.bias.data.cpu().numpy().tofile(fp) + bn_layer.weight.data.cpu().numpy().tofile(fp) + bn_layer.running_mean.data.cpu().numpy().tofile(fp) + bn_layer.running_var.data.cpu().numpy().tofile(fp) + # Load conv bias + else: + conv_layer.bias.data.cpu().numpy().tofile(fp) + # Load conv weights + conv_layer.weight.data.cpu().numpy().tofile(fp) + + fp.close() + + +class ListDataset(Dataset): + def __init__(self, list_path, img_size=416, augment=True, multiscale=True, normalized_labels=True): + with open(list_path, "r") as file: + self.img_files = file.readlines() + + self.label_files = [ + # path.replace("images", "labels").replace(".png", ".txt").replace(".jpg", ".txt") + # just remove the postfix '.txt' + path.replace("/images/", "/labels/").rstrip() + '.txt' + for path in self.img_files + ] + self.img_size = img_size + self.max_objects = 100 + self.augment = augment + self.multiscale = multiscale + self.normalized_labels = normalized_labels + self.min_size = self.img_size - 3 * 32 + self.max_size = self.img_size + 3 * 32 + self.batch_count = 0 + + def __getitem__(self, index): + + # --------- + # Image + # --------- + + img_path = self.img_files[index % len(self.img_files)].rstrip() + + # Extract image as PyTorch tensor + img = transforms.ToTensor()(Image.open(img_path).convert('RGB')) + + # Handle images with less than three channels + if len(img.shape) != 3: + img = img.unsqueeze(0) + img = img.expand((3, img.shape[1:])) + + _, h, w = img.shape + h_factor, w_factor = (h, w) if self.normalized_labels else (1, 1) + # Pad to square resolution + img, pad = pad_to_square(img, 0) + _, padded_h, padded_w = img.shape + + # 
--------- + # Label + # --------- + + label_path = self.label_files[index % len(self.img_files)].rstrip() + + targets = None + if os.path.exists(label_path): + boxes = torch.from_numpy(np.loadtxt(label_path).reshape(-1, 5)) + # Extract coordinates for unpadded + unscaled image + x1 = w_factor * (boxes[:, 1] - boxes[:, 3] / 2) + y1 = h_factor * (boxes[:, 2] - boxes[:, 4] / 2) + x2 = w_factor * (boxes[:, 1] + boxes[:, 3] / 2) + y2 = h_factor * (boxes[:, 2] + boxes[:, 4] / 2) + # Adjust for added padding + x1 += pad[0] + y1 += pad[2] + x2 += pad[1] + y2 += pad[3] + # Returns (x, y, w, h) + boxes[:, 1] = ((x1 + x2) / 2) / padded_w + boxes[:, 2] = ((y1 + y2) / 2) / padded_h + boxes[:, 3] *= w_factor / padded_w + boxes[:, 4] *= h_factor / padded_h + + targets = torch.zeros((len(boxes), 6)) + targets[:, 1:] = boxes + + # Apply augmentations + if self.augment: + if np.random.random() < 0.5: + img, targets = horisontal_flip(img, targets) + + return img_path, img, targets + + def collate_fn(self, batch): + paths, imgs, targets = list(zip(*batch)) + # Remove empty placeholder targets + targets = [boxes for boxes in targets if boxes is not None] + # Add sample index to targets + for i, boxes in enumerate(targets): + boxes[:, 0] = i + targets = torch.cat(targets, 0) + # Selects new image size every tenth batch + if self.multiscale and self.batch_count % 10 == 0: + self.img_size = random.choice(range(self.min_size, self.max_size + 1, 32)) + # Resize images to input shape + imgs = torch.stack([resize(img, self.img_size) for img in imgs]) + self.batch_count += 1 + return paths, imgs, targets + + def __len__(self): + return len(self.img_files) + + +if __name__ == "__main__": + os.environ['WORKDIR_PATH'] = os.getcwd() + os.environ['PARAMS_DIR_PATH'] = os.getcwd() + + test_img_paths = [ + '/home/jiahua/food_all/三明治/99_sanmingzhi.jpg', + '/home/jiahua/food_all/三明治/0_sanmingzhi.jpg' + ] + imgs = dataset.load_images(test_img_paths) + + # # forkcloud format test + # imgs = [[np.array(imgs[0]).tolist()]] + + test_model_class(model_file_path=__file__, + model_class='FoodlgNet', + task='IMAGE_DETECTION', + dependencies={}, + train_dataset_path='/home/jiahua/singa-local/dataset.zip', + val_dataset_path='/home/jiahua/singa-local/dataset.zip', + train_args={}, + queries=imgs) diff --git a/examples/models/image_object_detection/onnx_tiny_yolov2.py b/examples/models/image_object_detection/onnx_tiny_yolov2.py index 2e20c447..2ac0f6d2 100644 --- a/examples/models/image_object_detection/onnx_tiny_yolov2.py +++ b/examples/models/image_object_detection/onnx_tiny_yolov2.py @@ -24,7 +24,7 @@ from singa_auto.model import BaseModel, utils from singa_auto.constants import ModelDependency -from singa_auto.model.dev import make_predictions, _check_model_class, _print_header, _check_dependencies, inform_user +from singa_auto.model.dev import make_predictions_json, _check_model_class, _print_header, _check_dependencies, inform_user from singa_auto.model.utils import load_model_class from singa_auto.advisor.constants import Proposal, ParamsType @@ -215,7 +215,7 @@ def softmax(x): proposal = Proposal(trial_no=0, knobs={}, params_type=ParamsType.LOCAL_RECENT) - (predictions, model_inst) = make_predictions(queries, task, + (predictions, model_inst) = make_predictions_json(queries, task, py_model_class, proposal, fine_tune_dataset_path=None, diff --git a/examples/models/image_segmentation/PyPandaResUnet.py b/examples/models/image_segmentation/PyPandaResUnet.py new file mode 100644 index 00000000..e1e01d70 --- /dev/null +++ 
b/examples/models/image_segmentation/PyPandaResUnet.py @@ -0,0 +1,549 @@ +import os + +os.environ['CUDA_VISIBLE_DEVICES'] = "0" + +import sys +sys.path.append(os.getcwd()) + + +import base64 +import json +import logging +import os +import tempfile +import zipfile +from collections.abc import Sequence +from copy import deepcopy +from io import BytesIO +from typing import List +from glob import glob + +import cv2 +import numpy as np +import PIL +import torch +import torch.nn as nn +import torchvision +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.sampler import RandomSampler +from torchvision import models +from torchvision.transforms import functional as F +from torchvision.transforms.transforms import Pad, Resize +from tqdm import tqdm + +from singa_auto.model import SegmentationModel, CategoricalKnob, FixedKnob, utils +from singa_auto.model.knob import BaseKnob +# from singa_auto.utils.metrics import do_kaggle_metric + +from singa_auto.datasets.image_segmentation_dataset import * + + +# define model +def convrelu(in_channels, out_channels, kernel, padding): + return nn.Sequential( + nn.Conv2d(in_channels, out_channels, kernel, padding=padding), + nn.ReLU(inplace=True), + ) + + +class ResNetUNet(nn.Module): + def __init__(self, n_class): + super().__init__() + + self.base_model = models.resnet18(pretrained=False) + self.base_layers = list(self.base_model.children()) + + self.layer0 = nn.Sequential(*self.base_layers[:3]) # size=(N, 64, x.H/2, x.W/2) + self.layer0_1x1 = convrelu(64, 64, 1, 0) + self.layer1 = nn.Sequential(*self.base_layers[3:5]) # size=(N, 64, x.H/4, x.W/4) + self.layer1_1x1 = convrelu(64, 64, 1, 0) + self.layer2 = self.base_layers[5] # size=(N, 128, x.H/8, x.W/8) + self.layer2_1x1 = convrelu(128, 128, 1, 0) + self.layer3 = self.base_layers[6] # size=(N, 256, x.H/16, x.W/16) + self.layer3_1x1 = convrelu(256, 256, 1, 0) + self.layer4 = self.base_layers[7] # size=(N, 512, x.H/32, x.W/32) + self.layer4_1x1 = convrelu(512, 512, 1, 0) + + self.upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) + + self.conv_up3 = convrelu(256 + 512, 512, 3, 1) + self.conv_up2 = convrelu(128 + 512, 256, 3, 1) + self.conv_up1 = convrelu(64 + 256, 256, 3, 1) + self.conv_up0 = convrelu(64 + 256, 128, 3, 1) + + self.conv_original_size0 = convrelu(3, 64, 3, 1) + self.conv_original_size1 = convrelu(64, 64, 3, 1) + self.conv_original_size2 = convrelu(64 + 128, 64, 3, 1) + + self.conv_last = nn.Conv2d(64, n_class, 1) + + def forward(self, input): + x_original = self.conv_original_size0(input) + x_original = self.conv_original_size1(x_original) + + layer0 = self.layer0(input) + layer1 = self.layer1(layer0) + layer2 = self.layer2(layer1) + layer3 = self.layer3(layer2) + layer4 = self.layer4(layer3) + + layer4 = self.layer4_1x1(layer4) + x = self.upsample(layer4) + layer3 = self.layer3_1x1(layer3) + x = torch.cat([x, layer3], dim=1) + x = self.conv_up3(x) + + x = self.upsample(x) + layer2 = self.layer2_1x1(layer2) + x = torch.cat([x, layer2], dim=1) + x = self.conv_up2(x) + + x = self.upsample(x) + layer1 = self.layer1_1x1(layer1) + x = torch.cat([x, layer1], dim=1) + x = self.conv_up1(x) + + x = self.upsample(x) + layer0 = self.layer0_1x1(layer0) + x = torch.cat([x, layer0], dim=1) + x = self.conv_up0(x) + + x = self.upsample(x) + x = torch.cat([x, x_original], dim=1) + x = self.conv_original_size2(x) + + out = self.conv_last(x) + + return out + + +# pre-process: resize image to the target scale keeping aspect ratio then pad to square 
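+# Worked example (illustrative numbers): with target_length=512, an 800x600 (w x h)
+# image is first resized to 512x384 so the aspect ratio is kept, then padded with
+# 64 rows on top and 64 rows at the bottom to reach 512x512. The pad fill value of
+# 255 matches the ignore_index knob used below, so padded pixels in a mask are
+# excluded from the loss.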
+class ResizeSquarePad(Resize, Pad): + def __init__(self, target_length, interpolation_strategy): + if not isinstance(target_length, (int, Sequence)): + raise TypeError("Size should be int or sequence. Got {}".format(type(target_length))) + if isinstance(target_length, Sequence) and len(target_length) not in (1, 2): + raise ValueError("If size is a sequence, it should have 1 or 2 values") + + self.target_length = target_length + self.interpolation_strategy = interpolation_strategy + Resize.__init__(self, size=(512, 512), interpolation=self.interpolation_strategy) + Pad.__init__(self, padding=(0,0,0,0), fill=255, padding_mode="constant") + + + def __call__(self, img): + w, h = img.size + if w > h: + self.size = (int(np.round(self.target_length * (h / w))), self.target_length) + img = Resize.__call__(self, img) + + total_pad = self.size[1] - self.size[0] + half_pad = total_pad // 2 + self.padding = (0, half_pad, 0, total_pad - half_pad) + return Pad.__call__(self, img) + else: + self.size = (self.target_length, int(np.round(self.target_length * (w / h)))) + img = Resize.__call__(self, img) + + total_pad = self.size[0] - self.size[1] + half_pad = total_pad // 2 + self.padding = (half_pad, 0, total_pad - half_pad, 0) + return Pad.__call__(self, img) + + +logger = logging.getLogger(__name__) + + +# main process procedure +class PyPandaResUnet(SegmentationModel): + ''' + train UNet + ''' + @staticmethod + def get_knob_config(): + return { + # hyper parameters + "lr": FixedKnob(1e-4), + "ignore_index": FixedKnob(255), + "batch_size": FixedKnob(4), + "epoch": FixedKnob(2), + + # application parameters + # "num_classes": FixedKnob(1), + "fine_size": FixedKnob(512), + + } + + + def __init__(self, **knobs): + super().__init__(**knobs) + self._knobs = knobs + + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + print("self.device", self.device) + logger.info(self.device) + + self.model = None + + self.fine_size = self._knobs.get("fine_size") + + # define preprocessing procedure + self.transform_img = torchvision.transforms.Compose([ + ResizeSquarePad(self.fine_size, Image.BILINEAR), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225]) + ]) + + self.transform_mask = torchvision.transforms.Compose([ + ResizeSquarePad(self.fine_size, Image.NEAREST) + ]) + + + def train(self, dataset_path, **kwargs): + # hyper parameters + self.batch_size = self._knobs.get("batch_size") + self.epoch = self._knobs.get("epoch") + snapshot = 2 + + self.lr = self._knobs.get("lr") + self.ignore_index = self._knobs.get("ignore_index") + + logger.info("Training params: {}".format(json.dumps(kwargs))) + + + # extract uploaded zipfile + dataset_zipfile = zipfile.ZipFile(dataset_path, 'r') + + train_folder = tempfile.TemporaryDirectory() + folder_name = train_folder.name + dataset_zipfile.extractall(path=folder_name) + + # load train params from zipfile + with open(os.path.join(folder_name, 'param.json'),'r') as load_f: + load_dict = json.load(load_f) + self.num_classes = load_dict["num_classes"] if "num_classes" in list(load_dict.keys()) else 21 # default class number(21) is the same as voc2012 + + # load images from zipfile + if os.path.isdir(os.path.join(folder_name, "image")): + print("split train/val subsets...") + logger.info("split train/val subsets...") + image_train, mask_train, image_val, mask_val = ImageFetch(folder_name) + self.num_image = len(image_train) + print("Total training images : ", self.num_image) + 
logger.info(f"Total training images : {self.num_image}") + elif os.path.isdir(os.path.join(folder_name, "train")): + print("directly load train/val datasets...") + logger.info("directly load train/val datasets...") + image_train, mask_train = trainImageFetch(folder_name) + image_val, mask_val = valImageFetch(folder_name) + self.num_image = len(image_train) + print("Total training images : ", self.num_image) + logger.info(f"Total training images : {self.num_image}") + else: + print("unsupported dataset format!") + logger.info("unsupported dataset format!") + + # load dataset + train_data = SegDataset(image_train, mask_train, self.transform_img, self.transform_mask) + val_data = SegDataset(image_val, mask_val, self.transform_img, self.transform_mask) + + logger.info("Training the model ResUNet using {}".format(self.device)) + print("Training the model ResUNet using {}".format(self.device)) + + # define training and validation data loaders + train_loader = DataLoader(train_data, + shuffle=RandomSampler(train_data), + batch_size=self.batch_size) + + val_loader = DataLoader(val_data, + shuffle=False, + batch_size=self.batch_size) + + # get the model using our helper function + self.model = ResNetUNet(self.num_classes) + self.model.to(self.device) + + self.criterion = nn.CrossEntropyLoss(ignore_index=self.ignore_index) + + self.optimizer_ft = torch.optim.Adam(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.lr) + self.exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer_ft, step_size=30, gamma=0.1) + + # start training + for epoch_ in range(self.epoch): + train_loss = self._train_one_epoch(train_loader, self.model) + val_loss, accuracy = self._evaluate(val_loader, self.model) + self.exp_lr_scheduler.step() + + print('epoch: {} train_loss: {:.3f} val_loss: {:.3f} val_accuracy: {:.3f}'.format(epoch_ + 1, train_loss, val_loss, accuracy)) + logger.info('epoch: {} train_loss: {:.3f} val_loss: {:.3f} val_accuracy: {:.3f}'.format(epoch_ + 1, train_loss, val_loss, accuracy)) + + + def _train_one_epoch(self, train_loader, model): + ''' + consider as a sub-train function inside singa-auto framework + ''' + running_loss = 0.0 + data_size = len(train_loader) + + model.train() + + for inputs, masks in tqdm(train_loader): + inputs, masks = inputs.to(self.device), masks.long().to(self.device) + self.optimizer_ft.zero_grad() + + logit = model(inputs) + + loss = self.criterion(logit, masks.squeeze(1)) # cross_entropy loss + loss.backward() + self.optimizer_ft.step() + running_loss += loss.item() * self.batch_size + + epoch_loss = running_loss / data_size + return epoch_loss + + def _evaluate(self, test_loader, model): + ''' + validation per epoch + ''' + running_loss = 0.0 + acc = 0.0 + data_size = len(test_loader) + + model.eval() + + with torch.no_grad(): + for inputs, masks in test_loader: + inputs, masks = inputs.to(self.device), masks.long().to(self.device) + + outputs = self.model(inputs) + + predict = torch.argmax(nn.Softmax(dim=1)(outputs), dim=1) # extract argmax as the final prediction + + # we do not consider the ignore_index + pure_mask = masks.masked_select(masks.ne(self.ignore_index)) + pure_predict = predict.masked_select(masks.ne(self.ignore_index)) + + acc += pure_mask.cpu().eq(pure_predict.cpu()).sum().item()/len(pure_mask) # find the correct piixels + + loss = self.criterion(outputs.squeeze(1), masks.squeeze(1)) + running_loss += loss.item() * inputs.size(0) + + epoch_loss = running_loss / data_size + accuracy = acc / data_size + return epoch_loss, accuracy + + 
+ def evaluate(self, val_dataset_path, **kwargs): + # extract validation datasets + dataset_zipfile = zipfile.ZipFile(val_dataset_path, 'r') + val_folder = tempfile.TemporaryDirectory() + dataset_zipfile.extractall(path=val_folder.name) + folder_name = val_folder.name + + if os.path.isdir(os.path.join(folder_name, "image")): + print("split train/val subsets...") + logger.info("split train/val subsets...") + image_train, mask_train, X_val, y_val = ImageFetch(folder_name) + self.num_image = len(X_val) + print("Total val images : ", self.num_image) + logger.info(f"Total val images : {self.num_image}") + elif os.path.isdir(os.path.join(folder_name, "train")): + print("directly load train/val datasets...") + logger.info("directly load train/val datasets...") + image_train, mask_train = trainImageFetch(folder_name) + X_val, y_val = valImageFetch(folder_name) + self.num_image = len(X_val) + print("Total val images : ", self.num_image) + logger.info(f"Total val images : {self.num_image}") + else: + print("unsupported dataset format!") + logger.info("unsupported dataset format!") + + val_data = SegDataset(X_val, y_val, self.transform_img, self.transform_mask) + + val_loader = DataLoader(val_data, + shuffle=False, + batch_size=4) + # compute MIoU metric(consider as accuracy) + temp_miou = {} + for i in range(self.num_classes): + temp_miou[i] = [0, 0.0] + + self.model.eval() + + with torch.no_grad(): + for inputs, masks in val_loader: + inputs, masks = inputs.to(self.device), masks.long().to(self.device) + + outputs = self.model(inputs) + + predict = torch.argmax(nn.Softmax(dim=1)(outputs), dim=1) + pure_mask = masks.masked_select(masks.ne(255)) + pure_predict = predict.masked_select(masks.ne(255)) + + for class_value in pure_mask.unique(): + valued_mask = pure_mask.masked_select(pure_mask.eq(class_value)) + real_len = len(valued_mask) + + valued_predict = pure_predict.masked_select(pure_mask.eq(class_value)) + cross_len = valued_mask.eq(valued_predict).sum().item() + + predict_len = len(pure_predict.masked_select(pure_predict.eq(class_value))) + + temp_miou[class_value.item()][1] += cross_len / (real_len + predict_len - cross_len) + temp_miou[class_value.item()][0] += 1 + + miou_overall = 0.0 + existed_classes = 0 + for key in temp_miou.keys(): + if temp_miou[key][0] != 0: + miou_overall += (temp_miou[key][1] / temp_miou[key][0]) + existed_classes += 1 + temp_miou['overall'] = [1, miou_overall / existed_classes] + + for key in temp_miou.keys(): + if temp_miou[key][0] != 0: + print(f"class {key} accuracy: {temp_miou[key][1] / temp_miou[key][0]}") + return temp_miou['overall'][1] + + + def dump_parameters(self): + params = {} + with tempfile.NamedTemporaryFile() as tmp: + # Save whole model to a tempfile + torch.save(self.model, tmp.name) + # Read from tempfile & encode it to base64 string + with open(tmp.name, 'rb') as f: + weight_base64 = f.read() + params['weight_base64'] = base64.b64encode(weight_base64).decode('utf-8') + return params + + + def load_parameters(self, params): + weight_base64 = params['weight_base64'] + + weight_base64_bytes = base64.b64decode(weight_base64.encode('utf-8')) + + self.model = torch.load(BytesIO(weight_base64_bytes), map_location=self.device) + + def _get_prediction(self, img): + + image = self.transform_img(img) + + image = image.to(self.device) + predict = self.model(image.unsqueeze(0)) + + predict = predict.squeeze(0) + predict = nn.Softmax(dim=0)(predict) + predict = torch.argmax(predict, dim=0) + + # transform result image into original size + w, h = img.size + 
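+        # Undo the square padding added by ResizeSquarePad: crop the padded rows
+        # (or columns) out of the fine_size x fine_size prediction, then resize the
+        # crop back to the original (w, h). Nearest-neighbour interpolation is used
+        # so that the integer class ids are preserved.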
if w > h: + re_h = int(np.round(self.fine_size * (h / w))) + total_pad = self.fine_size - re_h + half_pad = total_pad // 2 + out = predict[half_pad : half_pad + re_h, :] + else: + re_w = int(np.round(self.fine_size * (w / h))) + total_pad = self.fine_size - re_w + half_pad = total_pad // 2 + out = predict[:, half_pad : half_pad + re_w] + + out = cv2.resize(out.cpu().numpy(), (w, h), interpolation=cv2.INTER_NEAREST) + + return out + + + + def predict(self, queries: List[PIL.Image.Image]) -> List[dict]: + + result = list() + + for idx, img in enumerate(queries): + res_raw = self._get_prediction(img) + + # add color palette (we follow the VOC2012 color map ant the max num_class is 21) + res_raw = res_raw.astype(np.uint8) + res = Image.fromarray(res_raw) + palette = [] + for i in range(256): + palette.extend((i, i, i)) + palette[:3*21] = np.array([[0, 0, 0], + [128, 0, 0], + [0, 128, 0], + [128, 128, 0], + [0, 0, 128], + [128, 0, 128], + [0, 128, 128], + [128, 128, 128], + [64, 0, 0], + [192, 0, 0], + [64, 128, 0], + [192, 128, 0], + [64, 0, 128], + [192, 0, 128], + [64, 128, 128], + [192, 128, 128], + [0, 64, 0], + [128, 64, 0], + [0, 192, 0], + [128, 192, 0], + [0, 64, 128] + ], dtype='uint8').flatten() + res.putpalette(palette) + + name = f"./query_{idx}.png" + res.save(name) + + result.append(name) + + return result + + +if __name__ == "__main__": + import argparse + + from singa_auto.model.dev import test_model_class + parser = argparse.ArgumentParser() + parser.add_argument('--train_path', + type=str, + default='/home/taomingyang/dataset/package/voc2012_mini.zip', + help='Path to train dataset') + parser.add_argument('--val_path', + type=str, + default='/home/taomingyang/dataset/package/voc2012_mini.zip', + help='Path to validation dataset') + + # parser.add_argument('--annotation_dataset_path', + # type=str, + # default='./dataset/voc2012/val2014.zip', + # help='Path to validation dataset') + + # parser.add_argument('--test_path', + # type=str, + # default='/hdd1/PennFudanPed.zip', + # help='Path to test dataset') + parser.add_argument('--query_path', + type=str, + default='/home/taomingyang/git/singa_auto_hub/examples/data/image_segmentaion/2007_000862.jpg,/home/taomingyang/git/singa_auto_hub/examples/data/image_segmentaion/2007_001397.jpg', + help='Path(s) to query image(s), delimited by commas') + + (args, _) = parser.parse_known_args() + + # print(args.query_path.split(',')) + + queries = utils.dataset.load_images(args.query_path.split(',')) + test_model_class(model_file_path=__file__, + model_class='PyPandaResUnet', + task='IMAGE_SEGMENTATION', + dependencies={"torch": "1.6.0+cu101", + "torchvision": "0.7.0+cu101", + "opencv-python": "4.4.0.46", + "tqdm": "4.28.0"}, + train_dataset_path=args.train_path, + val_dataset_path=args.val_path, + test_dataset_path=None, + train_args={"num_classes": 21}, + queries=queries) diff --git a/examples/models/image_segmentation/SaDeeplab.py b/examples/models/image_segmentation/SaDeeplab.py new file mode 100644 index 00000000..d829aafe --- /dev/null +++ b/examples/models/image_segmentation/SaDeeplab.py @@ -0,0 +1,714 @@ +import os +os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1, 2, 3" + +import sys +sys.path.append(os.getcwd()) + +import base64 +import json +import logging +import tempfile +import zipfile +from collections.abc import Sequence +from copy import deepcopy +from io import BytesIO +from typing import List +from glob import glob +from time import time +import requests +from singa_auto.model import BaseModel, CategoricalKnob, FixedKnob, utils 
+from singa_auto.model.knob import BaseKnob + + +import tensorflow as tf + +tf.random.set_seed(100) + +from tensorflow import keras +from tensorflow.keras import layers +import numpy as np +from tensorflow.keras.preprocessing.image import load_img +import random +import cv2 +from PIL import Image +import h5py +from tensorflow.python.keras.saving import hdf5_format + +class OxfordPets(keras.utils.Sequence): + """Helper to iterate over the data (as Numpy arrays).""" + + def __init__(self, batch_size, img_size, input_img_paths, target_img_paths): + self.batch_size = batch_size + self.img_size = img_size + self.input_img_paths = input_img_paths + self.target_img_paths = target_img_paths + + def __len__(self): + return len(self.target_img_paths) // self.batch_size + + def __getitem__(self, idx): + """Returns tuple (input, target) correspond to batch #idx.""" + i = idx * self.batch_size + batch_input_img_paths = self.input_img_paths[i : i + self.batch_size] + batch_target_img_paths = self.target_img_paths[i : i + self.batch_size] + x = np.zeros((self.batch_size,) + (self.img_size, self.img_size) + (3,), dtype="float32") + for j, path in enumerate(batch_input_img_paths): + img = load_img(path, target_size=(self.img_size, self.img_size)) + x[j] = img + # y = np.zeros((self.batch_size,) + self.img_size + (1,), dtype="uint8") + # for j, path in enumerate(batch_target_img_paths): + # img = load_img(path, target_size=self.img_size, color_mode="grayscale") + # y[j] = np.expand_dims(img, 2) + # # Ground truth labels are 1, 2, 3. Subtract one to make them 0, 1, 2: + # y[j] -= 1 + + y = np.zeros((self.batch_size,) + (self.img_size * self.img_size,) + (1,), dtype="uint8") + for j, path in enumerate(batch_target_img_paths): + img = load_img(path, target_size=(self.img_size, self.img_size), color_mode="grayscale") + img = np.array(img).flatten() + y[j] = np.expand_dims(img, 1) + # Ground truth labels are 1, 2, 3. 
Subtract one to make them 0, 1, 2: (already update dataset, this operation has expired) + # y[j] -= 1 + + sample_weight = np.zeros((self.batch_size,) + (self.img_size * self.img_size,), dtype="float32") + for k in range(self.batch_size): + unique_class = np.unique(y) + if len(unique_class): + class_weights = {class_id: 1.0 for class_id in unique_class} + class_weights[unique_class[-1]] = 0.0 + for yy in unique_class: + np.putmask(sample_weight[k], y[k]==yy, class_weights[yy]) + np.putmask(sample_weight[k], y[k]==unique_class[-1], class_weights[unique_class[-1]]) + + return x, y, sample_weight + + +def Bottleneck(input_shape, output_channels, stride=1, dilation=1): + ''' + a classic residual convolution module + ''' + inputs = tf.keras.Input(input_shape) + residual = inputs + + # residual conv branch + x = layers.Conv2D(output_channels, (1, 1), padding='same', use_bias=False)(residual) + x = layers.BatchNormalization()(x) + x = layers.ReLU()(x) + + x = layers.Conv2D(output_channels, (3, 3), strides=(stride, stride), padding='same', + dilation_rate=(dilation, dilation), use_bias=False)(x) + x = layers.BatchNormalization()(x) + x = layers.ReLU()(x) + + x = layers.Conv2D(output_channels * 4, (1, 1), padding='same', use_bias=False)(x) + x = layers.BatchNormalization()(x) + + # original branch + if stride != 1 or inputs.shape[-1] != x.shape[-1]: + residual = layers.Conv2D(output_channels * 4, (1, 1), padding='same', + strides=(stride, stride), use_bias=False)(residual) + residual = layers.BatchNormalization()(residual) + + # merge two branches + x = layers.Add()([x, residual]) + x = layers.ReLU()(x) + + # export model + return keras.Model(inputs=inputs, outputs=x) + + +def ResNetAtrous(layer_num=[3, 4, 6, 3], dilations=[1, 2, 1]): + ''' + an atrous conv version resnet50 model + ''' + inputs = keras.Input((None, None, 3)) + strides = [2, 1, 1] + + # conv + x = layers.Conv2D(64, (7, 7), (2, 2), padding='same', use_bias=False)(inputs) + x = layers.BatchNormalization()(x, training=False) + x = layers.ReLU()(x) + + # down-sampling + x = layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2), padding='same')(x) + + # resblock 1 + for i in range(layer_num[0]): + x = Bottleneck(x.shape[1:], 64, stride=1, dilation=1)(x) + low = x # save low level features + + # resblock 2 + for i in range(layer_num[1]): + x = Bottleneck(x.shape[1:], 128, stride=strides[0] if i == 0 else 1, dilation=1)(x) + + # resblock 3 + for i in range(layer_num[2]): + x = Bottleneck(x.shape[1:], 256, stride=strides[1] if i == 0 else 1, dilation=1)(x) + + # resblock 4 + for i in range(layer_num[3]): + x = Bottleneck(x.shape[1:], 512, stride=strides[2] if i == 0 else 1, dilation=dilations[i])(x) + high = x + + return keras.Model(inputs=inputs, outputs=(low, high)) + + +def ASPP(input_channels): + inputs = layers.Input((None, None, input_channels)) + + # global pooling + global_mean = layers.Lambda(lambda x: tf.math.reduce_mean(x, [1, 2], keepdims=True))(inputs) # size (b, 1, 1, c) + + global_mean = layers.Conv2D(256, (1, 1), padding='same', + kernel_initializer=tf.keras.initializers.he_normal(), use_bias=False)(global_mean) + global_mean = layers.BatchNormalization()(global_mean) + global_mean = layers.ReLU()(global_mean) # size (b, 1, 1, 256) + + global_mean = layers.Lambda(lambda x: tf.image.resize(x[0], (tf.shape(x[1])[1], tf.shape(x[1])[2]), + method=tf.image.ResizeMethod.BILINEAR))([global_mean, inputs]) # size (b, h, w, 256) + + # dilation with rate 1 + dilated_1 = layers.Conv2D(256, (1, 1), dilation_rate=1, padding='same', + 
kernel_initializer=tf.keras.initializers.he_normal(), use_bias=False)(inputs) + dilated_1 = layers.BatchNormalization()(dilated_1) + dilated_1 = layers.ReLU()(dilated_1) + + # dilation with rate 6 + dilated_6 = layers.Conv2D(256, (3, 3), dilation_rate=6, padding='same', + kernel_initializer=tf.keras.initializers.he_normal(), use_bias=False)(inputs) + dilated_6 = layers.BatchNormalization()(dilated_6) + dilated_6 = layers.ReLU()(dilated_6) + + # dilation with rate 12 + dilated_12 = layers.Conv2D(256, (3, 3), dilation_rate=12, padding='same', + kernel_initializer=tf.keras.initializers.he_normal(), use_bias=False)(inputs) + dilated_12 = layers.BatchNormalization()(dilated_12) + dilated_12 = layers.ReLU()(dilated_12) + + # dilation with rate 18 + dilated_18 = layers.Conv2D(256, (3, 3), dilation_rate=18, padding='same', + kernel_initializer=tf.keras.initializers.he_normal(), use_bias=False)(inputs) + dilated_18 = layers.BatchNormalization()(dilated_18) + dilated_18 = layers.ReLU()(dilated_18) + + # concate pyramid + x = layers.Concatenate(axis=-1)([global_mean, dilated_1, dilated_6, dilated_12, dilated_18]) + x = layers.Conv2D(256, (1, 1), padding='same', + kernel_initializer=tf.keras.initializers.he_normal(), use_bias=False)(x) + x = layers.BatchNormalization()(x) + x = layers.ReLU()(x) + + return keras.Model(inputs=inputs, outputs=x) + + +def DeepLabV3Plus(img_size, n_classes): + inputs = keras.Input(shape=img_size + (3,)) + # inputs = keras.Input((None, None, 3)) + + low, high = ResNetAtrous(layer_num=[3, 4, 6, 3], dilations=[1, 2, 1])(inputs) + + # modify low level feature channel number + low = layers.Conv2D(48, (1, 1), padding='same', + kernel_initializer=keras.initializers.he_normal(), use_bias=False)(low) + low = layers.BatchNormalization()(low) + low = layers.ReLU()(low) # size (b, h/4, w/4, 48) + + # pass high level feature into ASPP module + high = ASPP(high.shape[-1])(high) # size (b, h/8, w/8, 256) + high = layers.Lambda(lambda x: tf.image.resize(x[0], (tf.shape(x[1])[1], tf.shape(x[1])[2]), + method = tf.image.ResizeMethod.BILINEAR))([high, low]); + # concate and modify channel + x = layers.Concatenate(axis=-1)([high, low]) # size (b, h/4, w/4, 304) + + x = layers.Conv2D(256, (3, 3), padding='same', activation='relu', + kernel_initializer=keras.initializers.he_normal(), use_bias=False)(x) + x = layers.BatchNormalization()(x) + x = layers.ReLU()(x) + + x = layers.Conv2D(256, (3, 3), padding='same', activation='relu', + kernel_initializer=keras.initializers.he_normal(), use_bias=False)(x) + x = layers.BatchNormalization()(x) + x = layers.ReLU()(x) + + # upsampling + x = layers.Lambda(lambda x: tf.image.resize(x[0], (tf.shape(x[1])[1], tf.shape(x[1])[2]), + method = tf.image.ResizeMethod.BILINEAR))([x, inputs]) + + + # full conv + x = layers.Conv2D(n_classes, (1,1), padding='same', activation=keras.activations.softmax, + name = 'full_conv')(x) + + # flatten + x = layers.Reshape((img_size[0] * img_size[1], n_classes))(x) + + + return keras.Model(inputs=inputs, outputs=x) + + +class CustomModel(keras.Model): + def train_step(self, data): + # Unpack the data. Its structure depends on your model and + # on what you pass to `fit()`. + if len(data) == 3: + x, y, sample_weight = data + else: + sample_weight = None + x, y = data + + with tf.GradientTape() as tape: + y_pred = self(x, training=True) # Forward pass + # Compute the loss value. + # The loss function is configured in `compile()`. 
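+            # sample_weight comes from OxfordPets.__getitem__: every pixel gets
+            # weight 1.0 except pixels carrying the largest class id present in the
+            # batch, which get weight 0.0, so those pixels contribute neither to the
+            # loss nor to the compiled metrics updated below.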
+ loss = self.compiled_loss( + y, + y_pred, + sample_weight=sample_weight, + regularization_losses=self.losses, + ) + + # Compute gradients + trainable_vars = self.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + + # Update weights + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + + # Update the metrics. + # Metrics are configured in `compile()`. + self.compiled_metrics.update_state(y, y_pred, sample_weight=sample_weight) + + # Return a dict mapping metric names to current value. + # Note that it will include the loss (tracked in self.metrics). + return {m.name: m.result() for m in self.metrics} + + +logger = logging.getLogger(__name__) + + +class SaDeeplab(BaseModel): + ''' + train deeplab + ''' + @staticmethod + def get_knob_config(): + return { + # hyper parameters + "lr": FixedKnob(1e-4), + "batch_size": FixedKnob(2), + "epoch": FixedKnob(1), + + # application parameters + # "num_classes": FixedKnob(1), + "fine_size": FixedKnob(160), + "train_val_split_rate": FixedKnob(0.9), + + } + + + def __init__(self, **knobs): + super().__init__(**knobs) + self._knobs = knobs + + self.model = None + + self.fine_size = self._knobs.get("fine_size") + self.split_rate = self._knobs.get("train_val_split_rate") + + + def train(self, dataset_path, **kwargs): + # hyper parameters + self.batch_size = self._knobs.get("batch_size") + self.epoch = self._knobs.get("epoch") + + self.lr = self._knobs.get("lr") + + + logger.info("Training params: {}".format(json.dumps(kwargs))) + + + # extract uploaded zipfile + dataset_zipfile = zipfile.ZipFile(dataset_path, 'r') + + train_folder = tempfile.TemporaryDirectory() + folder_name = train_folder.name + dataset_zipfile.extractall(path=folder_name) + + # load train params from zipfile + with open(os.path.join(folder_name, 'param.json'),'r') as load_f: + load_dict = json.load(load_f) + self.num_classes = load_dict["num_classes"] if "num_classes" in list(load_dict.keys()) else 21 # default class number(21) is the same as voc2012 + print(f"total number of classes: {self.num_classes}") + logger.info(f"total number of classes: {self.num_classes}") + + # load images from zipfile + if os.path.isdir(os.path.join(folder_name, "image")): + print("split train/val subsets...") + logger.info("split train/val subsets...") + + # load image and mask seperately + input_img_paths = sorted( + [ + os.path.join(folder_name, "image", fname) + for fname in os.listdir(os.path.join(folder_name, "image")) + ] + ) + target_img_paths = sorted( + [ + os.path.join(folder_name, "mask", fname) + for fname in os.listdir(os.path.join(folder_name, "mask")) + ] + ) + self.num_image = len(input_img_paths) + print("Total image number: ", self.num_image) + logger.info(f"Total image number : {self.num_image}") + + # split train/val + val_samples = int((1 - self.split_rate) * self.num_image) + # random.Random(1337).shuffle(input_img_paths) + # random.Random(1337).shuffle(target_img_paths) + train_input_img_paths = input_img_paths[:-val_samples] + train_target_img_paths = target_img_paths[:-val_samples] + val_input_img_paths = input_img_paths[-val_samples:] + val_target_img_paths = target_img_paths[-val_samples:] + + print(f"train images: {len(train_input_img_paths)}, val images: {len(val_input_img_paths)}") + logger.info(f"train images: {len(train_input_img_paths)}, val images: {len(val_input_img_paths)}") + + elif os.path.isdir(os.path.join(folder_name, "train")): + print("directly load train/val datasets...") + logger.info("directly load train/val datasets...") + + # 
load image and mask separately + train_input_img_paths = sorted( + [ + os.path.join(folder_name, "train", "image", fname) + for fname in os.listdir(os.path.join(folder_name, "train", "image")) + ] + ) + train_target_img_paths = sorted( + [ + os.path.join(folder_name, "train", "mask", fname) + for fname in os.listdir(os.path.join(folder_name, "train", "mask")) + ] + ) + + val_input_img_paths = sorted( + [ + os.path.join(folder_name, "val", "image", fname) + for fname in os.listdir(os.path.join(folder_name, "val", "image")) + ] + ) + val_target_img_paths = sorted( + [ + os.path.join(folder_name, "val", "mask", fname) + for fname in os.listdir(os.path.join(folder_name, "val", "mask")) + ] + ) + self.num_image = len(train_input_img_paths) + len(val_input_img_paths) + print("Total image number: ", self.num_image) + logger.info(f"Total image number: {self.num_image}") + + print(f"train images: {len(train_input_img_paths)}, val images: {len(val_input_img_paths)}") + logger.info(f"train images: {len(train_input_img_paths)}, val images: {len(val_input_img_paths)}") + else: + print("unsupported dataset format!") + logger.info("unsupported dataset format!") + + # load dataset + train_gen = OxfordPets( + self.batch_size, self.fine_size, train_input_img_paths, train_target_img_paths + ) + val_gen = OxfordPets(self.batch_size, self.fine_size, val_input_img_paths, val_target_img_paths) + + logger.info("Training the model DeeplabV3+ using {}".format("cuda" if tf.test.is_gpu_available() else "cpu")) + print("Training the model DeeplabV3+ using {}".format("cuda" if tf.test.is_gpu_available() else "cpu")) + + # clear session buffer + keras.backend.clear_session() + + # get the model using our helper function + inputs = keras.Input(shape=(self.fine_size, self.fine_size) + (3,)) + outputs = DeepLabV3Plus((self.fine_size, self.fine_size), self.num_classes)(inputs) + self.model = CustomModel(inputs, outputs) + self.model.summary() + + # compile model with optimizer, loss and metrics
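A note on the data pipeline before the model is compiled and fitted below: `OxfordPets` is not shown in this hunk (it is presumably defined earlier in the file or imported) and is assumed to be a `keras.utils.Sequence` yielding (image, mask) batches at `fine_size` resolution. A purely illustrative stand-in for that assumed interface, not the author's implementation:

import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.image import img_to_array, load_img

class PairSequence(keras.utils.Sequence):
    """Hypothetical loader with the same constructor signature as OxfordPets above."""

    def __init__(self, batch_size, img_size, input_img_paths, target_img_paths):
        self.batch_size = batch_size
        self.img_size = (img_size, img_size)
        self.input_img_paths = input_img_paths
        self.target_img_paths = target_img_paths

    def __len__(self):
        return len(self.input_img_paths) // self.batch_size

    def __getitem__(self, idx):
        i = idx * self.batch_size
        xs = [img_to_array(load_img(p, target_size=self.img_size))
              for p in self.input_img_paths[i:i + self.batch_size]]
        # NOTE: the exact mask shape must line up with the reshaped model output above.
        ys = [img_to_array(load_img(p, target_size=self.img_size, color_mode="grayscale"))
              for p in self.target_img_paths[i:i + self.batch_size]]
        return np.stack(xs), np.stack(ys)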
+ self.model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["sparse_categorical_accuracy"]) + callbacks = [ + # keras.callbacks.ModelCheckpoint("oxford_segmentation.h5", save_best_only=True), + keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5, min_lr=1e-6) + ] + + # start training + self.model.fit(train_gen, epochs=self.epoch, validation_data=val_gen, callbacks=callbacks) + + + def evaluate(self, val_dataset_path, **kwargs): + # extract validation datasets + dataset_zipfile = zipfile.ZipFile(val_dataset_path, 'r') + val_folder = tempfile.TemporaryDirectory() + dataset_zipfile.extractall(path=val_folder.name) + folder_name = val_folder.name + + if os.path.isdir(os.path.join(folder_name, "image")): + print("split train/val subsets...") + logger.info("split train/val subsets...") + + # load image and mask seperately + input_img_paths = sorted( + [ + os.path.join(folder_name, "image", fname) + for fname in os.listdir(os.path.join(folder_name, "image")) + ] + ) + target_img_paths = sorted( + [ + os.path.join(folder_name, "mask", fname) + for fname in os.listdir(os.path.join(folder_name, "mask")) + ] + ) + + # split train/val + val_samples = int((1 - self.split_rate) * self.num_image) + # random.Random(1337).shuffle(input_img_paths) + # random.Random(1337).shuffle(target_img_paths) + train_input_img_paths = input_img_paths[:-val_samples] + train_target_img_paths = target_img_paths[:-val_samples] + val_input_img_paths = input_img_paths[-val_samples:] + val_target_img_paths = target_img_paths[-val_samples:] + + elif os.path.isdir(os.path.join(folder_name, "train")): + print("directly load train/val datasets...") + logger.info("directly load train/val datasets...") + + # load image and mask seperately + train_input_img_paths = sorted( + [ + os.path.join(folder_name, "train", "image", fname) + for fname in os.listdir(os.path.join(folder_name, "train", "image")) + ] + ) + train_target_img_paths = sorted( + [ + os.path.join(folder_name, "train", "mask", fname) + for fname in os.listdir(os.path.join(folder_name, "train", "mask")) + ] + ) + + val_input_img_paths = sorted( + [ + os.path.join(folder_name, "val", "image", fname) + for fname in os.listdir(os.path.join(folder_name, "val", "image")) + ] + ) + val_target_img_paths = sorted( + [ + os.path.join(folder_name, "val", "mask", fname) + for fname in os.listdir(os.path.join(folder_name, "val", "mask")) + ] + ) + + else: + print("unsupported dataset format!") + logger.info("unsupported dataset format!") + + val_gen = OxfordPets(self.batch_size, self.fine_size, val_input_img_paths, val_target_img_paths) + + loss, accuracy = self.model.evaluate(val_gen) + + + return accuracy + + + def dump_parameters(self): + params = {} + with tempfile.NamedTemporaryFile(suffix=".h5") as tmp: + + # Save whole model to a tempfile + self.model.save_weights(tmp.name) + # Read from tempfile & encode it to base64 string + # with h5py.File(tmp.name, 'r') as f: + # if 'layer_names' not in f.attrs and 'model_weights' in f: + # weights_h5 = f['model_weights'] + with open(tmp.name, 'rb') as f: + weight_base64 = f.read() + params['weight_base64'] = base64.b64encode(weight_base64).decode('utf-8') + params['num_classes'] = self.num_classes + return params + + + def load_parameters(self, params): + weight_base64 = params['weight_base64'] + self.num_classes = params['num_classes'] + + weight_base64_bytes = base64.b64decode(weight_base64.encode('utf-8')) + + # state_dict = torch.load(BytesIO(weight_base64_bytes), map_location=self.device) + + 
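Aside on the parameter serialisation used in `dump_parameters` above and `load_parameters` here: the HDF5 weights file is simply passed through base64. A compact sketch of the same round-trip that stays on the public `save_weights`/`load_weights` API (helper names are illustrative, not part of the class); it avoids the `h5py`/`hdf5_format.load_weights_from_hdf5_group` call below, which reaches into a private Keras module:

import base64
import tempfile

def weights_to_b64(model):
    # Serialise Keras weights to an HDF5 tempfile, then base64-encode the raw bytes.
    with tempfile.NamedTemporaryFile(suffix=".h5") as tmp:
        model.save_weights(tmp.name)
        with open(tmp.name, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

def b64_to_weights(model, weight_b64):
    # Decode back to bytes, write them to a tempfile and let Keras read it.
    raw = base64.b64decode(weight_b64.encode("utf-8"))
    with tempfile.NamedTemporaryFile(suffix=".h5") as tmp:
        tmp.write(raw)
        tmp.flush()
        model.load_weights(tmp.name)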
inputs = keras.Input(shape=(self.fine_size, self.fine_size) + (3,)) + outputs = DeepLabV3Plus((self.fine_size, self.fine_size), self.num_classes)(inputs) + self.model = CustomModel(inputs, outputs) + + # weight_h5 = h5py.File(BytesIO(weight_base64_bytes)) + with h5py.File(BytesIO(weight_base64_bytes), 'r') as f: + hdf5_format.load_weights_from_hdf5_group(f, self.model.layers) + + # self.model.load_weights(weight_h5) + + + def _get_prediction(self, img): + + image = cv2.resize(img.astype('float32'), (self.fine_size, self.fine_size)) + print("+"*30) + print(image.shape) + image = np.expand_dims(image, axis=0) + predict = self.model.predict(image) + + mask = np.argmax(predict, axis=-1) + mask = np.expand_dims(mask, axis=-1) + mask = np.reshape(mask, (160, 160)) + + h, w = image.shape[:2] + + # transform result image into original size + mask_out = cv2.resize(mask.astype(np.uint8), (w, h), cv2.INTER_NEAREST) + + + return mask_out + + + def predict(self, queries: List[List]) -> List[dict]: + # print(len(queries)) + result = list() + + # depending on different input types, need different conditions + for idx, img in enumerate(queries): + logger.info(type(img)) + if isinstance(img, List): + print(len(img)) + img = np.array(img[0]) + print(img.shape) + img_file = img + # print(type(img_file)) + elif isinstance(img, Image.Image): + img_file = np.array(img) + else: + img_data = img + + # get prediction + res_raw = self._get_prediction(img_file) + + # add color palette (we follow the VOC2012 color map and the max num_class is 21) + res_raw = res_raw.astype(np.uint8) + res = Image.fromarray(res_raw) + palette = [] + for i in range(256): + palette.extend((i, i, i)) + palette[:3*21] = np.array([[0, 0, 0], + [128, 0, 0], + [0, 128, 0], + [128, 128, 0], + [0, 0, 128], + [128, 0, 128], + [0, 128, 128], + [128, 128, 128], + [64, 0, 0], + [192, 0, 0], + [64, 128, 0], + [192, 128, 0], + [64, 0, 128], + [192, 0, 128], + [64, 128, 128], + [192, 128, 128], + [0, 64, 0], + [128, 64, 0], + [0, 192, 0], + [128, 192, 0], + [0, 64, 128] + ], dtype='uint8').flatten() + res.putpalette(palette) + + name = f"./query_{idx}.png" + res.save(name) + full_name = os.path.abspath(name) + + buffered = BytesIO() + res.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()) + + + result.append(img_str.decode('utf-8')) + + # result.append(requests.get('http://192.168.100.203:36667/fetch').text) + + return result + +if __name__ == "__main__": + import argparse + + from singa_auto.model.dev import test_model_class + parser = argparse.ArgumentParser() + parser.add_argument('--train_path', + type=str, + default='./dataset/oxford_pets/datasets.zip', + help='Path to train dataset') + parser.add_argument('--val_path', + type=str, + default='./dataset/oxford_pets/datasets.zip', + help='Path to validation dataset') + + # parser.add_argument('--annotation_dataset_path', + # type=str, + # default='./dataset/voc2012/val2014.zip', + # help='Path to validation dataset') + + # parser.add_argument('--test_path', + # type=str, + # default='/hdd1/PennFudanPed.zip', + # help='Path to test dataset') + parser.add_argument( + '--query_path', + type=str, + default='/home/zhaozixiao/projects/singa_local/singa-auto/dataset/oxford_pets/Persian_120.jpg,/home/zhaozixiao/projects/singa_local/singa-auto/dataset/oxford_pets/pomeranian_159.jpg', + help='Path(s) to query image(s), delimited by commas' + ) + + (args, _) = parser.parse_known_args() + + # print(args.query_path.split(',')) + + imgs = 
utils.dataset.load_images(args.query_path.split(',')) + img_nps = [] + for i in imgs: + img = np.array(i) + img_nps.append(img) + + queries = img_nps + test_model_class(model_file_path=__file__, + model_class='SaDeeplab', + task='IMAGE_SEGMENTATION', + dependencies={ + "tensorflow": "2.3.0", + "opencv": "3.4.2.16", + }, + train_dataset_path=args.train_path, + val_dataset_path=args.val_path, + test_dataset_path=None, + train_args={"num_classes": 3}, + queries=img_nps) + + + + + + + + + + + + + + + diff --git a/examples/models/image_segmentation/SaUNetBorderLoss.py b/examples/models/image_segmentation/SaUNetBorderLoss.py new file mode 100644 index 00000000..aae6180b --- /dev/null +++ b/examples/models/image_segmentation/SaUNetBorderLoss.py @@ -0,0 +1,847 @@ +import os + +os.environ['CUDA_VISIBLE_DEVICES'] = "4, 5, 6, 7" + +import sys +sys.path.append(os.getcwd()) + +import base64 +import json +import logging +import os +import tempfile +import zipfile +from collections.abc import Sequence +from collections import defaultdict +from copy import deepcopy +from io import BytesIO +from typing import List +from glob import glob +from time import time +import requests + +import cv2 +import numpy as np +import PIL +import torch +import torch.nn as nn +import torchvision +from PIL import Image +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.sampler import RandomSampler +from torchvision import models +from torchvision.transforms import functional as F +from torchvision.transforms.transforms import Pad, Resize +from tqdm import tqdm +from torch.nn import DataParallel + +from singa_auto.model import SegmentationModel, CategoricalKnob, FixedKnob, utils +from singa_auto.model.knob import BaseKnob +# from singa_auto.utils.metrics import do_kaggle_metric + +# from singa_auto.datasets.image_segmentation_dataset import * + + +# dataset fetch +def ImageFetch(img_folder, split_rate=0.9): + img_train = [] + mask_train = [] + img_val = [] + mask_val = [] + + image_folder = os.path.join(img_folder, "image") + mask_folder = os.path.join(img_folder, "mask") + + img_list = os.listdir(image_folder) + total_img_num = len(img_list) + print(f'Total number of images: {total_img_num}') + + train_num = int(total_img_num * split_rate) + for idx, image_name in tqdm(enumerate(img_list[:train_num]), total=train_num, desc="load train images......"): + image_path = os.path.join(image_folder, image_name) + mask_path = os.path.join(mask_folder, image_name.split('.')[0] + ".png") + + image = Image.open(image_path) + img_train.append(image) + + mask = Image.open(mask_path) + mask_train.append(mask) + for idx, image_name in tqdm(enumerate(img_list[train_num:]), total=(total_img_num - train_num), desc="load val images......"): + image_path = os.path.join(image_folder, image_name) + mask_path = os.path.join(mask_folder, image_name.split('.')[0] + ".png") + + image = Image.open(image_path) + img_val.append(image) + + mask = Image.open(mask_path) + mask_val.append(mask) + + return img_train, mask_train, img_val, mask_val + + +def trainImageFetch(train_folder): + image_train = [] + mask_train = [] + + # load images and masks from their folders + images_folder = os.path.join(train_folder, "image") + masks_folder = os.path.join(train_folder, "mask") + + image_list = os.listdir(images_folder) + for idx, image_name in tqdm(enumerate(image_list), total=len(image_list), desc="load train images......"): + image_path = os.path.join(images_folder, image_name) + mask_path = os.path.join(masks_folder, 
image_name.split('.')[0] + ".png") + + image = Image.open(image_path) + image_train.append(image) + + mask = Image.open(mask_path) + mask_train.append(mask) + + return image_train, mask_train + + +def valImageFetch(val_folder): + image_val = [] + mask_val = [] + + images_folder = os.path.join(val_folder, "image") + masks_folder = os.path.join(val_folder, "mask") + + image_list = os.listdir(images_folder) + for idx, image_name in tqdm(enumerate(image_list), total=len(image_list), desc="load validation images......"): + image_path = os.path.join(images_folder, image_name) + mask_path = os.path.join(masks_folder, image_name.split('.')[0] + ".png") + + image = Image.open(image_path) + image_val.append(image) + + mask = Image.open(mask_path) + mask_val.append(mask) + + return image_val, mask_val + + +class SegDataset(Dataset): + def __init__(self, image_list, mask_list, mode, transform_img, transform_mask, transform_border): + self.mode = mode + self.transform_img = transform_img + self.transform_mask = transform_mask + self.transform_border = transform_border + self.imagelist = image_list + self.masklist = mask_list + + + def __len__(self): + return len(self.imagelist) + + + def __getitem__(self, idx): + image = deepcopy(self.imagelist[idx]) + + if self.mode == 'train': + mask = deepcopy(self.masklist[idx]) + + mask_arr = np.array(mask) + border = cv2.Canny(mask_arr, 0, 0).astype(np.float) + border /= 255 + border = Image.fromarray(border.astype(np.uint8)) + border_img = self.transform_border(border) + border = torch.as_tensor(np.array(border_img), dtype=torch.int64) + # one_hot = torch.cat((torch.zeros_like(border).unsqueeze(0), torch.zeros_like(border).unsqueeze(0))).scatter_(0, border.unsqueeze(0), 1) + + image = self.transform_img(image) + + mask = self.transform_mask(mask) + mask = torch.as_tensor(np.array(mask), dtype=torch.int64) + # print(f'after transform mask max: {mask.max()}') + + # image = image.unsqueeze(0) + # mask = mask.unsqueeze(0) + + return image, mask, border + + elif self.mode == 'val': + mask = deepcopy(self.masklist[idx]) + + mask_arr = np.array(mask) + border = cv2.Canny(mask_arr, 0, 0).astype(np.float) + border /= 255 + border = Image.fromarray(border.astype(np.uint8)) + border_img = self.transform_border(border) + border = torch.as_tensor(np.array(border_img), dtype=torch.int64) + # one_hot = torch.cat((torch.zeros_like(border).unsqueeze(0), torch.zeros_like(border).unsqueeze(0))).scatter_(0, border.unsqueeze(0), 1) + + image = self.transform_img(image) + + mask = self.transform_mask(mask) + mask = torch.as_tensor(np.array(mask), dtype=torch.int64) + + # image = image.unsqueeze(0) + # mask = mask.unsqueeze(0) + + return image, mask, border + + +# define model +down_feature = defaultdict(list) +filter_list = [i for i in range(6, 9)] + + +class down_sampling(nn.Module): + def __init__(self, in_channel, out_channel): + super(down_sampling, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d(in_channel, out_channel, 3, padding=1), + nn.BatchNorm2d(out_channel), + nn.ReLU(inplace=True), + nn.Conv2d(out_channel, out_channel, 3, padding=1), + nn.BatchNorm2d(out_channel), + nn.ReLU(inplace=True) + ) + self.pool = nn.MaxPool2d(2) + + + def forward(self, in_feat): + x = self.conv(in_feat) + down_feature[in_feat.device.index].append(x) + x = self.pool(x) + + return x + + +class up_sampling(nn.Module): + def __init__(self, in_channel, out_channel): + super(up_sampling, self).__init__() + self.up_conv = nn.ConvTranspose2d(in_channel, out_channel, 2, stride=2) + 
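Aside, looking back at `SegDataset` above: the border target is just a Canny edge map of the label mask rescaled to {0, 1} (note that `np.float` used there is deprecated in recent NumPy; plain `float` is the safe spelling). Standalone, with an illustrative mask:

import cv2
import numpy as np

mask = np.zeros((64, 64), dtype=np.uint8)
mask[16:48, 16:48] = 1                                   # one square "object"
border = (cv2.Canny(mask, 0, 0) / 255).astype(np.uint8)  # 1 on object boundaries, 0 elsewhere
print(border.sum())                                      # number of border pixels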
self.relu_conv = nn.Sequential( + nn.Conv2d(in_channel, out_channel, kernel_size=3, padding=1), + nn.BatchNorm2d(out_channel), + nn.ReLU(inplace=True), + nn.Conv2d(out_channel, out_channel, kernel_size=3, padding=1), + nn.BatchNorm2d(out_channel), + nn.ReLU(inplace=True) + ) + + + def forward(self, in_feat): + x = self.up_conv(in_feat) + down_map = down_feature[in_feat.device.index].pop() + x = torch.cat([x, down_map], dim=1) + x = self.relu_conv(x) + return x + + +class UNet(nn.Module): + def __init__(self, num_classes): + super(UNet, self).__init__() + self.input_conv = down_sampling(3, 64) + self.down_list = [down_sampling(2 ** i, 2 ** (i + 1)) for i in filter_list] + self.down = nn.Sequential(*self.down_list) + + self.last_layer = nn.Sequential( + nn.Conv2d(512, 1024, 3, padding=1), + nn.BatchNorm2d(1024), + nn.ReLU(inplace=True), + nn.Conv2d(1024, 1024, 3, padding=1), + nn.BatchNorm2d(1024), + nn.ReLU(inplace=True) + ) + + self.up_init = up_sampling(1024, 512) + self.up_list = [up_sampling(2 ** (i + 1), 2 ** i) for i in filter_list[::-1]] + self.up = nn.Sequential(*self.up_list) + + self.output = nn.Conv2d(64, num_classes, 1) + # self.classifier = nn.Softmax() + + + + def forward(self, in_feat): + x = self.input_conv(in_feat) + x = self.down(x) + x = self.last_layer(x) + x = self.up_init(x) + x = self.up(x) + out = self.output(x) + + + # out = self.classifier(x) + # return out + return out, x + + +class BorderUNet(nn.Module): + def __init__(self, n_class): + super().__init__() + + self.unet = UNet(n_class) + + self.border_extraction = nn.Conv2d(64, 2, kernel_size=1, padding=0) + + self.softmax = nn.Softmax2d() + + def forward(self, in_feat): + # regular cnn process + init_seg, unet_feature = self.unet(in_feat) + + # extract and enhance border + init_border = self.border_extraction(unet_feature) + + # output + out = self.softmax(init_seg) + + return out, init_border + + +# pre-process: resize image to the target scale keeping aspect ratio then pad to square +class ResizeSquarePad(Resize, Pad): + def __init__(self, target_length, interpolation_strategy): + if not isinstance(target_length, (int, Sequence)): + raise TypeError("Size should be int or sequence. 
Got {}".format(type(target_length))) + if isinstance(target_length, Sequence) and len(target_length) not in (1, 2): + raise ValueError("If size is a sequence, it should have 1 or 2 values") + + self.target_length = target_length + self.interpolation_strategy = interpolation_strategy + Resize.__init__(self, size=(320, 320), interpolation=self.interpolation_strategy) + Pad.__init__(self, padding=(0,0,0,0), fill=255, padding_mode="constant") + + + def __call__(self, img): + w, h = img.size + if w > h: + self.size = (int(np.round(self.target_length * (h / w))), self.target_length) + img = Resize.__call__(self, img) + + total_pad = self.size[1] - self.size[0] + half_pad = total_pad // 2 + self.padding = (0, half_pad, 0, total_pad - half_pad) + return Pad.__call__(self, img) + else: + self.size = (self.target_length, int(np.round(self.target_length * (w / h)))) + img = Resize.__call__(self, img) + + total_pad = self.size[0] - self.size[1] + half_pad = total_pad // 2 + self.padding = (half_pad, 0, total_pad - half_pad, 0) + return Pad.__call__(self, img) + + +# customized loss function +class DiceLoss(nn.Module): + def __init__(self): + super(DiceLoss, self).__init__() + + def forward(self, input, target): + N = target.size(0) + smooth = 1 + + input_flat = input.view(N, -1) + target_flat = target.view(N, -1) + + intersection = input_flat * target_flat + + loss = 2 * (intersection.sum(1) + smooth) / \ + (input_flat.sum(1) + target_flat.sum(1) + smooth) + loss = 1 - loss.sum() / N + + return loss + + +class MulticlassDiceLoss(nn.Module): + """ + requires input(prediction) dimension as [b, c, h, w] + target(ground truth mask) dimension as [b, 1, h, w] where dimension 2 refers to the class index + Can convert target to one_hot automatically and support ignore labels (should be in the form of list) + """ + + def __init__(self, ignore_labels=None): + super(MulticlassDiceLoss, self).__init__() + self.ignore_labels = ignore_labels + + def forward(self, input, target): + + num_ignore = 0 if self.ignore_labels == None else len(self.ignore_labels) + + n, _, h, w = target.shape[:] + + num_classes = input.shape[1] + + # initialize zeros for one_hot + zeros = torch.zeros((n, (num_classes + num_ignore), h, w)).to(target.device) + + # decrease ignore labels' indexes into successive integers(eg: convert 0, 1, 2, 255 into 0, 1, 2, 3) + for i in range(num_ignore): + target[target == self.ignore_labels[i]] = num_classes + i + + # scatter to one_hot + one_hot = zeros.scatter_(1, target, 1) + + dice = DiceLoss() + totalLoss = 0 + + # for indexes out of range, not compute corresponding loss + for i in range(num_classes): + diceLoss = dice(input[:, i], one_hot[:,i]) + totalLoss += diceLoss + + return totalLoss + + +logger = logging.getLogger(__name__) + + +# main process procedure +class SaUNetBorderLoss(SegmentationModel): + ''' + train UNet + ''' + @staticmethod + def get_knob_config(): + return { + # hyper parameters + "lr": FixedKnob(1e-3), + "momentum": FixedKnob(0.9), + + "ignore_index": FixedKnob(255), + "batch_size": FixedKnob(12), + "epoch": FixedKnob(1), + + # application parameters + # "num_classes": FixedKnob(1), + "fine_size": FixedKnob(512), + + } + + + def __init__(self, **knobs): + super().__init__(**knobs) + # load knobs + self._knobs = knobs + + # initiate hyper params + self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') + print("self.device", self.device) + logger.info(self.device) + + self.model = None + + self.fine_size = self._knobs.get("fine_size") + + 
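Aside on `ResizeSquarePad` above (and the `ignore_index` knob read just below): it resizes the longer side to `target_length`, keeps the aspect ratio, and pads the shorter side to make the output square; the pad fill of 255 matches `ignore_index`, so padded mask pixels are excluded from both loss terms. A quick sanity check, assuming torchvision and Pillow as imported at the top of this file (the 600x400 size is arbitrary):

from PIL import Image

pad_resize = ResizeSquarePad(512, Image.BILINEAR)  # class defined above
img = Image.new("RGB", (600, 400))                 # landscape dummy input
out = pad_resize(img)
print(out.size)  # (512, 512): resized to 512x341, then padded top and bottom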
self.ignore_index = self._knobs.get("ignore_index") + + self.batch_size = self._knobs.get("batch_size") + self.epoch = self._knobs.get("epoch") + + self.lr = self._knobs.get("lr") + self.momentum = self._knobs.get("momentum") + + # define preprocessing procedure + self.transform_img = torchvision.transforms.Compose([ + ResizeSquarePad(self.fine_size, Image.BILINEAR), + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225]) + ]) + + self.transform_mask = torchvision.transforms.Compose([ + ResizeSquarePad(self.fine_size, Image.NEAREST) + ]) + + self.transform_border = torchvision.transforms.Compose([ + ResizeSquarePad(self.fine_size, Image.NEAREST) + ]) + + + def train(self, dataset_path, **kwargs): + logger.info("Training params: {}".format(json.dumps(kwargs))) + + + # extract uploaded zipfile + dataset_zipfile = zipfile.ZipFile(dataset_path, 'r') + + train_folder = tempfile.TemporaryDirectory() + folder_name = train_folder.name + dataset_zipfile.extractall(path=folder_name) + + # load train params from zipfile + with open(os.path.join(folder_name, 'param.json'),'r') as load_f: + load_dict = json.load(load_f) + self.num_classes = load_dict["num_classes"] if "num_classes" in list(load_dict.keys()) else 21 # default class number(21) is the same as voc2012 + + # load images from zipfile + if os.path.isdir(os.path.join(folder_name, "image")): + print("split train/val subsets...") + logger.info("split train/val subsets...") + image_train, mask_train, image_val, mask_val = ImageFetch(folder_name) + self.num_image = len(image_train) + print("Total training images : ", self.num_image) + logger.info(f"Total training images : {self.num_image}") + elif os.path.isdir(os.path.join(folder_name, "train")): + print("directly load train/val datasets...") + logger.info("directly load train/val datasets...") + image_train, mask_train = trainImageFetch(folder_name) + image_val, mask_val = valImageFetch(folder_name) + self.num_image = len(image_train) + print("Total training images : ", self.num_image) + logger.info(f"Total training images : {self.num_image}") + else: + print("unsupported dataset format!") + logger.info("unsupported dataset format!") + + # load dataset + train_data = SegDataset(image_train, mask_train, 'train', self.transform_img, self.transform_mask, self.transform_border) + val_data = SegDataset(image_val, mask_val, 'val', self.transform_img, self.transform_mask, self.transform_border) + + logger.info("Training the model ResUNet using {}".format(self.device)) + print("Training the model ResUNet using {}".format(self.device)) + + # define training and validation data loaders + train_loader = DataLoader(train_data, + shuffle=RandomSampler(train_data), + batch_size=self.batch_size) + + val_loader = DataLoader(val_data, + shuffle=False, + batch_size=self.batch_size) + + # get the model using our helper function + self.model = BorderUNet(self.num_classes) + self.model = DataParallel(self.model) + self.model.to(self.device) + + self.criterion_ce = nn.CrossEntropyLoss(weight=torch.Tensor([1, 100]), ignore_index=self.ignore_index) + self.criterion_dice = MulticlassDiceLoss(ignore_labels=[255]) + + self.optimizer_ft = torch.optim.SGD(filter(lambda p: p.requires_grad, self.model.parameters()), lr=self.lr, momentum=self.momentum) + self.exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer_ft, step_size=20, gamma=0.1) + + # start training + for epoch_ in range(self.epoch): + train_loss = self._train_one_epoch(train_loader, 
self.model) + val_loss, accuracy = self._evaluate(val_loader, self.model) + self.exp_lr_scheduler.step() + + print('epoch: {} train_loss: {:.3f} val_loss: {:.3f} val_accuracy: {:.3f}'.format(epoch_ + 1, train_loss, val_loss, accuracy)) + logger.info('epoch: {} train_loss: {:.3f} val_loss: {:.3f} val_accuracy: {:.3f}'.format(epoch_ + 1, train_loss, val_loss, accuracy)) + + + def _train_one_epoch(self, train_loader, model): + ''' + consider as a sub-train function inside singa-auto framework + ''' + running_loss = 0.0 + data_size = len(train_loader) * self.batch_size + + model.train() + + for inputs, masks, borders in tqdm(train_loader): + inputs, masks, borders = inputs.to(self.device), masks.long().to(self.device), borders.long().to(self.device) + self.optimizer_ft.zero_grad() + + init_seg, init_border = model(inputs) + + self.criterion_ce.to(self.device) + loss_border = self.criterion_ce(init_border, borders) + loss_seg = self.criterion_dice(init_seg, masks.unsqueeze(1)) + + loss = loss_border + loss_seg + + loss.backward() + self.optimizer_ft.step() + + running_loss += loss.item() * self.batch_size + + epoch_loss = running_loss / data_size + return epoch_loss + + + def _evaluate(self, test_loader, model): + ''' + validation per epoch + ''' + running_loss = 0.0 + acc = 0.0 + data_size = len(test_loader) * self.batch_size + + model.eval() + + with torch.no_grad(): + for inputs, masks, borders in test_loader: + inputs, masks, borders = inputs.to(self.device), masks.long().to(self.device), borders.long().to(self.device) + + outputs, fine_border = self.model(inputs) + + predict = torch.argmax(nn.Softmax(dim=1)(outputs), dim=1) # extract argmax as the final prediction + + # we do not consider the ignore_index + pure_mask = masks.masked_select(masks.ne(self.ignore_index)) + pure_predict = predict.masked_select(masks.ne(self.ignore_index)) + + acc += pure_mask.cpu().eq(pure_predict.cpu()).sum().item()/len(pure_mask) # find the correct pixels + + self.criterion_ce.to(self.device) + loss_border = self.criterion_ce(fine_border, borders) + loss_seg = self.criterion_dice(outputs, masks.unsqueeze(1)) + + loss = loss_seg + loss_border + + running_loss += loss.item() * self.batch_size + + epoch_loss = running_loss / data_size + accuracy = acc / len(test_loader) + return epoch_loss, accuracy + + + def evaluate(self, val_dataset_path, **kwargs): + # extract validation datasets + dataset_zipfile = zipfile.ZipFile(val_dataset_path, 'r') + val_folder = tempfile.TemporaryDirectory() + dataset_zipfile.extractall(path=val_folder.name) + folder_name = val_folder.name + + if os.path.isdir(os.path.join(folder_name, "image")): + print("split train/val subsets...") + logger.info("split train/val subsets...") + image_train, mask_train, X_val, y_val = ImageFetch(folder_name) + self.num_image = len(X_val) + print("Total val images : ", self.num_image) + logger.info(f"Total val images : {self.num_image}") + elif os.path.isdir(os.path.join(folder_name, "train")): + print("directly load train/val datasets...") + logger.info("directly load train/val datasets...") + image_train, mask_train = trainImageFetch(folder_name) + X_val, y_val = valImageFetch(folder_name) + self.num_image = len(X_val) + print("Total val images : ", self.num_image) + logger.info(f"Total val images : {self.num_image}") + else: + print("unsupported dataset format!") + logger.info("unsupported dataset format!") + + val_data = SegDataset(X_val, y_val, 'val', self.transform_img, self.transform_mask, self.transform_border) + + val_loader = 
DataLoader(val_data, + shuffle=False, + batch_size=self.batch_size) + # compute MIoU metric(consider as accuracy) + temp_miou = {} + for i in range(self.num_classes): + temp_miou[i] = [0, 0.0] + + self.model.eval() + + with torch.no_grad(): + for inputs, masks, borders in val_loader: + inputs, masks, borders = inputs.to(self.device), masks.long().to(self.device), borders.long().to(self.device) + + outputs, fine_border = self.model(inputs) + + predict = torch.argmax(nn.Softmax(dim=1)(outputs), dim=1) + pure_mask = masks.masked_select(masks.ne(255)) + pure_predict = predict.masked_select(masks.ne(255)) + + for class_value in pure_mask.unique(): + valued_mask = pure_mask.masked_select(pure_mask.eq(class_value)) + real_len = len(valued_mask) + + valued_predict = pure_predict.masked_select(pure_mask.eq(class_value)) + cross_len = valued_mask.eq(valued_predict).sum().item() + + predict_len = len(pure_predict.masked_select(pure_predict.eq(class_value))) + + temp_miou[class_value.item()][1] += cross_len / (real_len + predict_len - cross_len) + temp_miou[class_value.item()][0] += 1 + + miou_overall = 0.0 + existed_classes = 0 + for key in temp_miou.keys(): + if temp_miou[key][0] != 0: + miou_overall += (temp_miou[key][1] / temp_miou[key][0]) + existed_classes += 1 + temp_miou['overall'] = [1, miou_overall / existed_classes] + + for key in temp_miou.keys(): + if temp_miou[key][0] != 0: + print(f"class {key} accuracy: {temp_miou[key][1] / temp_miou[key][0]}") + return temp_miou['overall'][1] + + + def dump_parameters(self): + params = {} + with tempfile.NamedTemporaryFile() as tmp: + + # Save whole model to a tempfile + torch.save(self.model.module.state_dict(), tmp.name) + # Read from tempfile & encode it to base64 string + with open(tmp.name, 'rb') as f: + weight_base64 = f.read() + params['weight_base64'] = base64.b64encode(weight_base64).decode('utf-8') + params['num_classes'] = self.num_classes + return params + + + def load_parameters(self, params): + weight_base64 = params['weight_base64'] + self.num_classes = params['num_classes'] + + weight_base64_bytes = base64.b64decode(weight_base64.encode('utf-8')) + + state_dict = torch.load(BytesIO(weight_base64_bytes), map_location=self.device) + + self.model = BorderUNet(self.num_classes) + self.model.load_state_dict(state_dict) + + self.model = DataParallel(self.model) + self.model.to(self.device) + self.model.eval() + + def _get_prediction(self, img): + + image = self.transform_img(img) + + image = image.to(self.device) + predict, _ = self.model(image.unsqueeze(0)) + + predict = predict.squeeze(0) + predict = nn.Softmax(dim=0)(predict) + predict = torch.argmax(predict, dim=0) + + # transform result image into original size + w, h = img.size + if w > h: + re_h = int(np.round(self.fine_size * (h / w))) + total_pad = self.fine_size - re_h + half_pad = total_pad // 2 + out = predict[half_pad : half_pad + re_h, :] + else: + re_w = int(np.round(self.fine_size * (w / h))) + total_pad = self.fine_size - re_w + half_pad = total_pad // 2 + out = predict[:, half_pad : half_pad + re_w] + + out = cv2.resize(out.cpu().numpy(), (w, h), interpolation=cv2.INTER_NEAREST) + + return out + + + + def predict(self, queries: List[List]) -> List[dict]: + # print(len(queries)) + result = list() + + # depending on different input types, need different conditions + for idx, img in enumerate(queries): + print("*" * 30) + print(type(img)) + if isinstance(img, List): + print(len(img)) + img = np.array(img[0]) + print(img.shape) + img_file = Image.fromarray(np.uint8(img)) + 
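Returning to the `evaluate()` method above: it accumulates a per-class IoU batch by batch and then averages. A dataset-level mIoU computed from a confusion matrix is a convenient cross-check (it aggregates counts over the whole set rather than averaging per-batch ratios, so the two can differ slightly); the helper below is illustrative and not part of the class, with `preds`/`targets` as flattened integer label arrays:

import numpy as np

def mean_iou(preds, targets, num_classes, ignore_index=255):
    keep = targets != ignore_index
    preds, targets = preds[keep], targets[keep]
    # Confusion matrix: conf[t, p] counts pixels with true label t predicted as p.
    conf = np.bincount(num_classes * targets + preds,
                       minlength=num_classes ** 2).reshape(num_classes, num_classes)
    inter = np.diag(conf)
    union = conf.sum(0) + conf.sum(1) - inter
    present = union > 0
    return (inter[present] / union[present]).mean()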
print(type(img_file)) + elif isinstance(img, np.ndarray): + img_file = Image.fromarray(img) + else: + img_file = img + + # get prediction + res_raw = self._get_prediction(img_file) + + # add color palette (we follow the VOC2012 color map and the max num_class is 21) + res_raw = res_raw.astype(np.uint8) + res = Image.fromarray(res_raw) + palette = [] + for i in range(256): + palette.extend((i, i, i)) + palette[:3*21] = np.array([[0, 0, 0], + [128, 0, 0], + [0, 128, 0], + [128, 128, 0], + [0, 0, 128], + [128, 0, 128], + [0, 128, 128], + [128, 128, 128], + [64, 0, 0], + [192, 0, 0], + [64, 128, 0], + [192, 128, 0], + [64, 0, 128], + [192, 0, 128], + [64, 128, 128], + [192, 128, 128], + [0, 64, 0], + [128, 64, 0], + [0, 192, 0], + [128, 192, 0], + [0, 64, 128] + ], dtype='uint8').flatten() + res.putpalette(palette) + + name = f"./query_{idx}.png" + res.save(name) + full_name = os.path.abspath(name) + + buffered = BytesIO() + res.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()) + + + result.append(img_str.decode('utf-8')) + + # result.append(requests.get('http://192.168.100.203:36667/fetch').text) + + return result + + +if __name__ == "__main__": + import argparse + + from singa_auto.model.dev import test_model_class + parser = argparse.ArgumentParser() + parser.add_argument('--train_path', + type=str, + default='/home/zhaozixiao/dataset/pets/datasets.zip', + help='Path to train dataset') + parser.add_argument('--val_path', + type=str, + default='/home/zhaozixiao/dataset/pets/datasets.zip', + help='Path to validation dataset') + + # parser.add_argument('--annotation_dataset_path', + # type=str, + # default='./dataset/voc2012/val2014.zip', + # help='Path to validation dataset') + + # parser.add_argument('--test_path', + # type=str, + # default='/hdd1/PennFudanPed.zip', + # help='Path to test dataset') + parser.add_argument('--query_path', + type=str, + default='/home/zhaozixiao/dataset/pets/Persian_120.jpg,/home/zhaozixiao/dataset/pets/pomeranian_159.jpg', + help='Path(s) to query image(s), delimited by commas') + + (args, _) = parser.parse_known_args() + + # print(args.query_path.split(',')) + + imgs = utils.dataset.load_images(args.query_path.split(',')) + img_nps = [] + for i in imgs: + img = np.array(i) + img_nps.append(img) + + queries = img_nps + test_model_class(model_file_path=__file__, + model_class='SaUNetBorderLoss', + task='IMAGE_SEGMENTATION', + dependencies={"torch": "1.6.0+cu101", + "torchvision": "0.7.0+cu101", + "opencv": "3.4.2", + "tqdm": "4.28.0"}, + train_dataset_path=args.train_path, + val_dataset_path=args.val_path, + test_dataset_path=None, + train_args={"num_classes": 3}, + queries=img_nps) diff --git a/examples/models/question_answering/onnx_bert/onnx_bert.py b/examples/models/question_answering/onnx_bert/onnx_bert.py index bc9bd0b1..a6bfc0f0 100644 --- a/examples/models/question_answering/onnx_bert/onnx_bert.py +++ b/examples/models/question_answering/onnx_bert/onnx_bert.py @@ -23,7 +23,7 @@ from singa_auto.model import BaseModel from singa_auto.constants import ModelDependency -from singa_auto.model.dev import make_predictions, _check_model_class, _print_header, _check_dependencies, inform_user +from singa_auto.model.dev import make_predictions_json, _check_model_class, _print_header, _check_dependencies, inform_user from singa_auto.model.utils import load_model_class from singa_auto.advisor.constants import Proposal, ParamsType @@ -204,7 +204,7 @@ def _postprocess(self, eval_examples, extra_data, all_results): proposal = 
Proposal(trial_no=0, knobs={}, params_type=ParamsType.LOCAL_RECENT) - (predictions, model_inst) = make_predictions(queries, task, + (predictions, model_inst) = make_predictions_json(queries, task, py_model_class, proposal, fine_tune_dataset_path=None, diff --git a/examples/models/tabular_classification/SVCClf.py b/examples/models/tabular_classification/SVCClf.py index 72675304..26ee49a3 100644 --- a/examples/models/tabular_classification/SVCClf.py +++ b/examples/models/tabular_classification/SVCClf.py @@ -1,200 +1,198 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import numpy as np -import pandas as pd - -import json -import pickle -import base64 -from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC - -from singa_auto.model import TabularClfModel, IntegerKnob, CategoricalKnob, FloatKnob, logger -from singa_auto.model.dev import test_model_class -from singa_auto.constants import ModelDependency - - -class SVCClf(TabularClfModel): - ''' - Implements a C-Support Vector Classifier for classification task using Pima Indian Diabetes dataset. 
- ''' - - @staticmethod - def get_knob_config(): - return { - 'C': IntegerKnob(2, 3), - 'kernel': CategoricalKnob(['poly', 'rbf', 'linear']), - 'degree': IntegerKnob(2, 3), - 'gamma': CategoricalKnob(['scale', 'auto']), - 'coef0': FloatKnob(0.0, 0.1), - 'shrinking': CategoricalKnob([True, False]), - 'tol': FloatKnob(1e-03, 1e-01, is_exp=True), - 'decision_function_shape': CategoricalKnob(['ovo', 'ovr']), - 'probability': CategoricalKnob([True, False]), - } - - def __init__(self, **knobs): - self._knobs = knobs - self.__dict__.update(knobs) - self._clf = self._build_classifier( - self._knobs.get("C"), self._knobs.get("kernel"), - self._knobs.get("degree"), self._knobs.get("gamma"), - self._knobs.get("coef0"), self._knobs.get("shrinking"), - self._knobs.get("tol"), self._knobs.get("decision_function_shape"), - self._knobs.get("probability")) - - def train(self, dataset_path, features=None, target=None, **kwargs): - # Record features & target - self._features = features - self._target = target - - # Load CSV file as pandas dataframe - csv_path = dataset_path - data = pd.read_csv(csv_path) - - # Extract X & y from dataframe - (X, y) = self._extract_xy(data) - - X = self.prepare_X(X) - - self._clf.fit(X, y) - - # Compute train accuracy - score = self._clf.score(X, y) - logger.log('Train accuracy: {}'.format(score)) - - def evaluate(self, dataset_path, **kwargs): - # Load CSV file as pandas dataframe - csv_path = dataset_path - data = pd.read_csv(csv_path) - - # Extract X & y from dataframe - (X, y) = self._extract_xy(data) - - X = self.prepare_X(X) - - accuracy = self._clf.score(X, y) - return accuracy - - def predict(self, queries): - queries = pd.DataFrame.from_records(queries, index=[0]) - data = self.prepare_X(queries) - probs = self._clf.predict_proba(data) - return probs.tolist() - - - def destroy(self): - pass - - def dump_parameters(self): - params = {} - - # Put model parameters - clf_bytes = pickle.dumps(self._clf) - clf_base64 = base64.b64encode(clf_bytes).decode('utf-8') - params['clf_base64'] = clf_base64 - params['features'] = json.dumps(self._features) - if self._target: - params['target'] = self._target - - return params - - def load_parameters(self, params): - # Load model parameters - assert 'clf_base64' in params - clf_base64 = params['clf_base64'] - clf_bytes = base64.b64decode(clf_base64.encode('utf-8')) - - self._clf = pickle.loads(clf_bytes) - self._features = json.loads(params['features']) - if "target" in params: - self._target = params['target'] - else: - self._target = None - - def _extract_xy(self, data): - features = self._features - target = self._target - - if features is None: - X = data.iloc[:, :-1] - else: - X = data[features] - - if target is None: - y = data.iloc[:, -1] - else: - y = data[target] - - return (X, y) - - def median_dataset(self, df): - #replace zero values by median so that 0 will not affect median. 
- for col in df.columns: - df[col].replace(0, np.nan, inplace=True) - df[col].fillna(df[col].median(), inplace=True) - return df - - def prepare_X(self, df): - data = self.median_dataset(df) - X = StandardScaler().fit_transform(data) - return X - - - def _build_classifier(self, C, kernel, degree, gamma, coef0, shrinking, tol, - decision_function_shape, probability): - clf = SVC( - C=C, - kernel=kernel, - degree=degree, - gamma=gamma, - coef0=coef0, - shrinking=shrinking, - tol=tol, - decision_function_shape=decision_function_shape, - probability=probability, - ) - return clf - - -if __name__ == '__main__': - test_model_class(model_file_path=__file__, - model_class='SVCClf', - task='TABULAR_CLASSIFICATION', - dependencies={ModelDependency.SCIKIT_LEARN: '0.20.0'}, - train_dataset_path='data/diabetes_train.csv', - val_dataset_path='data/diabetes_val.csv', - train_args={ - 'features': [ - 'Pregnancies', 'Glucose', 'BloodPressure', - 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction','BMI', 'Age'], - 'target': 'Outcome' - }, - queries={ - 'Pregnancies': 3, - 'Glucose': 130, - 'BloodPressure': 92, - 'SkinThickness': 30, - 'Insulin': 90, - 'DiabetesPedigreeFunction': 1, - 'BMI': 30.4, - 'Age': 40 - }) - +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import numpy as np +import pandas as pd + +import json +import pickle +import base64 +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC + +from singa_auto.model import TabularClfModel, IntegerKnob, CategoricalKnob, FloatKnob, logger +from singa_auto.model.dev import test_model_class +from singa_auto.constants import ModelDependency + + +class SVCClf(TabularClfModel): + ''' + Implements a C-Support Vector Classifier for classification task using Pima Indian Diabetes dataset. 
+ ''' + + @staticmethod + def get_knob_config(): + return { + 'C': IntegerKnob(2, 3), + 'kernel': CategoricalKnob(['poly', 'rbf', 'linear']), + 'degree': IntegerKnob(2, 3), + 'gamma': CategoricalKnob(['scale', 'auto']), + 'coef0': FloatKnob(0.0, 0.1), + 'shrinking': CategoricalKnob([True, False]), + 'tol': FloatKnob(1e-03, 1e-01, is_exp=True), + 'decision_function_shape': CategoricalKnob(['ovo', 'ovr']), + 'probability': CategoricalKnob([True, False]), + } + + def __init__(self, **knobs): + self._knobs = knobs + self.__dict__.update(knobs) + self._clf = self._build_classifier( + self._knobs.get("C"), self._knobs.get("kernel"), + self._knobs.get("degree"), self._knobs.get("gamma"), + self._knobs.get("coef0"), self._knobs.get("shrinking"), + self._knobs.get("tol"), self._knobs.get("decision_function_shape"), + self._knobs.get("probability")) + + def train(self, dataset_path, features=None, target=None, **kwargs): + # Record features & target + self._features = features + self._target = target + + # Load CSV file as pandas dataframe + csv_path = dataset_path + data = pd.read_csv(csv_path) + + # Extract X & y from dataframe + (X, y) = self._extract_xy(data) + + X = self.prepare_X(X) + + self._clf.fit(X, y) + + # Compute train accuracy + score = self._clf.score(X, y) + logger.log('Train accuracy: {}'.format(score)) + + def evaluate(self, dataset_path, **kwargs): + # Load CSV file as pandas dataframe + csv_path = dataset_path + data = pd.read_csv(csv_path) + + # Extract X & y from dataframe + (X, y) = self._extract_xy(data) + + X = self.prepare_X(X) + + accuracy = self._clf.score(X, y) + return accuracy + + def predict(self, queries): + queries = pd.DataFrame.from_records(queries, index=[0]) + data = self.prepare_X(queries) + probs = self._clf.predict_proba(data) + return probs.tolist() + + def destroy(self): + pass + + def dump_parameters(self): + params = {} + + # Put model parameters + clf_bytes = pickle.dumps(self._clf) + clf_base64 = base64.b64encode(clf_bytes).decode('utf-8') + params['clf_base64'] = clf_base64 + params['features'] = json.dumps(self._features) + if self._target: + params['target'] = self._target + + return params + + def load_parameters(self, params): + # Load model parameters + assert 'clf_base64' in params + clf_base64 = params['clf_base64'] + clf_bytes = base64.b64decode(clf_base64.encode('utf-8')) + + self._clf = pickle.loads(clf_bytes) + self._features = json.loads(params['features']) + if "target" in params: + self._target = params['target'] + else: + self._target = None + + def _extract_xy(self, data): + features = self._features + target = self._target + + if features is None: + X = data.iloc[:, :-1] + else: + X = data[features] + + if target is None: + y = data.iloc[:, -1] + else: + y = data[target] + + return (X, y) + + def median_dataset(self, df): + #replace zero values by median so that 0 will not affect median. 
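A caveat on the `median_dataset`/`prepare_X` helpers around this point: `prepare_X` calls `StandardScaler().fit_transform` on whatever dataframe it receives, so the scaler is re-fitted at evaluation and prediction time (on a one-row query this maps every feature to 0), and no fitted scaler is persisted by `dump_parameters`. A common alternative, sketched here with illustrative helper names, is to fit the scaler once on the training data and only transform afterwards:

from sklearn.preprocessing import StandardScaler

def fit_scaler(train_df):
    # Fit once on the (median-imputed) training features ...
    return StandardScaler().fit(train_df)

def prepare_X(df, scaler):
    # ... and only transform at evaluate()/predict() time.
    return scaler.transform(df)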
+ for col in df.columns: + df[col].replace(0, np.nan, inplace=True) + df[col].fillna(df[col].median(), inplace=True) + return df + + def prepare_X(self, df): + data = self.median_dataset(df) + X = StandardScaler().fit_transform(data) + return X + + def _build_classifier(self, C, kernel, degree, gamma, coef0, shrinking, tol, + decision_function_shape, probability): + clf = SVC( + C=C, + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + shrinking=shrinking, + tol=tol, + decision_function_shape=decision_function_shape, + probability=probability, + ) + return clf + + +if __name__ == '__main__': + test_model_class(model_file_path=__file__, + model_class='SVCClf', + task='TABULAR_CLASSIFICATION', + dependencies={ModelDependency.SCIKIT_LEARN: '0.20.0'}, + train_dataset_path='data/diabetes_train.csv', + val_dataset_path='data/diabetes_val.csv', + train_args={ + 'features': [ + 'Pregnancies', 'Glucose', 'BloodPressure', + 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction','BMI', 'Age'], + 'target': 'Outcome' + }, + queries={ + 'Pregnancies': 3, + 'Glucose': 130, + 'BloodPressure': 92, + 'SkinThickness': 30, + 'Insulin': 90, + 'DiabetesPedigreeFunction': 1, + 'BMI': 30.4, + 'Age': 40 + }) + diff --git a/examples/models/tabular_regression/RidgeReg.py b/examples/models/tabular_regression/RidgeReg.py index 43de7aba..2fc0bc5b 100644 --- a/examples/models/tabular_regression/RidgeReg.py +++ b/examples/models/tabular_regression/RidgeReg.py @@ -1,224 +1,226 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import pickle -import base64 -import numpy as np -import pandas as pd -import json - -from sklearn.linear_model import Ridge -from sklearn.metrics import mean_squared_error - -from singa_auto.model import BaseModel, IntegerKnob, FloatKnob, CategoricalKnob, logger -from singa_auto.model.dev import test_model_class -from singa_auto.constants import ModelDependency - - -class RidgeReg(BaseModel): - ''' - Implements a Linear Ridge Regressor for regression task using bodyfat dataset. 
- ''' - - @staticmethod - def get_knob_config(): - return { - 'alpha': FloatKnob(0.001, 0.01), - 'normalize': CategoricalKnob([True, False]), - 'copy_X': CategoricalKnob([True, False]), - 'tol': FloatKnob(1e-05, 1e-04), - 'solver': CategoricalKnob(['svd', 'sag']), - 'random_state': IntegerKnob(1, 123) - } - - def __init__(self, **knobs): - self._knobs = knobs - self.__dict__.update(knobs) - self._regressor = self._build_regressor(self._knobs.get("alpha"), - self._knobs.get("normalize"), - self._knobs.get("copy_X"), - self._knobs.get("tol"), - self._knobs.get("solver"), - self._knobs.get("random_state")) - - def train(self, dataset_path, features=None, target=None, **kwargs): - # Record features & target - self._features = features - self._target = target - - # Load CSV file as pandas dataframe - csv_path = dataset_path - data = pd.read_csv(csv_path) - - # Extract X & y from dataframe - (X, y) = self._extract_xy(data) - - # Encode categorical features - X = self._encoding_categorical_type(X) - - self._regressor.fit(X, y) - - # Compute train root mean square error - preds = self._regressor.predict(X) - rmse = np.sqrt(mean_squared_error(y, preds)) - logger.log('Train RMSE: {}'.format(rmse)) - - - def evaluate(self, dataset_path, **kwargs): - # Load CSV file as pandas dataframe - csv_path = dataset_path - data = pd.read_csv(csv_path) - - # Extract X & y from dataframe - (X, y) = self._extract_xy(data) - - # Encode categorical features - X = self._encoding_categorical_type(X) - - preds = self._regressor.predict(X) - rmse = np.sqrt(mean_squared_error(y, preds)) - return 1 / rmse - - def predict(self, queries): - queries = [pd.DataFrame(query, index=[0]) for query in queries] - results = [ - self._regressor.predict(self._features_mapping(query)).tolist()[0] - for query in queries - ] - return results - - - def destroy(self): - pass - - def dump_parameters(self): - params = {} - - # Put model parameters - regressor_bytes = pickle.dumps(self._regressor) - regressor_base64 = base64.b64encode(regressor_bytes).decode('utf-8') - params['regressor_base64'] = regressor_base64 - params['encoding_dict'] = json.dumps(self._encoding_dict) - params['features'] = json.dumps(self._features) - params['target'] = self._target - - return params - - - def load_parameters(self, params): - # Load model parameters - assert 'regressor_base64' in params - regressor_base64 = params['regressor_base64'] - regressor_bytes = base64.b64decode(regressor_base64.encode('utf-8')) - self._regressor = pickle.loads(regressor_bytes) - - self._encoding_dict = json.loads(params['encoding_dict']) - self._features = json.loads(params['features']) - self._target = params['target'] - - - def _extract_xy(self, data): - features = self._features - target = self._target - - if features is None: - X = data.iloc[:, :-1] - else: - X = data[features] - - if target is None: - y = data.iloc[:, -1] - else: - y = data[target] - - return (X, y) - - - def _encoding_categorical_type(self, cols): - # Apply label encoding for those categorical columns - cat_cols = list( - filter(lambda x: cols[x].dtype == 'object', cols.columns)) - encoded_cols = pd.DataFrame({col: cols[col].astype('category').cat.codes \ - if cols[col].dtype == 'object' else cols[col] for col in cols}, index=cols.index) - - # Recover the missing elements (Use XGBoost to automatically handle them) - encoded_cols = encoded_cols.replace(to_replace=-1, value=np.nan) - - # Generate the dict that maps categorical features to numerical - encoding_dict = {col: {cat: n for n, cat in 
enumerate(cols[col].astype('category'). \ - cat.categories)} for col in cat_cols} - self._encoding_dict = encoding_dict - - return encoded_cols - - - def _features_mapping(self, df): - # Encode the categorical features with pre saved encoding dict - cat_cols = list(filter(lambda x: df[x].dtype == 'object', df.columns)) - df_temp = df.copy() - for col in cat_cols: - df_temp[col] = df[col].map(self._encoding_dict[col]) - df = df_temp - return df - - - def _build_regressor(self, alpha, normalize, copy_X, tol, solver, - random_state): - regressor = Ridge( - alpha=alpha, - normalize=normalize, - copy_X=copy_X, - tol=tol, - solver=solver, - random_state=random_state, - ) - return regressor - - -if __name__ == '__main__': - test_model_class(model_file_path=__file__, - model_class='RidgeReg', - task='TABULAR_REGRESSION', - dependencies={ModelDependency.SCIKIT_LEARN: '0.20.0'}, - train_dataset_path='data/bodyfat_train.csv', - val_dataset_path='data/bodyfat_val.csv', - train_args={ - 'features': [ - 'density', 'age', 'weight', 'height', 'neck', - 'chest', 'abdomen', 'hip', 'thigh', 'knee', - 'ankle', 'biceps', 'forearm', 'wrist' - ], - 'target': 'bodyfat' - }, - queries=[{ - 'density': 1.0207, - 'age': 65, - 'weight': 224.5, - 'height': 68.25, - 'neck': 38.8, - 'chest': 119.6, - 'abdomen': 118.0, - 'hip': 114.3, - 'thigh': 61.3, - 'knee': 42.1, - 'ankle': 23.4, - 'biceps': 34.9, - 'forearm': 30.1, - 'wrist': 19.4 - }]) +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import pickle +import base64 +import numpy as np +import pandas as pd +import json + +from sklearn.linear_model import Ridge +from sklearn.metrics import mean_squared_error + +from singa_auto.model import BaseModel, IntegerKnob, FloatKnob, CategoricalKnob, logger +from singa_auto.model.dev import test_model_class +from singa_auto.constants import ModelDependency + + +class RidgeReg(BaseModel): + ''' + Implements a Linear Ridge Regressor for regression task using bodyfat dataset. 
+ ''' + + @staticmethod + def get_knob_config(): + return { + 'alpha': FloatKnob(0.001, 0.01), + 'normalize': CategoricalKnob([True, False]), + 'copy_X': CategoricalKnob([True, False]), + 'tol': FloatKnob(1e-05, 1e-04), + 'solver': CategoricalKnob(['svd', 'sag']), + 'random_state': IntegerKnob(1, 123) + } + + def __init__(self, **knobs): + self._knobs = knobs + self.__dict__.update(knobs) + self._regressor = self._build_regressor(self._knobs.get("alpha"), + self._knobs.get("normalize"), + self._knobs.get("copy_X"), + self._knobs.get("tol"), + self._knobs.get("solver"), + self._knobs.get("random_state")) + + def train(self, dataset_path, features=None, target=None, **kwargs): + # Record features & target + self._features = features + self._target = target + + # Load CSV file as pandas dataframe + csv_path = dataset_path + data = pd.read_csv(csv_path) + + # Extract X & y from dataframe + (X, y) = self._extract_xy(data) + + # Encode categorical features + X = self._encoding_categorical_type(X) + + self._regressor.fit(X, y) + + # Compute train root mean square error + preds = self._regressor.predict(X) + + rmse = np.sqrt(mean_squared_error(y, preds)) + logger.log('Train RMSE: {}'.format(rmse)) + + def evaluate(self, dataset_path, **kwargs): + # Load CSV file as pandas dataframe + csv_path = dataset_path + data = pd.read_csv(csv_path) + + # Extract X & y from dataframe + (X, y) = self._extract_xy(data) + + # Encode categorical features + X = self._encoding_categorical_type(X) + + preds = self._regressor.predict(X) + + rmse = np.sqrt(mean_squared_error(y, preds)) + + return 1 / rmse + + def predict(self, queries): + queries = [pd.DataFrame(query, index=[0]) for query in queries] + results = [ + self._regressor.predict(self._features_mapping(query)).tolist()[0] + for query in queries + ] + return results + + + def destroy(self): + pass + + def dump_parameters(self): + params = {} + + # Put model parameters + regressor_bytes = pickle.dumps(self._regressor) + regressor_base64 = base64.b64encode(regressor_bytes).decode('utf-8') + params['regressor_base64'] = regressor_base64 + params['encoding_dict'] = json.dumps(self._encoding_dict) + params['features'] = json.dumps(self._features) + params['target'] = self._target + + return params + + def load_parameters(self, params): + # Load model parameters + assert 'regressor_base64' in params + regressor_base64 = params['regressor_base64'] + regressor_bytes = base64.b64decode(regressor_base64.encode('utf-8')) + + self._regressor = pickle.loads(regressor_bytes) + + self._encoding_dict = json.loads(params['encoding_dict']) + self._features = json.loads(params['features']) + self._target = params['target'] + + + def _extract_xy(self, data): + features = self._features + target = self._target + + if features is None: + X = data.iloc[:, :-1] + else: + X = data[features] + + if target is None: + y = data.iloc[:, -1] + else: + y = data[target] + + return (X, y) + + + def _encoding_categorical_type(self, cols): + # Apply label encoding for those categorical columns + cat_cols = list( + filter(lambda x: cols[x].dtype == 'object', cols.columns)) + encoded_cols = pd.DataFrame({col: cols[col].astype('category').cat.codes \ + if cols[col].dtype == 'object' else cols[col] for col in cols}, index=cols.index) + + # Recover the missing elements (Use XGBoost to automatically handle them) + encoded_cols = encoded_cols.replace(to_replace=-1, value=np.nan) + + # Generate the dict that maps categorical features to numerical + encoding_dict = {col: {cat: n for n, cat in 
enumerate(cols[col].astype('category'). \ + cat.categories)} for col in cat_cols} + self._encoding_dict = encoding_dict + + return encoded_cols + + + def _features_mapping(self, df): + # Encode the categorical features with pre saved encoding dict + cat_cols = list(filter(lambda x: df[x].dtype == 'object', df.columns)) + df_temp = df.copy() + for col in cat_cols: + df_temp[col] = df[col].map(self._encoding_dict[col]) + df = df_temp + return df + + + def _build_regressor(self, alpha, normalize, copy_X, tol, solver, + random_state): + regressor = Ridge( + alpha=alpha, + normalize=normalize, + copy_X=copy_X, + tol=tol, + solver=solver, + random_state=random_state, + ) + return regressor + + +if __name__ == '__main__': + test_model_class(model_file_path=__file__, + model_class='RidgeReg', + task='TABULAR_REGRESSION', + dependencies={ModelDependency.SCIKIT_LEARN: '0.20.0'}, + train_dataset_path='data/bodyfat_train.csv', + val_dataset_path='data/bodyfat_val.csv', + train_args={ + 'features': [ + 'density', 'age', 'weight', 'height', 'neck', + 'chest', 'abdomen', 'hip', 'thigh', 'knee', + 'ankle', 'biceps', 'forearm', 'wrist' + ], + 'target': 'bodyfat' + }, + queries=[{ + 'density': 1.0207, + 'age': 65, + 'weight': 224.5, + 'height': 68.25, + 'neck': 38.8, + 'chest': 119.6, + 'abdomen': 118.0, + 'hip': 114.3, + 'thigh': 61.3, + 'knee': 42.1, + 'ankle': 23.4, + 'biceps': 34.9, + 'forearm': 30.1, + 'wrist': 19.4 + }]) diff --git a/examples/models/tabular_regression/TreeReg.py b/examples/models/tabular_regression/TreeReg.py index 9a810417..18343cee 100644 --- a/examples/models/tabular_regression/TreeReg.py +++ b/examples/models/tabular_regression/TreeReg.py @@ -1,224 +1,226 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import pickle -import base64 -import numpy as np -import pandas as pd -import json - -from sklearn.tree import DecisionTreeRegressor -from sklearn.metrics import mean_squared_error - -from singa_auto.model import BaseModel, IntegerKnob, FloatKnob, CategoricalKnob, logger -from singa_auto.model.dev import test_model_class -from singa_auto.constants import ModelDependency - - -class TreeReg(BaseModel): - ''' - Implements a Decision Tree Regressor for regression task using bodyfat dataset. 
- ''' - - @staticmethod - def get_knob_config(): - return { - 'criterion': CategoricalKnob(['mse', 'mae']), - 'splitter': CategoricalKnob(['best', 'random']), - 'min_samples_split': IntegerKnob(2, 5), - 'max_features': CategoricalKnob(['auto', 'sqrt']), - 'random_state': IntegerKnob(1, 123), - 'min_impurity_decrease': FloatKnob(0.0, 0.2), - 'min_impurity_split': FloatKnob(1e-07, 1e-03) - } - - def __init__(self, **knobs): - self._knobs = knobs - self.__dict__.update(knobs) - self._regressor = self._build_regressor( - self._knobs.get("criterion"), self._knobs.get("splitter"), - self._knobs.get("min_samples_split"), - self._knobs.get("max_features"), self._knobs.get("random_state"), - self._knobs.get("min_impurity_decrease"), - self._knobs.get("min_impurity_split")) - - - def train(self, dataset_path, features=None, target=None, **kwargs): - # Record features & target - self._features = features - self._target = target - - # Load CSV file as pandas dataframe - csv_path = dataset_path - data = pd.read_csv(csv_path) - - # Extract X & y from dataframe - (X, y) = self._extract_xy(data) - - # Encode categorical features - X = self._encoding_categorical_type(X) - - self._regressor.fit(X, y) - - # Compute train root mean square error - preds = self._regressor.predict(X) - rmse = np.sqrt(mean_squared_error(y, preds)) - logger.log('Train RMSE: {}'.format(rmse)) - - def evaluate(self, dataset_path, **kwargs): - # Load CSV file as pandas dataframe - csv_path = dataset_path - data = pd.read_csv(csv_path) - - # Extract X & y from dataframe - (X, y) = self._extract_xy(data) - - # Encode categorical features - X = self._encoding_categorical_type(X) - - preds = self._regressor.predict(X) - rmse = np.sqrt(mean_squared_error(y, preds)) - return 1 / rmse - - def predict(self, queries): - queries = [pd.DataFrame(query, index=[0]) for query in queries] - results = [ - self._regressor.predict(self._features_mapping(query)).tolist()[0] - for query in queries - ] - return results - - - def destroy(self): - pass - - def dump_parameters(self): - params = {} - - # Put model parameters - regressor_bytes = pickle.dumps(self._regressor) - regressor_base64 = base64.b64encode(regressor_bytes).decode('utf-8') - params['regressor_base64'] = regressor_base64 - params['encoding_dict'] = json.dumps(self._encoding_dict) - params['features'] = json.dumps(self._features) - params['target'] = self._target - - return params - - def load_parameters(self, params): - # Load model parameters - assert 'regressor_base64' in params - regressor_base64 = params['regressor_base64'] - regressor_bytes = base64.b64decode(regressor_base64.encode('utf-8')) - self._regressor = pickle.loads(regressor_bytes) - - self._encoding_dict = json.loads(params['encoding_dict']) - self._features = json.loads(params['features']) - self._target = params['target'] - - def _extract_xy(self, data): - features = self._features - target = self._target - - if features is None: - X = data.iloc[:, :-1] - else: - X = data[features] - - if target is None: - y = data.iloc[:, -1] - else: - y = data[target] - - return (X, y) - - - def _encoding_categorical_type(self, cols): - # Apply label encoding for those categorical columns - cat_cols = list( - filter(lambda x: cols[x].dtype == 'object', cols.columns)) - encoded_cols = pd.DataFrame({col: cols[col].astype('category').cat.codes \ - if cols[col].dtype == 'object' else cols[col] for col in cols}, index=cols.index) - - # Recover the missing elements (Use XGBoost to automatically handle them) - encoded_cols = 
encoded_cols.replace(to_replace=-1, value=np.nan) - - # Generate the dict that maps categorical features to numerical - encoding_dict = {col: {cat: n for n, cat in enumerate(cols[col].astype('category'). \ - cat.categories)} for col in cat_cols} - self._encoding_dict = encoding_dict - - return encoded_cols - - def _features_mapping(self, df): - # Encode the categorical features with pre saved encoding dict - cat_cols = list(filter(lambda x: df[x].dtype == 'object', df.columns)) - df_temp = df.copy() - for col in cat_cols: - df_temp[col] = df[col].map(self._encoding_dict[col]) - df = df_temp - return df - - - def _build_regressor(self, criterion, splitter, min_samples_split, - max_features, random_state, min_impurity_decrease, - min_impurity_split): - regressor = DecisionTreeRegressor( - criterion=criterion, - splitter=splitter, - min_samples_split=min_samples_split, - max_features=max_features, - random_state=random_state, - min_impurity_decrease=min_impurity_decrease, - min_impurity_split=min_impurity_split, - ) - return regressor - - -if __name__ == '__main__': - test_model_class(model_file_path=__file__, - model_class='TreeReg', - task='TABULAR_REGRESSION', - dependencies={ModelDependency.SCIKIT_LEARN: '0.20.0'}, - train_dataset_path='data/bodyfat_train.csv', - val_dataset_path='data/bodyfat_val.csv', - train_args={ - 'features': [ - 'density', 'age', 'weight', 'height', 'neck', - 'chest', 'abdomen', 'hip', 'thigh', 'knee', - 'ankle', 'biceps', 'forearm', 'wrist' - ], - 'target': 'bodyfat' - }, - queries=[{ - 'density': 1.0207, - 'age': 65, - 'weight': 224.5, - 'height': 68.25, - 'neck': 38.8, - 'chest': 119.6, - 'abdomen': 118.0, - 'hip': 114.3, - 'thigh': 61.3, - 'knee': 42.1, - 'ankle': 23.4, - 'biceps': 34.9, - 'forearm': 30.1, - 'wrist': 19.4 - }]) +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import pickle +import base64 +import numpy as np +import pandas as pd +import json + +from sklearn.tree import DecisionTreeRegressor +from sklearn.metrics import mean_squared_error + +from singa_auto.model import BaseModel, IntegerKnob, FloatKnob, CategoricalKnob, logger +from singa_auto.model.dev import test_model_class +from singa_auto.constants import ModelDependency + + +class TreeReg(BaseModel): + ''' + Implements a Decision Tree Regressor for regression task using bodyfat dataset. 
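+
+    The knobs map onto `sklearn.tree.DecisionTreeRegressor` constructor
+    arguments (see `_build_regressor`). Note that `min_impurity_split` is
+    deprecated in favour of `min_impurity_decrease` in newer scikit-learn
+    releases; this example pins scikit-learn 0.20.0 (see the `__main__` block),
+    where the knob is still accepted. Categorical features are label-encoded
+    with the same scheme as in the RidgeReg example.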
+ ''' + + @staticmethod + def get_knob_config(): + return { + 'criterion': CategoricalKnob(['mse', 'mae']), + 'splitter': CategoricalKnob(['best', 'random']), + 'min_samples_split': IntegerKnob(2, 5), + 'max_features': CategoricalKnob(['auto', 'sqrt']), + 'random_state': IntegerKnob(1, 123), + 'min_impurity_decrease': FloatKnob(0.0, 0.2), + 'min_impurity_split': FloatKnob(1e-07, 1e-03) + } + + def __init__(self, **knobs): + self._knobs = knobs + self.__dict__.update(knobs) + self._regressor = self._build_regressor( + self._knobs.get("criterion"), self._knobs.get("splitter"), + self._knobs.get("min_samples_split"), + self._knobs.get("max_features"), self._knobs.get("random_state"), + self._knobs.get("min_impurity_decrease"), + self._knobs.get("min_impurity_split")) + + def train(self, dataset_path, features=None, target=None, **kwargs): + # Record features & target + self._features = features + self._target = target + + # Load CSV file as pandas dataframe + csv_path = dataset_path + data = pd.read_csv(csv_path) + + # Extract X & y from dataframe + (X, y) = self._extract_xy(data) + + # Encode categorical features + X = self._encoding_categorical_type(X) + + self._regressor.fit(X, y) + + # Compute train root mean square error + preds = self._regressor.predict(X) + + rmse = np.sqrt(mean_squared_error(y, preds)) + logger.log('Train RMSE: {}'.format(rmse)) + + def evaluate(self, dataset_path, **kwargs): + # Load CSV file as pandas dataframe + csv_path = dataset_path + data = pd.read_csv(csv_path) + + # Extract X & y from dataframe + (X, y) = self._extract_xy(data) + + # Encode categorical features + X = self._encoding_categorical_type(X) + + preds = self._regressor.predict(X) + + rmse = np.sqrt(mean_squared_error(y, preds)) + return 1 / rmse + + def predict(self, queries): + queries = [pd.DataFrame(query, index=[0]) for query in queries] + results = [ + self._regressor.predict(self._features_mapping(query)).tolist()[0] + for query in queries + ] + return results + + + def destroy(self): + pass + + def dump_parameters(self): + params = {} + + # Put model parameters + regressor_bytes = pickle.dumps(self._regressor) + regressor_base64 = base64.b64encode(regressor_bytes).decode('utf-8') + params['regressor_base64'] = regressor_base64 + params['encoding_dict'] = json.dumps(self._encoding_dict) + params['features'] = json.dumps(self._features) + params['target'] = self._target + + return params + + def load_parameters(self, params): + # Load model parameters + assert 'regressor_base64' in params + regressor_base64 = params['regressor_base64'] + regressor_bytes = base64.b64decode(regressor_base64.encode('utf-8')) + + self._regressor = pickle.loads(regressor_bytes) + + self._encoding_dict = json.loads(params['encoding_dict']) + self._features = json.loads(params['features']) + self._target = params['target'] + + def _extract_xy(self, data): + features = self._features + target = self._target + + if features is None: + X = data.iloc[:, :-1] + else: + X = data[features] + + if target is None: + y = data.iloc[:, -1] + else: + y = data[target] + + return (X, y) + + + def _encoding_categorical_type(self, cols): + # Apply label encoding for those categorical columns + cat_cols = list( + filter(lambda x: cols[x].dtype == 'object', cols.columns)) + encoded_cols = pd.DataFrame({col: cols[col].astype('category').cat.codes \ + if cols[col].dtype == 'object' else cols[col] for col in cols}, index=cols.index) + + # Recover the missing elements (Use XGBoost to automatically handle them) + encoded_cols = 
encoded_cols.replace(to_replace=-1, value=np.nan) + + # Generate the dict that maps categorical features to numerical + encoding_dict = {col: {cat: n for n, cat in enumerate(cols[col].astype('category'). \ + cat.categories)} for col in cat_cols} + self._encoding_dict = encoding_dict + + return encoded_cols + + def _features_mapping(self, df): + # Encode the categorical features with pre saved encoding dict + cat_cols = list(filter(lambda x: df[x].dtype == 'object', df.columns)) + df_temp = df.copy() + for col in cat_cols: + df_temp[col] = df[col].map(self._encoding_dict[col]) + df = df_temp + return df + + + def _build_regressor(self, criterion, splitter, min_samples_split, + max_features, random_state, min_impurity_decrease, + min_impurity_split): + regressor = DecisionTreeRegressor( + criterion=criterion, + splitter=splitter, + min_samples_split=min_samples_split, + max_features=max_features, + random_state=random_state, + min_impurity_decrease=min_impurity_decrease, + min_impurity_split=min_impurity_split, + ) + return regressor + + +if __name__ == '__main__': + test_model_class(model_file_path=__file__, + model_class='TreeReg', + task='TABULAR_REGRESSION', + dependencies={ModelDependency.SCIKIT_LEARN: '0.20.0'}, + train_dataset_path='data/bodyfat_train.csv', + val_dataset_path='data/bodyfat_val.csv', + train_args={ + 'features': [ + 'density', 'age', 'weight', 'height', 'neck', + 'chest', 'abdomen', 'hip', 'thigh', 'knee', + 'ankle', 'biceps', 'forearm', 'wrist' + ], + 'target': 'bodyfat' + }, + queries=[{ + 'density': 1.0207, + 'age': 65, + 'weight': 224.5, + 'height': 68.25, + 'neck': 38.8, + 'chest': 119.6, + 'abdomen': 118.0, + 'hip': 114.3, + 'thigh': 61.3, + 'knee': 42.1, + 'ankle': 23.4, + 'biceps': 34.9, + 'forearm': 30.1, + 'wrist': 19.4 + }]) diff --git a/examples/models/text_generation/onnx_gpt2.py b/examples/models/text_generation/onnx_gpt2.py index edeeec00..56cd5ed4 100644 --- a/examples/models/text_generation/onnx_gpt2.py +++ b/examples/models/text_generation/onnx_gpt2.py @@ -22,7 +22,7 @@ from singa_auto.model import BaseModel from singa_auto.constants import ModelDependency -from singa_auto.model.dev import make_predictions, _check_model_class, _print_header, _check_dependencies, inform_user +from singa_auto.model.dev import make_predictions_json, _check_model_class, _print_header, _check_dependencies, inform_user from singa_auto.model.utils import load_model_class from singa_auto.advisor.constants import Proposal, ParamsType @@ -150,7 +150,7 @@ def _postprocess(self, out: List[int]): proposal = Proposal(trial_no=0, knobs={}, params_type=ParamsType.LOCAL_RECENT) - (predictions, model_inst) = make_predictions(queries, task, + (predictions, model_inst) = make_predictions_json(queries, task, py_model_class, proposal, fine_tune_dataset_path=None, diff --git a/examples/scripts/quickstart.py b/examples/scripts/quickstart.py index 2c6ab0cf..8440fb95 100644 --- a/examples/scripts/quickstart.py +++ b/examples/scripts/quickstart.py @@ -43,7 +43,22 @@ def get_predictor_host(client, app): time.sleep(10) -def make_predictions(client, predictor_host, queries): +def make_predictions_image(client, predictor_host, queries): + predictions = [] + + for query in queries: + res = requests.post(url='http://{}/predict'.format(predictor_host), + files={'img': open(query, 'rb')}) + + if res.status_code != 200: + raise Exception(res.text) + + predictions.append(res.text) + + return predictions + + +def make_predictions_json(client, predictor_host, queries): predictions = [] for query in 
queries: @@ -131,7 +146,7 @@ def quickstart(client, train_dataset_path, val_dataset_path, gpus, hours, print('Making predictions for query images:') print(query_paths) queries = utils.dataset.load_images(query_paths) - predictions = make_predictions(client, predictor_host, queries) + predictions = make_predictions_json(client, predictor_host, queries) print('Predictions are:') print(predictions) diff --git a/examples/scripts/run_image_segmentation.py b/examples/scripts/run_image_segmentation.py new file mode 100644 index 00000000..78e4e3d4 --- /dev/null +++ b/examples/scripts/run_image_segmentation.py @@ -0,0 +1,180 @@ +from __future__ import absolute_import + +import os +import sys +sys.path.append(os.getcwd()) + +import argparse +import base64 + +from pprint import pprint + +from singa_auto.client import Client +# from singa_auto.config import SUPERADMIN_EMAIL +from singa_auto.constants import BudgetOption +from singa_auto.constants import InferenceBudgetOption +from singa_auto.constants import ModelDependency + + +from examples.scripts.quickstart import gen_id +from examples.scripts.quickstart import get_predictor_host +from examples.scripts.quickstart import make_predictions_image +from examples.scripts.quickstart import wait_until_train_job_has_stopped + +SINGA_AUTO_IMAGE_NAME = f"singa_auto/singa_auto_worker" +SINGA_AUTO_VERSION = os.environ.get('SINGA_AUTO_VERSION', '0.2.0') +IMAGE_SEGMENTATION_NAME = f'{SINGA_AUTO_IMAGE_NAME}:{SINGA_AUTO_VERSION}' + + +def run_image_segmentation(client, dataset_path, gpus, hours, **kwargs): + ''' + Conducts training with the `YoloV3` model for the task ``OBJECT_DETECTION`. + ''' + + task = 'IMAGE_SEGMENTATION' + + import time + + if "dataset" in kwargs: + dataset = kwargs["dataset"] + else: + print('Creating & uploading train dataset onto SINGA-Auto...') + curr_time = time.strftime("%m%d_%H%M", time.localtime(time.time())) + dataset = client.create_dataset('oxford_pets_{}'.format(curr_time), task, dataset_path) + pprint(dataset) + + if "model" in kwargs: + model = kwargs["model"] + else: + curr_time = time.strftime("%m%d_%H%M", time.localtime(time.time())) + model_name = 'deeplab_{}_iter_{}'.format(curr_time, 1) + print('Adding models "{}" to SINGA-Auto...'.format(model_name)) + model = client.create_model( + model_name, + task, + 'examples/models/image_segmentation/SaDeeplab.py', + 'SaDeeplab', + docker_image=IMAGE_SEGMENTATION_NAME, + dependencies={ + "opencv-python":"4.4.0.46", + "tensorflow": "2.3.0", + } + ) + pprint(model) + + # generate app & model names by time to avoid naming conflicts + curr_time = time.strftime("%m%d_%H%M", time.localtime(time.time())) + app = 'deeplab_{}_gpu_{}'.format(curr_time, gpus) + + print('Creating train job for app "{}" on SINGA-Auto...'.format(app)) + budget = {BudgetOption.TIME_HOURS: hours, BudgetOption.GPU_COUNT: gpus} + train_job = client.create_train_job( + app, + task, + dataset['id'], + dataset['id'], + budget, + models=[model['id']] + ) + pprint(train_job) + + print('Waiting for train job to complete...') + print('This might take a few minutes') + wait_until_train_job_has_stopped(client, app) + print('Train job has been stopped') + + # app = "deeplab_0519_1026_gpu_4" + + print('Listing best trials of latest train job for app "{}"...'.format(app)) + pprint(client.get_best_trials_of_train_job(app)) + + print('Creating inference job for app "{}" on SINGA-Auto...'.format(app)) + budget = {InferenceBudgetOption.GPU_COUNT: 1} + pprint(client.create_inference_job(app, budget=budget)) + predictor_host = 
get_predictor_host(client, app) + if not predictor_host: + raise Exception('Inference job has errored or stopped') + print('Inference job is running!') + + print('Making predictions for queries:') + queries = ['./examples/data/image_segmentaion/Persian_120.jpg'] + print(queries) + predictions = make_predictions_image(client, predictor_host, queries) + print('Predictions are:') + print(predictions) + + print('Stopping inference job...') + pprint(client.stop_inference_job(app)) + + +if "__main__" == __name__: + parser = argparse.ArgumentParser() + parser.add_argument( + '--email', + type=str, + default="superadmin@singaauto", + help='Email of user', + ) + parser.add_argument( + '--password', + type=str, + default="singa_auto", + help='Password of user', + ) + parser.add_argument( + '--gpus', + type=int, + default=1, + help='How many GPUs to use', + ) + parser.add_argument( + '--hours', + type=float, + default=1, + help='How long the train job should run for (in hours)', + ) + parser.add_argument( + '--use_old', + type=bool, + default=True, + help='whether use existing dataset and model', + ) + (args, _) = parser.parse_known_args() + + # Initialize client + client = Client() + client.login(email=args.email, password=args.password) + + print('Preprocessing dataset...') + data_dir = '/home/taomingyang/dataset/package' + dataset_path = os.path.join(data_dir, 'oxford_pets.zip') + + if args.use_old: + dataset = { + 'id': '0e6723fa-7e3e-4942-9808-07b9873b2244', + 'name': 'oxford_pets_0518_1643', + 'owner_id': 'cabd4ec6-3911-4439-b88b-660eaa7d7ad8', + 'size_bytes': 401767917, + 'stat': {}, + 'store_dataset_id': '4edfa4cc-5d5e-431b-a893-0bcebf653fd0.data', + 'task': 'IMAGE_SEGMENTATION' + } + model = { + 'id': '0a3a6bc9-a3ab-4ec7-8b4c-585af2fec948', + 'name': 'deeplab_0519_1331_iter_1', + 'user_id': 'cabd4ec6-3911-4439-b88b-660eaa7d7ad8' + } + # model = { + # 'id': '6302dbe8-22c2-4b39-bd09-dd29ffed254d', + # 'name': 'yolo_0427_1404_iter_10', + # 'user_id': '8e29b96b-ea16-4595-a1fd-86decddbab6b' + # } + + run_image_segmentation( + client, dataset_path, args.gpus, args.hours, + dataset=dataset, model=model, + ) + else: + run_image_segmentation(client, dataset_path, args.gpus, args.hours) + + print(args) diff --git a/examples/scripts/run_object_detection.py b/examples/scripts/run_object_detection.py new file mode 100644 index 00000000..60ae02aa --- /dev/null +++ b/examples/scripts/run_object_detection.py @@ -0,0 +1,207 @@ +from __future__ import absolute_import + +import os +import sys +sys.path.append(os.getcwd()) + +import argparse +import base64 + +from pprint import pprint + +from singa_auto.client import Client +# from singa_auto.config import SUPERADMIN_EMAIL +from singa_auto.constants import BudgetOption +from singa_auto.constants import InferenceBudgetOption +from singa_auto.constants import ModelDependency + + +from examples.scripts.quickstart import gen_id +from examples.scripts.quickstart import get_predictor_host +from examples.scripts.quickstart import make_predictions_image +from examples.scripts.quickstart import wait_until_train_job_has_stopped + +SINGA_AUTO_IMAGE_NAME = f"singa_auto/singa_auto_worker" +SINGA_AUTO_VERSION = os.environ.get('SINGA_AUTO_VERSION', '0.2.0') +IMAGE_OBJECT_DETECTION_NAME = f'{SINGA_AUTO_IMAGE_NAME}:{SINGA_AUTO_VERSION}' + + +def run_object_detection(client, train_dataset_path, val_dataset_path, gpus, hours, **kwargs): + ''' + Conducts training with the `YoloV3` model for the task ``OBJECT_DETECTION`. 
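+
+    A typical invocation, assuming the repository root as the working directory
+    (the dataset paths hard-coded in `__main__` below are machine-specific and
+    only illustrative):
+
+        python examples/scripts/run_object_detection.py --gpus 1 --hours 1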
+ ''' + + task = 'OBJECT_DETECTION' + + import time + + if "train_dataset" in kwargs: + train_dataset = kwargs["train_dataset"] + else: + print('Creating & uploading train dataset onto SINGA-Auto...') + curr_time = time.strftime("%m%d_%H%M", time.localtime(time.time())) + train_dataset = client.create_dataset('yolo_train_{}'.format(curr_time), task, train_dataset_path) + pprint(train_dataset) + + if "val_dataset" in kwargs: + val_dataset = kwargs["val_dataset"] + else: + print('Creating & uploading val dataset onto SINGA-Auto...') + curr_time = time.strftime("%m%d_%H%M", time.localtime(time.time())) + val_dataset = client.create_dataset('yolo_val_{}'.format(curr_time), task, val_dataset_path) + pprint(val_dataset) + + if "train_model" in kwargs: + train_model = kwargs["train_model"] + else: + curr_time = time.strftime("%m%d_%H%M", time.localtime(time.time())) + model_name = 'yolo_{}_iter_2'.format(curr_time) + print('Adding models "{}" to SINGA-Auto...'.format(model_name)) + train_model = client.create_model( + model_name, + task, + 'examples/models/image_object_detection/SaYolo.py', + 'SaYolo', + docker_image=IMAGE_OBJECT_DETECTION_NAME, + dependencies={ + "opencv-python":"4.4.0.46", + "terminaltables":"3.1.0", + "torch":"1.6.0", + "torchvision":"0.7.0", + "tqdm":"4.53.0", + "wget":"3.2", + "pycocotools":"2.0.2", + }) + pprint(train_model) + + # generate app & model names by time to avoid naming conflicts + curr_time = time.strftime("%m%d_%H%M", time.localtime(time.time())) + app = 'yolo_{}_gpu_{}'.format(curr_time, gpus) + + print('Creating train job for app "{}" on SINGA-Auto...'.format(app)) + budget = {BudgetOption.TIME_HOURS: hours, BudgetOption.GPU_COUNT: gpus} + train_job = client.create_train_job( + app, + task, + train_dataset['id'], + val_dataset['id'], + budget, + models=[train_model['id']] + ) + pprint(train_job) + + print('Waiting for train job to complete...') + print('This might take a few minutes') + wait_until_train_job_has_stopped(client, app) + print('Train job has been stopped') + + # app = "yolo_0601_0935_gpu_1" + + print('Listing best trials of latest train job for app "{}"...'.format(app)) + pprint(client.get_best_trials_of_train_job(app)) + + print('Creating inference job for app "{}" on SINGA-Auto...'.format(app)) + budget = {InferenceBudgetOption.GPU_COUNT: gpus} + pprint(client.create_inference_job(app, budget=budget)) + predictor_host = get_predictor_host(client, app) + if not predictor_host: + raise Exception('Inference job has errored or stopped') + print('Inference job is running!') + + print('Making predictions for queries:') + queries = ['./examples/data/object_detection/cat.jpg'] + print(queries) + predictions = make_predictions_image(client, predictor_host, queries) + print('Predictions are:') + print(predictions) + + print('Stopping inference job...') + pprint(client.stop_inference_job(app)) + + +if "__main__" == __name__: + parser = argparse.ArgumentParser() + parser.add_argument( + '--email', + type=str, + default="superadmin@singaauto", + help='Email of user', + ) + parser.add_argument( + '--password', + type=str, + default="singa_auto", + help='Password of user', + ) + parser.add_argument( + '--gpus', + type=int, + default=1, + help='How many GPUs to use', + ) + parser.add_argument( + '--hours', + type=float, + default=1, + help='How long the train job should run for (in hours)', + ) + parser.add_argument( + '--use_old', + type=bool, + default=True, + help='whether use existing dataset and model', + ) + (args, _) = parser.parse_known_args() + + # 
Initialize client + client = Client() + client.login(email=args.email, password=args.password) + + print('Preprocessing dataset...') + data_dir = '/home/taomingyang/dataset/package' + train_dataset_path = os.path.join(data_dir, 'coco_cat.zip') + val_dataset_path = os.path.join(data_dir, 'coco_mini.zip') + + if args.use_old: + train_dataset = { + 'id': 'a5181e1f-74d1-4916-a853-4ab75afa81d5', + 'name': 'yolo_train_0531_1650', + 'owner_id': '6d37f19f-9063-4b47-a73f-5cb6577f4f85', + 'size_bytes': 573851732, + 'stat': {}, + 'store_dataset_id': '039e05a7-917a-4441-aaff-110cf7552c73.data', + 'task': 'OBJECT_DETECTION' + } + val_dataset = { + 'id': '3bb9113f-2339-4cd4-9c22-d99ffcdd27b9', + 'name': 'yolo_val_0531_1650', + 'owner_id': '6d37f19f-9063-4b47-a73f-5cb6577f4f85', + 'size_bytes': 24435329, + 'stat': {}, + 'store_dataset_id': '93f4c5d2-b9d8-4585-b193-c1d19bb8026d.data', + 'task': 'OBJECT_DETECTION' + } + # train_model = { # using server dataset + # 'id': '48cd0413-ec4b-4f9b-8364-bbb51db52a45', + # 'name': 'yolo_0512_1055_iter_1', + # 'user_id': 'dd703056-e2f2-4e30-9e44-ecd1f7ccee7d' + # } + train_model = { # using local dataset, mini train + 'id': 'dcd9e8a0-e74f-4ca1-880c-864405a91f92', + 'name': 'yolo_0531_1650_iter_2', + 'user_id': '6d37f19f-9063-4b47-a73f-5cb6577f4f85' + } + # train_model = { # using local dataset, cat train + # 'id': '4ac4b8bc-7225-46bd-8f50-786434cf0d3e', + # 'name': 'yolo_0520_0855_iter_2', + # 'user_id': 'dd703056-e2f2-4e30-9e44-ecd1f7ccee7d' + # } + + run_object_detection( + client, train_dataset_path, val_dataset_path, args.gpus, args.hours, + train_dataset=train_dataset, val_dataset=val_dataset, train_model=train_model, + ) + else: + run_object_detection(client, train_dataset_path, val_dataset_path, args.gpus, args.hours) + + print(args) diff --git a/examples/scripts/run_pos_tagging.py b/examples/scripts/run_pos_tagging.py index 7afb14cd..9dda7ba6 100644 --- a/examples/scripts/run_pos_tagging.py +++ b/examples/scripts/run_pos_tagging.py @@ -26,7 +26,7 @@ from singa_auto.constants import BudgetOption, ModelDependency from examples.scripts.quickstart import get_predictor_host, \ - wait_until_train_job_has_stopped, make_predictions, gen_id + wait_until_train_job_has_stopped, make_predictions_json, gen_id from examples.datasets.corpus.load_sample_ptb import load_sample_ptb @@ -102,7 +102,7 @@ def run_pos_tagging(client, train_dataset_path, val_dataset_path, gpus, hours): '1,214', 'cars', 'in', 'the', 'U.S.' ]] print(queries) - predictions = make_predictions(client, predictor_host, queries) + predictions = make_predictions_json(client, predictor_host, queries) print('Predictions are:') print(predictions) diff --git a/examples/scripts/run_speech_recognition.py b/examples/scripts/run_speech_recognition.py index e7f8c7a1..8efadd43 100644 --- a/examples/scripts/run_speech_recognition.py +++ b/examples/scripts/run_speech_recognition.py @@ -17,9 +17,12 @@ # under the License. 
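+# This script prepends the current working directory to sys.path (see the
+# `sys.path.append(os.getcwd())` line below) so that the
+# `examples.scripts.quickstart` imports resolve when it is run from the
+# repository root, e.g.:
+#
+#     python examples/scripts/run_speech_recognition.py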
# +import os +import sys +sys.path.append(os.getcwd()) + from pprint import pprint import argparse -import os import base64 from singa_auto.client import Client @@ -27,7 +30,7 @@ from singa_auto.constants import BudgetOption, ModelDependency from examples.scripts.quickstart import get_predictor_host, \ - wait_until_train_job_has_stopped, make_predictions, gen_id + wait_until_train_job_has_stopped, make_predictions_json, gen_id from examples.datasets.audio_files.load_librispeech import load_librispeech diff --git a/examples/scripts/run_tabular_regression.py b/examples/scripts/run_tabular_regression.py index 36e5df66..42d9f9b4 100644 --- a/examples/scripts/run_tabular_regression.py +++ b/examples/scripts/run_tabular_regression.py @@ -26,7 +26,7 @@ from singa_auto.constants import BudgetOption, ModelDependency from examples.scripts.quickstart import get_predictor_host, \ - wait_until_train_job_has_stopped, make_predictions, gen_id + wait_until_train_job_has_stopped, make_predictions_json, gen_id from examples.datasets.tabular.csv_file import load @@ -99,7 +99,7 @@ def run_tabular_regression(client, if queries is not None: print('Making predictions for queries:') print(queries) - predictions = make_predictions(client, predictor_host, queries) + predictions = make_predictions_json(client, predictor_host, queries) print('Predictions are:') print(predictions) diff --git a/scripts/.base_env.sh b/scripts/.base_env.sh index bf3b0ef0..3ee6b3eb 100644 --- a/scripts/.base_env.sh +++ b/scripts/.base_env.sh @@ -77,6 +77,7 @@ export KIBANA_EXT_PORT=31009 export DOCKER_WORKDIR_PATH=/root export DB_DIR_ROOT=db export DB_DIR_PATH=db/data +export DB_PATH_ON_MASTER=/data0/singa_auto_data export DATA_DIR_PATH=data # Shares a data folder with containers, relative to workdir export LOGS_DIR_PATH=logs # Shares a folder with containers that stores components' logs, relative to workdir export PARAMS_DIR_PATH=params # Shares a folder with containers that stores model parameters, relative to workdir @@ -97,14 +98,17 @@ export ES_DOCKER_WORKDIR_PATH=/usr/share/elasticsearch # Docker images for SINGA-Auto's custom components -export SINGA_AUTO_IMAGE_ADMIN=singaauto/singa_auto_admin -export SINGA_AUTO_IMAGE_WEB_ADMIN=singaauto/singa_auto_admin_web -export SINGA_AUTO_IMAGE_WORKER=singaauto/singa_auto_worker -export SINGA_AUTO_IMAGE_PREDICTOR=singaauto/singa_auto_predictor -export SINGA_AUTO_IMAGE_LOGSTASH=singaauto/singa_auto_logstash -export SINGA_AUTO_IMAGE_ES=singaauto/singa_auto_es - -export SINGA_AUTO_IMAGE_TEST=singaauto/singa_auto_test +export SINGA_AUTO_IMAGE_ADMIN=singa_auto/singa_auto_admin +export SINGA_AUTO_IMAGE_WEB_ADMIN=singa_auto/singa_auto_admin_web +export SINGA_AUTO_IMAGE_WORKER=singa_auto/singa_auto_worker +export SINGA_AUTO_IMAGE_WORKER_CU90=singa_auto/singa_auto_worker_cu90 +export SINGA_AUTO_IMAGE_WORKER_CU100=singa_auto/singa_auto_worker_cu100 +export SINGA_AUTO_IMAGE_WORKER_CU101=singa_auto/singa_auto_worker_cu101 +export SINGA_AUTO_IMAGE_WORKER_CU110=singa_auto/singa_auto_worker_cu110 +export SINGA_AUTO_IMAGE_PREDICTOR=singa_auto/singa_auto_predictor +export SINGA_AUTO_IMAGE_LOGSTASH=singa_auto/singa_auto_logstash +export SINGA_AUTO_IMAGE_ES=singa_auto/singa_auto_es +export SINGA_AUTO_IMAGE_TEST=singa_auto/singa_auto_test # Docker images for dependent services export IMAGE_POSTGRES=postgres:10.5-alpine diff --git a/scripts/base_build_image.sh b/scripts/base_build_image.sh index eca15499..d667d649 100644 --- a/scripts/base_build_image.sh +++ b/scripts/base_build_image.sh @@ -35,10 +35,32 @@ title 
"Building SINGA-Auto Admin's image..." docker build -t $SINGA_AUTO_IMAGE_ADMIN:$SINGA_AUTO_VERSION -f ./dockerfiles/admin.Dockerfile \ --build-arg DOCKER_WORKDIR_PATH=$DOCKER_WORKDIR_PATH \ --build-arg CONDA_ENVIORNMENT=$CONDA_ENVIORNMENT $PWD || exit 1 + title "Building SINGA-Auto Worker's image..." docker build -t $SINGA_AUTO_IMAGE_WORKER:$SINGA_AUTO_VERSION -f ./dockerfiles/worker.Dockerfile \ --build-arg DOCKER_WORKDIR_PATH=$DOCKER_WORKDIR_PATH \ --build-arg CONDA_ENVIORNMENT=$CONDA_ENVIORNMENT $PWD || exit 1 + +title "Building SINGA-Auto Worker's image with cu90..." +docker build -t $SINGA_AUTO_IMAGE_WORKER_CU90:$SINGA_AUTO_VERSION -f ./dockerfiles/worker_cu90.Dockerfile \ + --build-arg DOCKER_WORKDIR_PATH=$DOCKER_WORKDIR_PATH \ + --build-arg CONDA_ENVIORNMENT=$CONDA_ENVIORNMENT $PWD || exit 1 + +title "Building SINGA-Auto Worker's image with cu100..." +docker build -t $SINGA_AUTO_IMAGE_WORKER_CU100:$SINGA_AUTO_VERSION -f ./dockerfiles/worker_cu100.Dockerfile \ + --build-arg DOCKER_WORKDIR_PATH=$DOCKER_WORKDIR_PATH \ + --build-arg CONDA_ENVIORNMENT=$CONDA_ENVIORNMENT $PWD || exit 1 + +title "Building SINGA-Auto Worker's image with cu101..." +docker build -t $SINGA_AUTO_IMAGE_WORKER_CU101:$SINGA_AUTO_VERSION -f ./dockerfiles/worker_cu101.Dockerfile \ + --build-arg DOCKER_WORKDIR_PATH=$DOCKER_WORKDIR_PATH \ + --build-arg CONDA_ENVIORNMENT=$CONDA_ENVIORNMENT $PWD || exit 1 + +title "Building SINGA-Auto Worker's image with cu110..." +docker build -t $SINGA_AUTO_IMAGE_WORKER_CU110:$SINGA_AUTO_VERSION -f ./dockerfiles/worker_cu110.Dockerfile \ + --build-arg DOCKER_WORKDIR_PATH=$DOCKER_WORKDIR_PATH \ + --build-arg CONDA_ENVIORNMENT=$CONDA_ENVIORNMENT $PWD || exit 1 + title "Building SINGA-Auto Predictor's image..." docker build -t $SINGA_AUTO_IMAGE_PREDICTOR:$SINGA_AUTO_VERSION -f ./dockerfiles/predictor.Dockerfile \ --build-arg DOCKER_WORKDIR_PATH=$DOCKER_WORKDIR_PATH \ diff --git a/scripts/docker_swarm/test/start_monitor.sh b/scripts/docker_swarm/test/start_monitor.sh index 2c5cef01..5949f307 100644 --- a/scripts/docker_swarm/test/start_monitor.sh +++ b/scripts/docker_swarm/test/start_monitor.sh @@ -37,7 +37,7 @@ title "Starting SINGA-Auto's Monitor..." 
-e KAFKA_HOST=$KAFKA_HOST \ -p $LOGSTASH_PORT:$LOGSTASH_PORT \ -v $HOST_WORKDIR_PATH/$LOGS_DIR_PATH:$LOGSTASH_DOCKER_WORKDIR_PATH/$LOGS_DIR_PATH \ - -v $HOST_WORKDIR_PATH/scripts/config/logstash.conf:$LOGSTASH_DOCKER_WORKDIR_PATH/logstash.conf \ + -v $HOST_WORKDIR_PATH/log_minitor/config/logstash.conf:$LOGSTASH_DOCKER_WORKDIR_PATH/logstash.conf \ -d $SINGA_AUTO_IMAGE_LOGSTASH:$SINGA_AUTO_VERSION \ &> LOGSTADH_LOG_FILE_PATH) & diff --git a/scripts/kubernetes/.env.sh b/scripts/kubernetes/.env.sh index dd7d71ea..72ea1026 100644 --- a/scripts/kubernetes/.env.sh +++ b/scripts/kubernetes/.env.sh @@ -49,11 +49,11 @@ export CONTAINER_MODE=K8S # Cluster Mode for SINGA-auto export CLUSTER_MODE=SINGLE # CLUSTER or SINGLE +source $HOST_WORKDIR_PATH/scripts/.base_env.sh $IP_ADRESS $SINGA_AUTO_VERSION || exit 1 + if [ "$CLUSTER_MODE" = "CLUSTER" ]; then export POSTGRES_HOST=stolon-proxy-service export NFS_HOST_IP=$IP_ADRESS # NFS Host IP - if used nfs as pv for database storage export RUN_DIR_PATH=run # Shares a folder with containers that stores components' running info, relative to workdir fi -source $HOST_WORKDIR_PATH/scripts/.base_env.sh $IP_ADRESS $SINGA_AUTO_VERSION || exit 1 - diff --git a/scripts/kubernetes/create_config.py b/scripts/kubernetes/create_config.py index 26ddba39..bbd38f5f 100644 --- a/scripts/kubernetes/create_config.py +++ b/scripts/kubernetes/create_config.py @@ -101,6 +101,7 @@ SINGA_AUTO_IMAGE_SPARKAPP = sys.argv[68] SPAEK_DOCKER_JARS_PATH = sys.argv[69] ES_DOCKER_WORKDIR_PATH = sys.argv[70] + DB_PATH_ON_MASTER = sys.argv[71] #zk service content = {} @@ -373,6 +374,8 @@ env.append({'name': 'CONTAINER_MODE', 'value': CONTAINER_MODE}) env.append({'name': 'INGRESS_NAME', 'value': INGRESS_NAME}) env.append({'name': 'INGRESS_EXT_PORT', 'value': INGRESS_EXT_PORT}) + env.append({'name': 'KUBERNETES_ADVERTISE_ADDR', 'value': KUBERNETES_ADVERTISE_ADDR}) + env.append({'name': 'DB_PATH_ON_MASTER', 'value': DB_PATH_ON_MASTER}) container.setdefault('env', env) with open('{}/scripts/kubernetes/start_admin_deployment.json'.format(PYTHONPATH), 'w') as f: f.write(json.dumps(content, indent=4)) @@ -474,7 +477,7 @@ {'name': 'log-path', 'mountPath': '{}/{}'.format(LOGSTASH_DOCKER_WORKDIR_PATH, LOGS_DIR_PATH)}, \ {'name': 'docker-path', 'mountPath': '/var/run/docker.sock'}]) template['spec']['volumes'] = [ - {'name': 'conf-path', 'hostPath': {'path': '{}/scripts/config/logstash.conf'.format(HOST_WORKDIR_PATH)}}, \ + {'name': 'conf-path', 'hostPath': {'path': '{}/log_minitor/config/logstash.conf'.format(HOST_WORKDIR_PATH)}}, \ {'name': 'log-path', 'hostPath': {'path': '{}/{}'.format(HOST_WORKDIR_PATH, LOGS_DIR_PATH)}}, \ {'name': 'docker-path', 'hostPath': {'path': '/var/run/docker.sock'}}] @@ -569,7 +572,7 @@ [{'name': 'conf-path', 'mountPath': '{}/config/elasticsearch.yml'.format(LOGSTASH_DOCKER_WORKDIR_PATH)},\ {'name': 'docker-path', 'mountPath': '/var/run/docker.sock'}]) template['spec']['volumes'] = [ - {'name': 'conf-path', 'hostPath': {'path': '{}/scripts/config/elasticsearch.yml'.format(HOST_WORKDIR_PATH)}}, \ + {'name': 'conf-path', 'hostPath': {'path': '{}/log_minitor/config/elasticsearch.yml'.format(HOST_WORKDIR_PATH)}}, \ {'name': 'docker-path', 'hostPath': {'path': '/var/run/docker.sock'}}] with open('{}/scripts/kubernetes/start_es_deployment.json'.format(PYTHONPATH), 'w') as f: diff --git a/scripts/kubernetes/create_nfs_pv.sh b/scripts/kubernetes/create_nfs_pv.sh index 5f31ddce..834495ef 100755 --- a/scripts/kubernetes/create_nfs_pv.sh +++ b/scripts/kubernetes/create_nfs_pv.sh @@ -6,7 
+6,7 @@ if [ $# -lt 8 ]; then fi TMP_NFS_PV_YAML=$HOST_WORKDIR_PATH/scripts/kubernetes/yaml/tmp-nfs-pv.yaml -cp $HOST_WORKDIR_PATH/scripts/kubernetes//yaml/nfs-pv.yaml.template $TMP_NFS_PV_YAML +cp $HOST_WORKDIR_PATH/scripts/kubernetes/yaml/nfs-pv.yaml.template $TMP_NFS_PV_YAML sed -ri "s/PV_NAME/$1/g" $TMP_NFS_PV_YAML sed -ri "s/PV_IP/$2/g" $TMP_NFS_PV_YAML sed -ri "s#PV_PATH#$3/#" $TMP_NFS_PV_YAML diff --git a/scripts/kubernetes/generate_config.sh b/scripts/kubernetes/generate_config.sh index d2669573..e61f2ed8 100644 --- a/scripts/kubernetes/generate_config.sh +++ b/scripts/kubernetes/generate_config.sh @@ -95,4 +95,5 @@ $SINGA_AUTO_IMAGE_ES \ $KIBANA_EXT_PORT \ $SINGA_AUTO_IMAGE_SPARKAPP \ $SPAEK_DOCKER_JARS_PATH \ -$ES_DOCKER_WORKDIR_PATH +$ES_DOCKER_WORKDIR_PATH \ +$DB_PATH_ON_MASTER \ No newline at end of file diff --git a/scripts/kubernetes/start_stolon.sh b/scripts/kubernetes/start_stolon.sh index 9b32bf8e..d8364c57 100644 --- a/scripts/kubernetes/start_stolon.sh +++ b/scripts/kubernetes/start_stolon.sh @@ -31,8 +31,18 @@ fi echo "Create PV..." # With stolon, we use some default parameters to make nfs as pv, if your have another choice or want to change the default parameters, # your should modify this script -bash $HOST_WORKDIR_PATH/scripts/kubernetes/create_nfs_pv.sh database-pv-0 $NFS_HOST_IP /home/singa_auto/database/db0 100Gi ReadWriteOnce Retain pv database-pv-0 -bash $HOST_WORKDIR_PATH/scripts/kubernetes/create_nfs_pv.sh database-pv-1 $NFS_HOST_IP /home/singa_auto/database/db1 100Gi ReadWriteOnce Retain pv database-pv-1 + +if [ ! -d $DB_PATH_ON_MASTER ]; then + echo "create database folder" + mkdir -p $DB_PATH_ON_MASTER/database/db0 + mkdir -p $DB_PATH_ON_MASTER/database/db1 + mkdir -p $DB_PATH_ON_MASTER/$DATA_DIR_PATH + mkdir -p $DB_PATH_ON_MASTER/$LOGS_DIR_PATH + mkdir -p $DB_PATH_ON_MASTER/$PARAMS_DIR_PATH +fi + +bash $HOST_WORKDIR_PATH/scripts/kubernetes/create_nfs_pv.sh database-pv-0 $NFS_HOST_IP $DB_PATH_ON_MASTER/database/db0 100Gi ReadWriteOnce Retain pv database-pv-0 +bash $HOST_WORKDIR_PATH/scripts/kubernetes/create_nfs_pv.sh database-pv-1 $NFS_HOST_IP $DB_PATH_ON_MASTER/database/db1 100Gi ReadWriteOnce Retain pv database-pv-1 echo "Create PVC..." 
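+# The positional arguments to create_nfs_pvc.sh below appear to be, in order:
+# PVC name, requested capacity, access mode, storage class, and the PV to bind
+# to (mirroring the create_nfs_pv.sh calls above); check those scripts before
+# changing the order.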
# PVC Name is Fixed bash $HOST_WORKDIR_PATH/scripts/kubernetes/create_nfs_pvc.sh database-stolon-keeper-0 100Gi ReadWriteOnce pv database-pv-0 diff --git a/scripts/kubernetes/stolon/generate_stolon_yaml.sh b/scripts/kubernetes/stolon/generate_stolon_yaml.sh index f98f6c76..f557cfc1 100644 --- a/scripts/kubernetes/stolon/generate_stolon_yaml.sh +++ b/scripts/kubernetes/stolon/generate_stolon_yaml.sh @@ -1,22 +1,22 @@ -STOLON_PATH=$HOST_WORKDIR_PATH/scripts/kubernetes/stolon -cp -f $STOLON_PATH/secret.yaml.template $STOLON_PATH/secret.yaml -sed -ri "s/STOLON_PASSWD/$POSTGRES_STOLON_PASSWD/g" $STOLON_PATH/secret.yaml - -# replace config for stolon keeper -cp -f $STOLON_PATH/stolon-keeper.yaml.template $STOLON_PATH/stolon-keeper.yaml -sed -ri "s#SINGA_AUTO_IMAGE_STOLON#$SINGA_AUTO_IMAGE_STOLON#" $STOLON_PATH/stolon-keeper.yaml -sed -ri "s/POSTGRES_PORT/$POSTGRES_PORT/g" $STOLON_PATH/stolon-keeper.yaml - -# replace config for stolon proxy -cp -f $STOLON_PATH/stolon-proxy.yaml.template $STOLON_PATH/stolon-proxy.yaml -sed -ri "s#SINGA_AUTO_IMAGE_STOLON#$SINGA_AUTO_IMAGE_STOLON#" $STOLON_PATH/stolon-proxy.yaml -sed -ri "s/POSTGRES_PORT/$POSTGRES_PORT/g" $STOLON_PATH/stolon-proxy.yaml - -# replace config for stolon sentinel -cp -f $STOLON_PATH/stolon-sentinel.yaml.template $STOLON_PATH/stolon-sentinel.yaml -sed -ri "s#RAFIKI_IMAGE_STOLON#$RAFIKI_IMAGE_STOLON#" $STOLON_PATH/stolon-sentinel.yaml - -# replace config for stolon proxy service -cp -f $STOLON_PATH/stolon-proxy-service.yaml.template $STOLON_PATH/stolon-proxy-service.yaml -sed -ri "s/POSTGRES_EXT_PORT/$POSTGRES_EXT_PORT/g" $STOLON_PATH/stolon-proxy-service.yaml -sed -ri "s/POSTGRES_PORT/$POSTGRES_PORT/g" $STOLON_PATH/stolon-proxy-service.yaml +STOLON_PATH=scripts/kubernetes/stolon +cp -f $STOLON_PATH/secret.yaml.template $STOLON_PATH/secret.yaml +sed -ri "s/STOLON_PASSWD/$POSTGRES_STOLON_PASSWD/g" $STOLON_PATH/secret.yaml + +# replace config for stolon keeper +cp -f $STOLON_PATH/stolon-keeper.yaml.template $STOLON_PATH/stolon-keeper.yaml +sed -ri "s#SINGA_AUTO_IMAGE_STOLON#$SINGA_AUTO_IMAGE_STOLON#" $STOLON_PATH/stolon-keeper.yaml +sed -ri "s/POSTGRES_PORT/$POSTGRES_PORT/g" $STOLON_PATH/stolon-keeper.yaml + +# replace config for stolon proxy +cp -f $STOLON_PATH/stolon-proxy.yaml.template $STOLON_PATH/stolon-proxy.yaml +sed -ri "s#SINGA_AUTO_IMAGE_STOLON#$SINGA_AUTO_IMAGE_STOLON#" $STOLON_PATH/stolon-proxy.yaml +sed -ri "s/POSTGRES_PORT/$POSTGRES_PORT/g" $STOLON_PATH/stolon-proxy.yaml + +# replace config for stolon sentinel +cp -f $STOLON_PATH/stolon-sentinel.yaml.template $STOLON_PATH/stolon-sentinel.yaml +sed -ri "s#SINGA_AUTO_IMAGE_STOLON#$SINGA_AUTO_IMAGE_STOLON#" $STOLON_PATH/stolon-sentinel.yaml + +# replace config for stolon proxy service +cp -f $STOLON_PATH/stolon-proxy-service.yaml.template $STOLON_PATH/stolon-proxy-service.yaml +sed -ri "s/POSTGRES_EXT_PORT/$POSTGRES_EXT_PORT/g" $STOLON_PATH/stolon-proxy-service.yaml +sed -ri "s/POSTGRES_PORT/$POSTGRES_PORT/g" $STOLON_PATH/stolon-proxy-service.yaml diff --git a/scripts/kubernetes/stolon/stolon-keeper.yaml.template b/scripts/kubernetes/stolon/stolon-keeper.yaml.template index 004f4c29..ddac6f7d 100644 --- a/scripts/kubernetes/stolon/stolon-keeper.yaml.template +++ b/scripts/kubernetes/stolon/stolon-keeper.yaml.template @@ -21,7 +21,7 @@ spec: terminationGracePeriodSeconds: 10 containers: - name: stolon-keeper - image: RAFIKI_IMAGE_STOLON + image: SINGA_AUTO_IMAGE_STOLON command: - "/bin/bash" - "-ec" diff --git a/scripts/kubernetes/stolon/stolon-sentinel.yaml.template 
b/scripts/kubernetes/stolon/stolon-sentinel.yaml.template index cd3e9ff1..fdaace30 100644 --- a/scripts/kubernetes/stolon/stolon-sentinel.yaml.template +++ b/scripts/kubernetes/stolon/stolon-sentinel.yaml.template @@ -15,7 +15,7 @@ spec: spec: containers: - name: stolon-sentinel - image: RAFIKI_IMAGE_STOLON + image: SINGA_AUTO_IMAGE_STOLON command: - "/bin/bash" - "-ec" diff --git a/scripts/kubernetes/stop.sh b/scripts/kubernetes/stop.sh index 2ffe8d72..6270f58b 100644 --- a/scripts/kubernetes/stop.sh +++ b/scripts/kubernetes/stop.sh @@ -83,7 +83,7 @@ then else -# kubectl delete -f $HOST_WORKDIR_PATH/scripts/kubernetes/nvidia-device-plugin.yml +# kubectl delete -f $HOST_WORKDIR_PATH/scripts/kubernetes/yaml/nvidia-device-plugin.yml title "Stopping any existing jobs..." python $HOST_WORKDIR_PATH/scripts/stop_all_jobs.py diff --git a/scripts/start_worker.py b/scripts/start_worker.py index a1f51493..fec1930e 100644 --- a/scripts/start_worker.py +++ b/scripts/start_worker.py @@ -1,71 +1,70 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import os - - -# Run install command -install_command = os.environ.get('WORKER_INSTALL_COMMAND', '') - -for i in range(10): - exit_code = os.system(install_command) - if exit_code != 0: - print('Install command gave non-zero exit code: "{}"'.format(install_command)) - import time - time.sleep(3) - else: - break -else: - raise Exception( - 'Install command gave non-zero exit code: "{}"'.format(install_command)) - -worker = None - -from singa_auto.constants import ServiceType -from singa_auto.utils.service import run_worker -from singa_auto.meta_store import MetaStore - - -def start_worker(service_id, service_type, container_id): - global worker - - if service_type == ServiceType.TRAIN: - from singa_auto.worker.train import TrainWorker - worker = TrainWorker(service_id, container_id) - worker.start() - elif service_type == ServiceType.INFERENCE: - from singa_auto.worker.inference import InferenceWorker - worker = InferenceWorker(service_id, container_id) - worker.start() - elif service_type == ServiceType.ADVISOR: - from singa_auto.worker.advisor import AdvisorWorker - worker = AdvisorWorker(service_id) - worker.start() - else: - raise Exception('Invalid service type: {}'.format(service_type)) - - -def stop_worker(): - global worker - if worker is not None: - worker.stop() - - -meta_store = MetaStore() -run_worker(meta_store, start_worker, stop_worker) +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +import os + +from singa_auto.constants import ServiceType +from singa_auto.utils.service import run_worker +from singa_auto.meta_store import MetaStore + +# Run install command +install_command = os.environ.get('WORKER_INSTALL_COMMAND', '') + +for i in range(10): + exit_code = os.system(install_command) + if exit_code != 0: + print('Install command gave non-zero exit code: "{}"'.format(install_command)) + import time + time.sleep(3) + else: + break +else: + raise Exception( + 'Install command gave non-zero exit code: "{}"'.format(install_command)) + +worker = None + + +def start_worker(service_id, service_type, container_id): + global worker + + if service_type == ServiceType.TRAIN: + from singa_auto.worker.train import TrainWorker + worker = TrainWorker(service_id, container_id) + worker.start() + elif service_type == ServiceType.INFERENCE: + from singa_auto.worker.inference import InferenceWorker + worker = InferenceWorker(service_id, container_id) + worker.start() + elif service_type == ServiceType.ADVISOR: + from singa_auto.worker.advisor import AdvisorWorker + worker = AdvisorWorker(service_id) + worker.start() + else: + raise Exception('Invalid service type: {}'.format(service_type)) + + +def stop_worker(): + global worker + if worker is not None: + worker.stop() + + +meta_store = MetaStore() +run_worker(meta_store, start_worker, stop_worker) diff --git a/singa_auto/admin/requirements.txt b/singa_auto/admin/requirements.txt index 81a833c7..2e868415 100644 --- a/singa_auto/admin/requirements.txt +++ b/singa_auto/admin/requirements.txt @@ -1,4 +1,4 @@ bcrypt>=3.1.4 Flask==1.0.2 Flask-Cors==3.0.6 -kubernetes==10.0.0 +kubernetes==10.0.1 diff --git a/singa_auto/admin/services_manager.py b/singa_auto/admin/services_manager.py index 5f32530c..8b07637c 100644 --- a/singa_auto/admin/services_manager.py +++ b/singa_auto/admin/services_manager.py @@ -16,6 +16,7 @@ # specific language governing permissions and limitations # under the License. 
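+# When a service is given GPUs, the worker Docker image is now picked from the
+# model's declared framework dependency (see _get_docker_image_by_dependency
+# below): torch 0.4/1.0/1.1 or tensorflow 1.5-1.12 map to the *_cu90 image,
+# torch 1.2-1.4 or tensorflow 1.13-2.0 to *_cu100, torch 1.5-1.8 or
+# tensorflow 2.1-2.3 to *_cu101, and tensorflow 2.4 to *_cu110; CPU-only
+# services and models without a torch/tensorflow dependency keep the model's
+# original docker_image.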
# + import json import os import logging @@ -45,9 +46,10 @@ class ServiceDeploymentError(Exception): 'SUPERADMIN_PASSWORD', 'REDIS_HOST', 'REDIS_PORT', 'REDIS_PASSWORD', 'ADMIN_HOST', 'ADMIN_PORT', - 'DATA_DIR_PATH', 'LOGS_DIR_PATH', 'PARAMS_DIR_PATH', + 'DATA_DIR_PATH', 'LOGS_DIR_PATH', 'PARAMS_DIR_PATH', 'DB_PATH_ON_MASTER', 'KAFKA_HOST', 'KAFKA_PORT', - + 'CONTAINER_MODE', 'KUBERNETES_ADVERTISE_ADDR', + 'KUBERNETES_INNER_NETWORK_RANGE' ] DEFAULT_TRAIN_GPU_COUNT = 0 @@ -84,6 +86,7 @@ def __init__(self, self._data_dir_path = os.environ['DATA_DIR_PATH'] self._logs_dir_path = os.environ['LOGS_DIR_PATH'] self._params_dir_path = os.environ['PARAMS_DIR_PATH'] + self._params_root_path = os.environ['DB_PATH_ON_MASTER'] self._host_workdir_path = os.environ['HOST_WORKDIR_PATH'] self._docker_workdir_path = os.environ['DOCKER_WORKDIR_PATH'] self._predictor_image = f"{os.environ['SINGA_AUTO_IMAGE_PREDICTOR']}:{version}" @@ -204,13 +207,17 @@ def create_train_services(self, train_job_id): # Create advisor self._create_advisor(sub_train_job) - # 1 GPU per worker - for _ in range(gpus): - self._create_train_job_worker(sub_train_job, dist_workers=dist_workers) + # # 1 GPU per worker + # for gpu_idx in range(gpus): + # self._create_train_job_worker(sub_train_job, dist_workers=dist_workers) + # logger.info("gpu idx {} created".format(gpu_idx)) - # CPU workers - for _ in range(cpus): - self._create_train_job_worker(sub_train_job, dist_workers=dist_workers, gpus=0) + # # CPU workers + # for cpu_idx in range(cpus): + # self._create_train_job_worker(sub_train_job, dist_workers=dist_workers, gpus=0) + # logger.info("cpu idx {} created".format(cpu_idx)) + + self._create_train_job_worker(sub_train_job, dist_workers=dist_workers, gpus=gpus) return train_job @@ -380,7 +387,7 @@ def _create_inference_job_worker(self, } service = self._create_service(service_type=service_type, - docker_image=model.docker_image, + docker_image=self._get_docker_image_by_dependency(model.docker_image, gpus, model.dependencies), environment_vars=environment_vars, gpus=gpus) @@ -411,6 +418,33 @@ def _create_predictor(self, inference_job, inferenceAppName: str): return service + def _get_docker_image_by_dependency(self, docker_image_name, gpus, dependencies): + logger.info("docker_image_name is {} with type {}".format(docker_image_name, type(docker_image_name))) + if gpus == 0: + selected_image_name = docker_image_name + elif "torch" in dependencies: + torch_dependency = '.'.join(dependencies["torch"].split('.')[:2]) + if torch_dependency in ["0.4", "1.0", "1.1"]: + selected_image_name = docker_image_name.replace("singa_auto_worker", "singa_auto_worker_cu90") + elif torch_dependency in ["1.2", "1.3", "1.4"]: + selected_image_name = docker_image_name.replace("singa_auto_worker", "singa_auto_worker_cu100") + elif torch_dependency in ["1.5", "1.6", "1.7", "1.8"]: + selected_image_name = docker_image_name.replace("singa_auto_worker", "singa_auto_worker_cu101") + elif "tensorflow" in dependencies: + tf_dependency = '.'.join(dependencies["tensorflow"].split('.')[:2]) + if tf_dependency in ["1.5", "1.6", "1.7", "1.8", "1.9", "1.10", "1.11", "1.12"]: + selected_image_name = docker_image_name.replace("singa_auto_worker", "singa_auto_worker_cu90") + elif tf_dependency in ["1.13", "1.14", "1.15", "2.0"]: + selected_image_name = docker_image_name.replace("singa_auto_worker", "singa_auto_worker_cu100") + elif tf_dependency in ["2.1", "2.2", "2.3"]: + selected_image_name = docker_image_name.replace("singa_auto_worker", "singa_auto_worker_cu101") + elif 
tf_dependency in ["2.4"]: + selected_image_name = docker_image_name.replace("singa_auto_worker", "singa_auto_worker_cu110") + else: + selected_image_name = docker_image_name + + return selected_image_name + def _create_train_job_worker(self, sub_train_job, dist_workers=0, gpus=1): model = self._meta_store.get_model(sub_train_job.model_id) service_type = ServiceType.TRAIN @@ -421,7 +455,7 @@ def _create_train_job_worker(self, sub_train_job, dist_workers=0, gpus=1): } service = self._create_service(service_type=service_type, - docker_image=model.docker_image, + docker_image=self._get_docker_image_by_dependency(model.docker_image, gpus, model.dependencies), environment_vars=environment_vars, gpus=gpus, dist_workers=dist_workers) @@ -507,20 +541,22 @@ def _create_service(self, } if self._app_mode == 'DEV': - # Mount whole root directory - mounts = {self._host_workdir_path: self._docker_workdir_path} - else: - # Mount only data, logs and params folders to containers' work directories mounts = { os.path.join(self._host_workdir_path, self._data_dir_path): - os.path.join(self._docker_workdir_path, - self._data_dir_path), + os.path.join(self._docker_workdir_path, self._data_dir_path), os.path.join(self._host_workdir_path, self._logs_dir_path): - os.path.join(self._docker_workdir_path, - self._logs_dir_path), + os.path.join(self._docker_workdir_path, self._logs_dir_path), os.path.join(self._host_workdir_path, self._params_dir_path): - os.path.join(self._docker_workdir_path, - self._params_dir_path) + os.path.join(self._docker_workdir_path, self._params_dir_path) + } + else: + mounts = { + os.path.join(self._params_root_path, self._data_dir_path): + os.path.join(self._docker_workdir_path, self._data_dir_path), + os.path.join(self._params_root_path, self._logs_dir_path): + os.path.join(self._docker_workdir_path, self._logs_dir_path), + os.path.join(self._params_root_path, self._params_dir_path): + os.path.join(self._docker_workdir_path, self._params_dir_path) } # Expose container port if it exists @@ -541,8 +577,32 @@ def _create_service(self, container_service_name = '{}-{}-{}'.format( service_app_name, service_type.lower(), service.id.split('-')[0]) + if service_type in ["TRAIN", "INFERENCE"]: + gpu_allocated = dict() + gpu_in_use_by_train = self._meta_store.get_services(service_type = "TRAIN") + for service_info in gpu_in_use_by_train: + if service_info.status in ["RUNNING", "DEPLOYING"] and "default" != service_info.container_service_info["node_id"]: + gpu_node_name = service_info.container_service_info["node_id"] + if gpu_node_name not in gpu_allocated: + gpu_allocated[gpu_node_name] = [] + gpu_list = (service_info.container_service_info["gpu_list"] if "gpu_list" in service_info.container_service_info else "").split(',') + for gpu_idx in gpu_list: + gpu_allocated[gpu_node_name].append(gpu_idx.strip()) + + gpu_in_use_by_inference = self._meta_store.get_services(service_type = "INFERENCE") + for service_info in gpu_in_use_by_inference: + if service_info.status in ["RUNNING", "DEPLOYING"] and "default" != service_info.container_service_info["node_id"]: + gpu_node_name = service_info.container_service_info["node_id"] + if gpu_node_name not in gpu_allocated: + gpu_allocated[gpu_node_name] = [] + gpu_list = (service_info.container_service_info["gpu_list"] if "gpu_list" in service_info.container_service_info else "").split(',') + gpu_allocated[gpu_node_name].append(gpu_list) + else: + gpu_allocated=None + container_service = self._container_manager.create_service( service_name=container_service_name, + 
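+            # `gpu_allocated` (assembled above) maps a node id to the GPU indices
+            # already claimed by RUNNING/DEPLOYING train and inference services on
+            # that node, e.g. {'node-1': ['0', '1']} (illustrative values); it is
+            # None for service types other than TRAIN/INFERENCE. It is passed to
+            # the container manager below, presumably so newly scheduled services
+            # avoid double-allocating those devices.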
service_type=service_type, docker_image=docker_image, replicas=replicas, args=args, @@ -550,7 +610,8 @@ def _create_service(self, mounts=mounts, publish_port=publish_port, gpus=gpus, - dist_workers=dist_workers) + dist_workers=dist_workers, + gpu_allocated=gpu_allocated) self._meta_store.mark_service_as_deploying( service, @@ -573,8 +634,7 @@ def _create_service(self, service_port=int(self._predictor_port)) self._container_manager.update_ingress(ingress_name=_ingress_name, - ingress_body=json.loads(ingress_info.ingress_body) - ) + ingress_body=json.loads(ingress_info.ingress_body)) self._meta_store.commit() diff --git a/singa_auto/advisor/requirements.txt b/singa_auto/advisor/requirements.txt index bdfb7dfa..88c51e2c 100644 --- a/singa_auto/advisor/requirements.txt +++ b/singa_auto/advisor/requirements.txt @@ -1,3 +1,2 @@ scikit-learn==0.22 scikit-optimize==0.5.2 -tensorflow==1.15.5 diff --git a/singa_auto/client/client.py b/singa_auto/client/client.py index c5adf77d..b46d6970 100644 --- a/singa_auto/client/client.py +++ b/singa_auto/client/client.py @@ -1,862 +1,862 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import requests -import json -import pickle -import os -from functools import wraps -from typing import Type, Dict, List, Any - -from singa_auto.constants import ModelAccessRight, ModelDependencies, Budget, BudgetOption, \ - InferenceBudget, InferenceBudgetOption, UserType, ModelType -from singa_auto.model import Params, BaseModel -from singa_auto.error_code import generate_error - - -class SingaAutoConnectionError(ConnectionError): - pass - - -DOCS_URL = 'https://nginyc.github.io/rafiki/docs/latest/docs/src/python/rafiki.client.Client.html' - -def rafiki_response_handler(resp): - # if isinstance(resp, dict): - # if resp.get('success', 0) == 0: - # return resp['data'] - # else: - # raise generate_error(resp.get('error_code', 500)) - # else: - # return resp - return resp - -# Returns a decorator that warns user about the method being deprecated -def _deprecated(msg=None): - - def deco(func): - nonlocal msg - msg = msg or f'`{func.__name__}` has been deprecated.' - - @wraps(func) - def deprecated_func(*args, **kwargs): - _warn(f'{msg}\n' \ - f'Refer to the updated documentation at {DOCS_URL}') - return func(*args, **kwargs) - - return deprecated_func - - return deco - - -class Client: - ''' - Initializes the Client to connect to a running - SINGA-Auto Admin instance that the Client connects to. 
- - :param admin_host: Host of SINGA-Auto Admin - :param admin_port: Port of SINGA-Auto Admin - ''' - - def __init__(self, - admin_host: str = os.environ.get('SINGA_AUTO_ADDR', - 'localhost'), - admin_port: int = os.environ.get('ADMIN_EXT_PORT', 3000)): - self._admin_host = admin_host - self._admin_port = admin_port - self._token = None - self._user = None - - def login(self, email: str, password: str) -> Dict[str, Any]: - ''' - Creates a login session as a SINGA-Auto user. You will have to be logged in to perform any actions. - - App developers can create, list and stop train and inference jobs, as well as list models. - Model developers can create and list models. - - The login session (the session token) expires in 1 hour. - - :param email: User's email - :param password: User's password - - :returns: Logged-in user as dictionary - ''' - data = self._post('/tokens', - json={ - 'email': email, - 'password': password - }) - self._token = data['token'] - - # Save user's data - self._user = {'id': data['user_id'], 'user_type': data['user_type']} - - return self._user - - def get_current_user(self) -> Dict[str, Any]: - ''' - Gets currently logged in user's data. - - :returns: Current user as dictionary, or ``None`` if client is not logged in - ''' - return self._user - - def logout(self): - ''' - Clears the current login session. - ''' - self._token = None - self._user = None - - #################################### - # User - #################################### - - def create_user(self, email: str, password: str, - user_type: UserType) -> Dict[str, Any]: - ''' - Creates a SINGA-Auto user. - - Only admins can create users (except for admins). - Only superadmins can create admins. - - :param email: The new user's email - :param password: The new user's password - :param user_type: The new user's type - - :returns: Created user as dictionary - ''' - data = self._post('/users', - json={ - 'email': email, - 'password': password, - 'user_type': user_type - }) - return data - - @_deprecated('`create_users` has been removed') - def create_users(self, *args, **kwargs): - pass - - def get_users(self) -> List[Dict[str, Any]]: - ''' - Lists all SINGA-Auto users. - - Only admins can list all users. - - :returns: List of users as list of dictionaries - ''' - data = self._get('/users') - return data - - def ban_user(self, email: str) -> Dict[str, Any]: - ''' - Bans a SINGA-Auto user, disallowing logins. - - This action is irrevisible. - Only admins can ban users (except for admins). - Only superadmins can ban admins. - - :param email: The user's email - - :returns: Banned user as dictionary - ''' - data = self._delete('/users', json={'email': email}) - return data - - #################################### - # Datasets - #################################### - - def create_dataset(self, - name: str, - task: str, - dataset_path: str = None, - dataset_url: str = None) -> Dict[str, Any]: - ''' - Creates a dataset on SINGA-Auto, either by uploading the dataset file from your filesystem or specifying a URL where the dataset file can be downloaded. - The dataset should be in a format specified by the task - Either `dataset_url` or `dataset_path` should be specified. - - Only admins, model developers and app developers can manage their own datasets. 
- - :param name: Name for the dataset, does not need to be unique - :param task: Task associated to the dataset - :param dataset_path: Path to the dataset file to upload from the local filesystem - :param dataset_url: Publicly accessible URL where the dataset file can be downloaded - :returns: Created dataset as dictionary - ''' - - dataset = dict() - - form_data = {'name': name, 'task': task, 'dataset_url': dataset_url} - - if dataset_path is not None: - dataset = { - 'dataset': ('dataset', open(dataset_path, - 'rb'), 'application/zip') - } - else: - print( - 'Waiting for server finish downloading the dataset from URL...') - - data = self._post_stream(path='/datasets', - files=dataset, - form_data=form_data) - - return data - - def get_datasets(self, task: str = None) -> List[Dict[str, Any]]: - ''' - Lists all datasets owned by the current user, optionally filtering by task. - - :param task: Task name - :returns: List of datasets as list of dictionaries - ''' - data = self._get('/datasets', params={'task': task}) - return data - - #################################### - # Models - #################################### - - def create_model(self, - name: str, - task: str, - model_file_path: str, - model_class: str, - model_preload_file_path: str = None, - dependencies: ModelDependencies = None, - access_right: ModelAccessRight = ModelAccessRight.PRIVATE, - docker_image: str = None, - model_type: str = ModelType.PYTHON_FILE, - model_file_name: str = None, - model_description: str = None) -> Dict[str, Any]: - ''' - Creates a model on SINGA-Auto. - - Only admins & model developers can manage models. - - :param name: Name of the model, which must be unique across all models added by the current user - :param task: Task associated with the model, where the model must adhere to the specification of the task - :param model_file_path: Path to a single Python file that contains the definition for the model class - :param model_class: The name of the model class inside the Python file. This class should implement :class:`singa_auto.model.BaseModel` - :param dependencies: List of Python dependencies & their versions - :param access_right: Model access right - :param model_preload_file_path: pretrained mdoel file - :param docker_image: A custom Docker image that extends ``singa_auto/singa_auto_worker``, publicly available on Docker Hub. - :returns: Created model as dictionary - - Refer to :ref:`model-development` for more details on how to write & test models for SINGA-Auto. - - ``model_file_path`` should point to a *single* file that contains all necessary Python code for the model's implementation. - If the Python file imports any external Python modules, you should list it in ``dependencies`` or create a custom - ``docker_image``. - - If a model's ``access_right`` is set to ``PUBLIC``, this model will be publicly available to all other users on SINGA-Auto for training - and inference. By default, a model's access is ``PRIVATE``. - - ``dependencies`` should be a dictionary of ``{ : }``, where - ```` corresponds to the name of the Python Package Index (PyPI) package (e.g. ``tensorflow``) - and ```` corresponds to the version of the PyPI package (e.g. ``1.12.0``). - Refer to :ref:`configuring-model-environment` to understand more about this option. 
- ''' - - model_files = { - 'model_file_bytes': (model_file_path, open(model_file_path, 'rb'), - 'application/octet-stream') - } - pretrained_files = {} - - if model_preload_file_path is not None: - pretrained_files = {'checkpoint_id': ( - model_preload_file_path, - open(model_preload_file_path, 'rb'), - 'application/octet-stream')} - - files = {**model_files, **pretrained_files} - - form_data = { - 'name': name, - 'task': task, - 'dependencies': json.dumps(dependencies), - 'docker_image': docker_image, - 'model_class': model_class, - 'access_right': access_right, - 'model_type': model_type, - 'model_file_name': model_file_name, - 'model_description': model_description - } - - data = self._post_stream(path='/models', - files=files, - form_data=form_data) - - return data - - def get_model(self, model_id: str) -> Dict[str, Any]: - ''' - Retrieves details of a single model. - - Model developers can only view their own models. - - :param model_id: ID of model - :returns: Model as dictionary - ''' - _note('`get_model` now requires `model_id` instead of `name`') - - data = self._get('/models/{}'.format(model_id)) - return data - - def download_model_file(self, model_id: str, - out_model_file_path: str) -> Dict[str, any]: - ''' - Downloads the Python model class file for the SINGA-Auto model. - - Model developers can only download their own models. - - :param model_id: ID of model - :param out_model_file_path: Absolute/relative path to save model class file to - :returns: Model as dictionary - ''' - _note('`download_model_file` now requires `model_id` instead of `name`') - - model_file_bytes = self._get('/models/{}/model_file'.format(model_id)) - - with open(out_model_file_path, 'wb') as f: - f.write(model_file_bytes) - - data = self.get_model(model_id) - dependencies = data.get('dependencies') - model_class = data.get('model_class') - - print('Model file downloaded to "{}"!'.format( - os.path.join(os.getcwd(), out_model_file_path))) - - if dependencies: - print( - 'You\'ll need to install the following model dependencies locally: {}' - .format(dependencies)) - - print('From the file, import the model class `{}`.'.format(model_class)) - - return data - - @_deprecated( - '`get_models` & `get_models_of_task` have been combined into `get_available_models`' - ) - def get_models(self, *args, **kwargs): - pass - - @_deprecated( - '`get_models` & `get_models_of_task` have been combined into `get_available_models`' - ) - def get_models_of_task(self, *args, **kwargs): - pass - - def get_available_models(self, task: str = None) -> List[Dict[str, Any]]: - ''' - Lists all SINGA-Auto models available to the current user, optionally filtering by task. - - :param task: Task name - :returns: Available models as list of dictionaries - ''' - data = self._get('/models/available', params={'task': task}) - return data - - def delete_model(self, model_id: str) -> Dict[str, Any]: - ''' - Deletes a single model. Models that have been used in train jobs cannot be deleted. - - Model developers can only delete their own models. 
- - :param str model_id: ID of model - :returns: Deleted model as dictionary - ''' - data = self._delete('/models/{}'.format(model_id)) - return data - - #################################### - # Train Jobs - #################################### - - def create_train_job(self, - app: str, - task: str, - train_dataset_id: str, - val_dataset_id: str, - budget: Budget, - annotation_dataset_id: str = None, - models: List[str] = None, - train_args: Dict[str, any] = None) -> Dict[str, Any]: - ''' - Creates and starts a train job on SINGA-Auto. - - A train job is uniquely identified by user, its associated app, and the app version (returned in output). - - Only admins, model developers & app developers can manage train jobs. Model developers & app developers can only manage their own train jobs. - - :param app: Name of the app associated with the train job - :param task: Task associated with the train job, - the train job will train models associated with the task - :param train_dataset_id: ID of the train dataset, previously created on SINGA-Auto - :param val_dataset_id: ID of the validation dataset, previously created on SINGA-Auto - :param budget: Budget for train job - :param models: List of IDs of model to use for train job. Defaults to all available models - :param train_args: Additional arguments to pass to models during training, if any. - Refer to the task's specification for appropriate arguments - :returns: Created train job as dictionary - - If ``models`` is unspecified, all models accessible to the user for the specified task will be used. - - ``budget`` should be a dictionary of ``{ : }``, where - ```` is one of :class:`singa_auto.constants.BudgetOption` and - ```` specifies the amount for the associated budget option. - - The following describes the budget options available: - - ===================== ===================== - **Budget Option** **Description** - --------------------- --------------------- - ``TIME_HOURS`` Max no. of hours to train (soft target). Defaults to 0.1. - ``GPU_COUNT`` No. of GPUs to allocate for training, across all models. Defaults to 0. - ``MODEL_TRIAL_COUNT`` Max no. of trials to conduct for each model (soft target). -1 for unlimited. Defaults to -1. - ===================== ===================== - ''' - _note( - '`create_train_job` now requires `models` as a list of model IDs instead of a list of model names' - ) - - if 'ENABLE_GPU' in budget: - _warn('The `ENABLE_GPU` option has been changed to `GPU_COUNT`') - - # Have defaults for budget - budget = { - BudgetOption.TIME_HOURS: 0.1, - BudgetOption.GPU_COUNT: 0, - **budget - } - - postJSON = { - 'app': app, - 'task': task, - 'train_dataset_id': train_dataset_id, - 'val_dataset_id': val_dataset_id, - 'budget': budget, - } - if train_args: - postJSON['train_args'] = train_args - if models: - postJSON['model_ids'] = models - if annotation_dataset_id: - postJSON['annotation_dataset_id'] = annotation_dataset_id - - print("postJSON: ", postJSON) - # print will show up in docker exec terminal - - data = self._post('/train_jobs', json=postJSON) - return data - - def get_train_jobs_by_user(self, user_id: str) -> List[Dict[str, Any]]: - ''' - Lists all of user's train jobs on SINGA-Auto. - - :param user_id: ID of the user - :returns: Train jobs as list of dictionaries - ''' - data = self._get('/train_jobs', params={'user_id': user_id}) - return data - - def get_train_jobs_of_app(self, app: str) -> List[Dict[str, Any]]: - ''' - Lists all of current user's train jobs associated to the app name on SINGA-Auto. 
- - :param app: Name of the app - :returns: Train jobs as list of dictionaries - ''' - data = self._get('/train_jobs/app', params={'app': app}) - return data - - def get_train_job(self, app: str, app_version: int = -1) -> Dict[str, Any]: - ''' - Retrieves details of the current user's train job identified by an app and an app version, - including workers' details. - - :param app: Name of the app - :param app_version: Version of the app (-1 for latest version) - :returns: Train job as dictionary - ''' - data = self._get('/train_jobs/app/app_version', params={'app': app, 'app_version': app_version}) - return data - - def stop_train_job(self, app: str, app_version: int = -1) -> Dict[str, Any]: - ''' - Prematurely stops the current user's train job identified by an app and an app version. - Otherwise, the train job should stop by itself when its budget is reached. - - :param app: Name of the app - :param app_version: Version of the app (-1 for latest version) - :returns: Stopped train job as dictionary - ''' - data = self._post('/train_jobs/app/app_version/stop', json={'app': app, 'app_version': app_version}) - return data - - #################################### - # Trials - #################################### - - def get_trial(self, trial_id: str) -> Dict[str, Any]: - ''' - Gets a specific trial. - - :param trial_id: ID of trial - :returns: Trial as dictionary - ''' - data = self._get('/trials/{}'.format(trial_id)) - return data - - def get_best_trials_of_train_job( - self, - app: str, - app_version: int = -1, - max_count: int = 2) -> List[Dict[str, Any]]: - ''' - Lists the best scoring trials of the current user's train job identified by an app and an app version, - ordered by descending score. - - :param app: Name of the app - :param app_version: Version of the app (-1 for latest version) - :param max_count: Maximum number of trials to return - :returns: Trials as list of dictionaries - ''' - data = self._get('/train_jobs/{}/{}/trials'.format(app, app_version), - params={ - 'type': 'best', - 'max_count': max_count - }) - return data - - def get_trials_of_train_job(self, - app: str, - app_version: int = -1) -> List[Dict[str, Any]]: - ''' - Lists all trials of the current user's train job identified by an app and an app version, - ordered by when the trial started. - - :param app: Name of the app - :param app_version: Version of the app (-1 for latest version) - :returns: Trials as list of dictionaries - ''' - data = self._get('/train_jobs/app/app_version/trials', params={'app': app, 'app_version': app_version}) - return data - - def get_trial_logs(self, trial_id: str) -> Dict[str, Any]: - ''' - Gets the logs for a specific trial. - - :param trial_id: ID of trial - :returns: Logs of trial as dictionary - ''' - data = self._get('/trials/{}/logs'.format(trial_id)) - return data - - def get_trial_parameters(self, trial_id: str) -> Params: - ''' - Gets parameters of the model associated with the trial. The trial's model parameters must have been saved. - - :param trial_id: ID of trial - :returns: Parameters of the *trained* model associated with the trial - ''' - data = self._get('/trials/{}/parameters'.format(trial_id)) - parameters = pickle.loads(data) - return parameters - - def load_trial_model(self, trial_id: str, - ModelClass: Type[BaseModel]) -> BaseModel: - ''' - Loads an instance of a trial's model with the trial's knobs & parameters. 
- - Before this, you must have the trial's model class file already in your local filesystem, - the dependencies of the model must have been installed separately, and the model class must have been - imported and passed into this method. - - Wraps :meth:`get_trial_parameters` and :meth:`get_trial`. - - :param trial_id: ID of trial - :param ModelClass: model class that conincides with the trial's model class - :returns: A *trained* model instance of ``ModelClass``, loaded with the trial's knobs and parameters - ''' - data = self.get_trial(trial_id) - assert 'proposal' in data - knobs = data['proposal']['knobs'] - parameters = self.get_trial_parameters(trial_id) - model_inst = ModelClass(**knobs) - model_inst.load_parameters(parameters) - return model_inst - - #################################### - # Inference Jobs - #################################### - - def create_inference_job(self, - app: str, - app_version: int = -1, - budget: InferenceBudget = None, - description: str = None) -> Dict[str, Any]: - ''' - Creates and starts a inference job on SINGA-Auto with the best-scoring trials of the associated train job. - The train job must have the status of ``STOPPED``.The inference job would be tagged with the train job's app and app version. - Throws an error if an inference job of the same train job is already running. - - In this method's response, `predictor_host` is this inference job's predictor's host. - - Only admins, model developers & app developers can manage inference jobs. Model developers & app developers can only manage their own inference jobs. - - :param app: Name of the app identifying the train job to use - :param app_version: Version of the app identifying the train job to use - :param budget: Budget for inference job - :returns: Created inference job as dictionary - - ``budget`` should be a dictionary of ``{ : }``, where - ```` is one of :class:`singa_auto.constants.InferenceBudgetOption` and - ```` specifies the amount for the associated budget option. - - The following describes the budget options available: - - ===================== ===================== - **Budget Option** **Description** - --------------------- --------------------- - ``GPU_COUNT`` No. of GPUs to allocate for inference, across all trials. Defaults to 0. - ===================== ===================== - ''' - - # Have defaults for budget - budget = {InferenceBudgetOption.GPU_COUNT: 0, **(budget or {})} - - data = self._post('/inference_jobs', - json={ - 'app': app, - 'app_version': app_version, - 'budget': budget, - 'description': description - }) - return data - - def create_inference_job_by_checkpoint(self, - model_name: str, - budget: InferenceBudget = None, - description: str = None) -> Dict[str, Any]: - ''' - Creates and starts a inference job on SINGA-Auto with the best-scoring trials of the associated train job. - The train job must have the status of ``STOPPED``.The inference job would be tagged with the train job's app and app version. - Throws an error if an inference job of the same train job is already running. - - In this method's response, `predictor_host` is this inference job's predictor's host. - - Only admins, model developers & app developers can manage inference jobs. Model developers & app developers can only manage their own inference jobs. 
- - :param app: Name of the app identifying the train job to use - :param app_version: Version of the app identifying the train job to use - :param budget: Budget for inference job - :returns: Created inference job as dictionary - - ``budget`` should be a dictionary of ``{ : }``, where - ```` is one of :class:`singa_auto.constants.InferenceBudgetOption` and - ```` specifies the amount for the associated budget option. - - The following describes the budget options available: - - ===================== ===================== - **Budget Option** **Description** - --------------------- --------------------- - ``GPU_COUNT`` No. of GPUs to allocate for inference, across all trials. Defaults to 0. - ===================== ===================== - ''' - - # Have defaults for budget - budget = { - InferenceBudgetOption.GPU_COUNT: 0, - **(budget or {}) - } - - data = self._post('/inference_jobs/checkpoint', json={ - 'model_name': model_name, - 'budget': budget, - 'description': description - }) - return data - - def get_inference_jobs_by_user(self, user_id: str) -> List[Dict[str, Any]]: - ''' - Lists all of user's inference jobs on SINGA-Auto. - - :param user_id: ID of the user - :returns: Inference jobs as list of dictionaries - ''' - data = self._get('/inference_jobs', params={'user_id': user_id}) - return data - - def get_inference_jobs_of_app(self, app: str) -> List[Dict[str, Any]]: - ''' - Lists all inference jobs associated to an app on SINGA-Auto. - - :param app: Name of the app - :returns: Inference jobs as list of dictionaries - ''' - data = self._get('/inference_jobs/app', params={'app': app}) - return data - - def get_running_inference_job(self, - app: str, - app_version: int = -1) -> Dict[str, Any]: - ''' - Retrieves details of the *running* inference job identified by an app and an app version, - including workers' details. - - :param app: Name of the app - :param app_version: Version of the app (-1 for latest version) - :returns: Inference job as dictionary - ''' - data = self._get('/inference_jobs/app/app_version', params={'app': app, 'app_version': app_version}) - return data - - def stop_inference_job(self, - app: str, - app_version: int = -1) -> Dict[str, Any]: - ''' - Stops the inference job identified by an app and an app version. - - :param app: Name of the app - :param app_version: Version of the app (-1 for latest version) - :returns: Stopped inference job as dictionary - ''' - data = self._post('/inference_jobs/app/app_version/stop', json={'app': app, 'app_version': app_version}) - return data - - # TODO: Add predict method? - - #################################### - # Administrative - #################################### - - def stop_all_jobs(self): - ''' - Stops all train and inference jobs on SINGA-Auto. - - Only the superadmin can call this. 
- ''' - data = self._post('/actions/stop_all_jobs') - return data - - #################################### - # SINGA-Auto Internal - #################################### - - def send_event(self, name, **params): - data = self._post('/event/{}'.format(name), json=params) - return data - - #################################### - # Private - #################################### - - def _get(self, path, params=None): - url = self._make_url(path) - headers = self._get_headers() - res = requests.get(url, headers=headers, params=params or {}) - return self._parse_response(res) - - def _post(self, path, params=None, files=None, form_data=None, json=None): - url = self._make_url(path) - headers = self._get_headers() - res = requests.post(url, - headers=headers, - params=params or {}, - data=form_data, - json=json, - files=files or {}) - return self._parse_response(res) - - def _post_stream(self, path, files=None, form_data=None): - from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor - - def my_callback(monitor): - progress = (monitor.bytes_read / monitor.len) * 100 - print("\r uploading...:%d%%(%d/%d)" % - (progress, monitor.bytes_read, monitor.len), - end=" ") - - url = self._make_url(path) - headers = self._get_headers() - m = MultipartEncoderMonitor( - MultipartEncoder(fields={ - **files, - **form_data - }), my_callback) - res = requests.post(url, - data=m, - headers={ - **{ - 'Content-Type': m.content_type - }, - **headers - }) - return self._parse_response(res) - - def _delete(self, path, params=None, files=None, form_data=None, json=None): - url = self._make_url(path) - headers = self._get_headers() - res = requests.delete(url, - headers=headers, - params=params or {}, - data=form_data or {}, - json=json, - files=files) - return self._parse_response(res) - - def _make_url(self, path): - url = 'http://{}:{}{}'.format(self._admin_host, self._admin_port, path) - return url - - def _parse_response(self, res): - if res.status_code != 200 and res.status_code != 400: - raise SingaAutoConnectionError(res.text) - - content_type = res.headers.get('content-type') - if content_type == 'application/json': - res = rafiki_response_handler(res.json()) - return res - elif content_type == 'application/octet-stream': - return res.content - else: - raise SingaAutoConnectionError( - 'Invalid response content type: {}'.format(content_type)) - - def _get_headers(self): - if self._token is not None: - return {'Authorization': 'Bearer ' + self._token} - else: - return {} - - -def _warn(msg): - print(f'\033[93mWARNING: {msg}\033[0m') - - -def _note(msg): - print(f'\033[94m{msg}\033[0m') +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +import requests +import json +import pickle +import os +from functools import wraps +from typing import Type, Dict, List, Any + +from singa_auto.constants import ModelAccessRight, ModelDependencies, Budget, BudgetOption, \ + InferenceBudget, InferenceBudgetOption, UserType, ModelType +from singa_auto.model import Params, BaseModel +from singa_auto.error_code import generate_error + + +class SingaAutoConnectionError(ConnectionError): + pass + + +DOCS_URL = 'https://nginyc.github.io/rafiki/docs/latest/docs/src/python/rafiki.client.Client.html' + +def singa_auto_response_handler(resp): + # if isinstance(resp, dict): + # if resp.get('success', 0) == 0: + # return resp['data'] + # else: + # raise generate_error(resp.get('error_code', 500)) + # else: + # return resp + return resp + +# Returns a decorator that warns user about the method being deprecated +def _deprecated(msg=None): + + def deco(func): + nonlocal msg + msg = msg or f'`{func.__name__}` has been deprecated.' + + @wraps(func) + def deprecated_func(*args, **kwargs): + _warn(f'{msg}\n' \ + f'Refer to the updated documentation at {DOCS_URL}') + return func(*args, **kwargs) + + return deprecated_func + + return deco + + +class Client: + ''' + Initializes the Client to connect to a running + SINGA-Auto Admin instance that the Client connects to. + + :param admin_host: Host of SINGA-Auto Admin + :param admin_port: Port of SINGA-Auto Admin + ''' + + def __init__(self, + admin_host: str = os.environ.get('SINGA_AUTO_ADDR', + 'localhost'), + admin_port: int = os.environ.get('ADMIN_EXT_PORT', 3000)): + self._admin_host = admin_host + self._admin_port = admin_port + self._token = None + self._user = None + + def login(self, email: str, password: str) -> Dict[str, Any]: + ''' + Creates a login session as a SINGA-Auto user. You will have to be logged in to perform any actions. + + App developers can create, list and stop train and inference jobs, as well as list models. + Model developers can create and list models. + + The login session (the session token) expires in 1 hour. + + :param email: User's email + :param password: User's password + + :returns: Logged-in user as dictionary + ''' + data = self._post('/tokens', + json={ + 'email': email, + 'password': password + }) + self._token = data['token'] + + # Save user's data + self._user = {'id': data['user_id'], 'user_type': data['user_type']} + + return self._user + + def get_current_user(self) -> Dict[str, Any]: + ''' + Gets currently logged in user's data. + + :returns: Current user as dictionary, or ``None`` if client is not logged in + ''' + return self._user + + def logout(self): + ''' + Clears the current login session. + ''' + self._token = None + self._user = None + + #################################### + # User + #################################### + + def create_user(self, email: str, password: str, + user_type: UserType) -> Dict[str, Any]: + ''' + Creates a SINGA-Auto user. + + Only admins can create users (except for admins). + Only superadmins can create admins. + + :param email: The new user's email + :param password: The new user's password + :param user_type: The new user's type + + :returns: Created user as dictionary + ''' + data = self._post('/users', + json={ + 'email': email, + 'password': password, + 'user_type': user_type + }) + return data + + @_deprecated('`create_users` has been removed') + def create_users(self, *args, **kwargs): + pass + + def get_users(self) -> List[Dict[str, Any]]: + ''' + Lists all SINGA-Auto users. + + Only admins can list all users. 
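For orientation, a minimal usage sketch of the constructor and `login` defined above; the import path, host, port and credentials are placeholders rather than values taken from this patch:

    from singa_auto.client.client import Client   # assumed import path for this module

    client = Client(admin_host='localhost', admin_port=3000)   # defaults follow SINGA_AUTO_ADDR / ADMIN_EXT_PORT
    user = client.login(email='admin@example.com', password='change-me')   # placeholder credentials
    print(user)   # expected to contain the user's 'id' and 'user_type'
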
+ + :returns: List of users as list of dictionaries + ''' + data = self._get('/users') + return data + + def ban_user(self, email: str) -> Dict[str, Any]: + ''' + Bans a SINGA-Auto user, disallowing logins. + + This action is irrevisible. + Only admins can ban users (except for admins). + Only superadmins can ban admins. + + :param email: The user's email + + :returns: Banned user as dictionary + ''' + data = self._delete('/users', json={'email': email}) + return data + + #################################### + # Datasets + #################################### + + def create_dataset(self, + name: str, + task: str, + dataset_path: str = None, + dataset_url: str = None) -> Dict[str, Any]: + ''' + Creates a dataset on SINGA-Auto, either by uploading the dataset file from your filesystem or specifying a URL where the dataset file can be downloaded. + The dataset should be in a format specified by the task + Either `dataset_url` or `dataset_path` should be specified. + + Only admins, model developers and app developers can manage their own datasets. + + :param name: Name for the dataset, does not need to be unique + :param task: Task associated to the dataset + :param dataset_path: Path to the dataset file to upload from the local filesystem + :param dataset_url: Publicly accessible URL where the dataset file can be downloaded + :returns: Created dataset as dictionary + ''' + + dataset = dict() + + form_data = {'name': name, 'task': task, 'dataset_url': dataset_url} + + if dataset_path is not None: + dataset = { + 'dataset': ('dataset', open(dataset_path, + 'rb'), 'application/zip') + } + else: + print( + 'Waiting for server finish downloading the dataset from URL...') + + data = self._post_stream(path='/datasets', + files=dataset, + form_data=form_data) + + return data + + def get_datasets(self, task: str = None) -> List[Dict[str, Any]]: + ''' + Lists all datasets owned by the current user, optionally filtering by task. + + :param task: Task name + :returns: List of datasets as list of dictionaries + ''' + data = self._get('/datasets', params={'task': task}) + return data + + #################################### + # Models + #################################### + + def create_model(self, + name: str, + task: str, + model_file_path: str, + model_class: str, + model_preload_file_path: str = None, + dependencies: ModelDependencies = None, + access_right: ModelAccessRight = ModelAccessRight.PRIVATE, + docker_image: str = None, + model_type: str = ModelType.PYTHON_FILE, + model_file_name: str = None, + model_description: str = None) -> Dict[str, Any]: + ''' + Creates a model on SINGA-Auto. + + Only admins & model developers can manage models. + + :param name: Name of the model, which must be unique across all models added by the current user + :param task: Task associated with the model, where the model must adhere to the specification of the task + :param model_file_path: Path to a single Python file that contains the definition for the model class + :param model_class: The name of the model class inside the Python file. This class should implement :class:`singa_auto.model.BaseModel` + :param dependencies: List of Python dependencies & their versions + :param access_right: Model access right + :param model_preload_file_path: pretrained mdoel file + :param docker_image: A custom Docker image that extends ``singa_auto/singa_auto_worker``, publicly available on Docker Hub. 
+ :returns: Created model as dictionary + + Refer to :ref:`model-development` for more details on how to write & test models for SINGA-Auto. + + ``model_file_path`` should point to a *single* file that contains all necessary Python code for the model's implementation. + If the Python file imports any external Python modules, you should list it in ``dependencies`` or create a custom + ``docker_image``. + + If a model's ``access_right`` is set to ``PUBLIC``, this model will be publicly available to all other users on SINGA-Auto for training + and inference. By default, a model's access is ``PRIVATE``. + + ``dependencies`` should be a dictionary of ``{ : }``, where + ```` corresponds to the name of the Python Package Index (PyPI) package (e.g. ``tensorflow``) + and ```` corresponds to the version of the PyPI package (e.g. ``1.12.0``). + Refer to :ref:`configuring-model-environment` to understand more about this option. + ''' + + model_files = { + 'model_file_bytes': (model_file_path, open(model_file_path, 'rb'), + 'application/octet-stream') + } + pretrained_files = {} + + if model_preload_file_path is not None: + pretrained_files = {'checkpoint_id': ( + model_preload_file_path, + open(model_preload_file_path, 'rb'), + 'application/octet-stream')} + + files = {**model_files, **pretrained_files} + + form_data = { + 'name': name, + 'task': task, + 'dependencies': json.dumps(dependencies), + 'docker_image': docker_image, + 'model_class': model_class, + 'access_right': access_right, + 'model_type': model_type, + 'model_file_name': model_file_name, + 'model_description': model_description + } + + data = self._post_stream(path='/models', + files=files, + form_data=form_data) + + return data + + def get_model(self, model_id: str) -> Dict[str, Any]: + ''' + Retrieves details of a single model. + + Model developers can only view their own models. + + :param model_id: ID of model + :returns: Model as dictionary + ''' + _note('`get_model` now requires `model_id` instead of `name`') + + data = self._get('/models/{}'.format(model_id)) + return data + + def download_model_file(self, model_id: str, + out_model_file_path: str) -> Dict[str, any]: + ''' + Downloads the Python model class file for the SINGA-Auto model. + + Model developers can only download their own models. 
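A hedged sketch of `create_model` as documented above, pinning a single PyPI dependency; the file path, class name, task and version are illustrative, not part of this change:

    model = client.create_model(
        name='MyCnn',
        task='IMAGE_CLASSIFICATION',                  # illustrative task name
        model_file_path='examples/models/my_cnn.py',  # single Python file containing the class
        model_class='MyCnn',                          # must implement singa_auto.model.BaseModel
        dependencies={'tensorflow': '1.12.0'})        # {<pypi-package-name>: <version>}
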
+ + :param model_id: ID of model + :param out_model_file_path: Absolute/relative path to save model class file to + :returns: Model as dictionary + ''' + _note('`download_model_file` now requires `model_id` instead of `name`') + + model_file_bytes = self._get('/models/{}/model_file'.format(model_id)) + + with open(out_model_file_path, 'wb') as f: + f.write(model_file_bytes) + + data = self.get_model(model_id) + dependencies = data.get('dependencies') + model_class = data.get('model_class') + + print('Model file downloaded to "{}"!'.format( + os.path.join(os.getcwd(), out_model_file_path))) + + if dependencies: + print( + 'You\'ll need to install the following model dependencies locally: {}' + .format(dependencies)) + + print('From the file, import the model class `{}`.'.format(model_class)) + + return data + + @_deprecated( + '`get_models` & `get_models_of_task` have been combined into `get_available_models`' + ) + def get_models(self, *args, **kwargs): + pass + + @_deprecated( + '`get_models` & `get_models_of_task` have been combined into `get_available_models`' + ) + def get_models_of_task(self, *args, **kwargs): + pass + + def get_available_models(self, task: str = None) -> List[Dict[str, Any]]: + ''' + Lists all SINGA-Auto models available to the current user, optionally filtering by task. + + :param task: Task name + :returns: Available models as list of dictionaries + ''' + data = self._get('/models/available', params={'task': task}) + return data + + def delete_model(self, model_id: str) -> Dict[str, Any]: + ''' + Deletes a single model. Models that have been used in train jobs cannot be deleted. + + Model developers can only delete their own models. + + :param str model_id: ID of model + :returns: Deleted model as dictionary + ''' + data = self._delete('/models/{}'.format(model_id)) + return data + + #################################### + # Train Jobs + #################################### + + def create_train_job(self, + app: str, + task: str, + train_dataset_id: str, + val_dataset_id: str, + budget: Budget, + annotation_dataset_id: str = None, + models: List[str] = None, + train_args: Dict[str, any] = None) -> Dict[str, Any]: + ''' + Creates and starts a train job on SINGA-Auto. + + A train job is uniquely identified by user, its associated app, and the app version (returned in output). + + Only admins, model developers & app developers can manage train jobs. Model developers & app developers can only manage their own train jobs. + + :param app: Name of the app associated with the train job + :param task: Task associated with the train job, + the train job will train models associated with the task + :param train_dataset_id: ID of the train dataset, previously created on SINGA-Auto + :param val_dataset_id: ID of the validation dataset, previously created on SINGA-Auto + :param budget: Budget for train job + :param models: List of IDs of model to use for train job. Defaults to all available models + :param train_args: Additional arguments to pass to models during training, if any. + Refer to the task's specification for appropriate arguments + :returns: Created train job as dictionary + + If ``models`` is unspecified, all models accessible to the user for the specified task will be used. + + ``budget`` should be a dictionary of ``{ : }``, where + ```` is one of :class:`singa_auto.constants.BudgetOption` and + ```` specifies the amount for the associated budget option. 
+ + The following describes the budget options available: + + ===================== ===================== + **Budget Option** **Description** + --------------------- --------------------- + ``TIME_HOURS`` Max no. of hours to train (soft target). Defaults to 0.1. + ``GPU_COUNT`` No. of GPUs to allocate for training, across all models. Defaults to 0. + ``MODEL_TRIAL_COUNT`` Max no. of trials to conduct for each model (soft target). -1 for unlimited. Defaults to -1. + ===================== ===================== + ''' + _note( + '`create_train_job` now requires `models` as a list of model IDs instead of a list of model names' + ) + + if 'ENABLE_GPU' in budget: + _warn('The `ENABLE_GPU` option has been changed to `GPU_COUNT`') + + # Have defaults for budget + budget = { + BudgetOption.TIME_HOURS: 0.1, + BudgetOption.GPU_COUNT: 0, + **budget + } + + postJSON = { + 'app': app, + 'task': task, + 'train_dataset_id': train_dataset_id, + 'val_dataset_id': val_dataset_id, + 'budget': budget, + } + if train_args: + postJSON['train_args'] = train_args + if models: + postJSON['model_ids'] = models + if annotation_dataset_id: + postJSON['annotation_dataset_id'] = annotation_dataset_id + + print("postJSON: ", postJSON) + # print will show up in docker exec terminal + + data = self._post('/train_jobs', json=postJSON) + return data + + def get_train_jobs_by_user(self, user_id: str) -> List[Dict[str, Any]]: + ''' + Lists all of user's train jobs on SINGA-Auto. + + :param user_id: ID of the user + :returns: Train jobs as list of dictionaries + ''' + data = self._get('/train_jobs', params={'user_id': user_id}) + return data + + def get_train_jobs_of_app(self, app: str) -> List[Dict[str, Any]]: + ''' + Lists all of current user's train jobs associated to the app name on SINGA-Auto. + + :param app: Name of the app + :returns: Train jobs as list of dictionaries + ''' + data = self._get('/train_jobs/app', params={'app': app}) + return data + + def get_train_job(self, app: str, app_version: int = -1) -> Dict[str, Any]: + ''' + Retrieves details of the current user's train job identified by an app and an app version, + including workers' details. + + :param app: Name of the app + :param app_version: Version of the app (-1 for latest version) + :returns: Train job as dictionary + ''' + data = self._get('/train_jobs/app/app_version', params={'app': app, 'app_version': app_version}) + return data + + def stop_train_job(self, app: str, app_version: int = -1) -> Dict[str, Any]: + ''' + Prematurely stops the current user's train job identified by an app and an app version. + Otherwise, the train job should stop by itself when its budget is reached. + + :param app: Name of the app + :param app_version: Version of the app (-1 for latest version) + :returns: Stopped train job as dictionary + ''' + data = self._post('/train_jobs/app/app_version/stop', json={'app': app, 'app_version': app_version}) + return data + + #################################### + # Trials + #################################### + + def get_trial(self, trial_id: str) -> Dict[str, Any]: + ''' + Gets a specific trial. + + :param trial_id: ID of trial + :returns: Trial as dictionary + ''' + data = self._get('/trials/{}'.format(trial_id)) + return data + + def get_best_trials_of_train_job( + self, + app: str, + app_version: int = -1, + max_count: int = 2) -> List[Dict[str, Any]]: + ''' + Lists the best scoring trials of the current user's train job identified by an app and an app version, + ordered by descending score. 
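To make the budget options above concrete, a sketch that uploads datasets and starts a train job; the task name and zip paths are placeholders, and it assumes the returned dictionaries expose an 'id' field. `BudgetOption` is the constants class this module already imports:

    from singa_auto.constants import BudgetOption

    train_ds = client.create_dataset('my_train_data', 'IMAGE_CLASSIFICATION', dataset_path='data/train.zip')
    val_ds = client.create_dataset('my_val_data', 'IMAGE_CLASSIFICATION', dataset_path='data/val.zip')

    job = client.create_train_job(
        app='my_app',
        task='IMAGE_CLASSIFICATION',
        train_dataset_id=train_ds['id'],
        val_dataset_id=val_ds['id'],
        models=[model['id']],
        budget={BudgetOption.TIME_HOURS: 0.5,          # soft cap on training time
                BudgetOption.GPU_COUNT: 1,             # GPUs shared across all models
                BudgetOption.MODEL_TRIAL_COUNT: -1})   # unlimited trials per model
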
+ + :param app: Name of the app + :param app_version: Version of the app (-1 for latest version) + :param max_count: Maximum number of trials to return + :returns: Trials as list of dictionaries + ''' + data = self._get('/train_jobs/{}/{}/trials'.format(app, app_version), + params={ + 'type': 'best', + 'max_count': max_count + }) + return data + + def get_trials_of_train_job(self, + app: str, + app_version: int = -1) -> List[Dict[str, Any]]: + ''' + Lists all trials of the current user's train job identified by an app and an app version, + ordered by when the trial started. + + :param app: Name of the app + :param app_version: Version of the app (-1 for latest version) + :returns: Trials as list of dictionaries + ''' + data = self._get('/train_jobs/app/app_version/trials', params={'app': app, 'app_version': app_version}) + return data + + def get_trial_logs(self, trial_id: str) -> Dict[str, Any]: + ''' + Gets the logs for a specific trial. + + :param trial_id: ID of trial + :returns: Logs of trial as dictionary + ''' + data = self._get('/trials/{}/logs'.format(trial_id)) + return data + + def get_trial_parameters(self, trial_id: str) -> Params: + ''' + Gets parameters of the model associated with the trial. The trial's model parameters must have been saved. + + :param trial_id: ID of trial + :returns: Parameters of the *trained* model associated with the trial + ''' + data = self._get('/trials/{}/parameters'.format(trial_id)) + parameters = pickle.loads(data) + return parameters + + def load_trial_model(self, trial_id: str, + ModelClass: Type[BaseModel]) -> BaseModel: + ''' + Loads an instance of a trial's model with the trial's knobs & parameters. + + Before this, you must have the trial's model class file already in your local filesystem, + the dependencies of the model must have been installed separately, and the model class must have been + imported and passed into this method. + + Wraps :meth:`get_trial_parameters` and :meth:`get_trial`. + + :param trial_id: ID of trial + :param ModelClass: model class that conincides with the trial's model class + :returns: A *trained* model instance of ``ModelClass``, loaded with the trial's knobs and parameters + ''' + data = self.get_trial(trial_id) + assert 'proposal' in data + knobs = data['proposal']['knobs'] + parameters = self.get_trial_parameters(trial_id) + model_inst = ModelClass(**knobs) + model_inst.load_parameters(parameters) + return model_inst + + #################################### + # Inference Jobs + #################################### + + def create_inference_job(self, + app: str, + app_version: int = -1, + budget: InferenceBudget = None, + description: str = None) -> Dict[str, Any]: + ''' + Creates and starts a inference job on SINGA-Auto with the best-scoring trials of the associated train job. + The train job must have the status of ``STOPPED``.The inference job would be tagged with the train job's app and app version. + Throws an error if an inference job of the same train job is already running. + + In this method's response, `predictor_host` is this inference job's predictor's host. + + Only admins, model developers & app developers can manage inference jobs. Model developers & app developers can only manage their own inference jobs. 
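The local trial-loading flow described above, sketched end to end; the module `my_cnn` and the `'id'` key on the trial dictionaries are assumptions for illustration:

    from my_cnn import MyCnn   # hypothetical: the downloaded model class file, with its dependencies installed locally

    best_trials = client.get_best_trials_of_train_job(app='my_app', max_count=1)
    trial_id = best_trials[0]['id']                    # assumed key on the trial dictionary
    model_inst = client.load_trial_model(trial_id, MyCnn)
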
+ + :param app: Name of the app identifying the train job to use + :param app_version: Version of the app identifying the train job to use + :param budget: Budget for inference job + :returns: Created inference job as dictionary + + ``budget`` should be a dictionary of ``{ : }``, where + ```` is one of :class:`singa_auto.constants.InferenceBudgetOption` and + ```` specifies the amount for the associated budget option. + + The following describes the budget options available: + + ===================== ===================== + **Budget Option** **Description** + --------------------- --------------------- + ``GPU_COUNT`` No. of GPUs to allocate for inference, across all trials. Defaults to 0. + ===================== ===================== + ''' + + # Have defaults for budget + budget = {InferenceBudgetOption.GPU_COUNT: 0, **(budget or {})} + + data = self._post('/inference_jobs', + json={ + 'app': app, + 'app_version': app_version, + 'budget': budget, + 'description': description + }) + return data + + def create_inference_job_by_checkpoint(self, + model_name: str, + budget: InferenceBudget = None, + description: str = None) -> Dict[str, Any]: + ''' + Creates and starts a inference job on SINGA-Auto with the best-scoring trials of the associated train job. + The train job must have the status of ``STOPPED``.The inference job would be tagged with the train job's app and app version. + Throws an error if an inference job of the same train job is already running. + + In this method's response, `predictor_host` is this inference job's predictor's host. + + Only admins, model developers & app developers can manage inference jobs. Model developers & app developers can only manage their own inference jobs. + + :param app: Name of the app identifying the train job to use + :param app_version: Version of the app identifying the train job to use + :param budget: Budget for inference job + :returns: Created inference job as dictionary + + ``budget`` should be a dictionary of ``{ : }``, where + ```` is one of :class:`singa_auto.constants.InferenceBudgetOption` and + ```` specifies the amount for the associated budget option. + + The following describes the budget options available: + + ===================== ===================== + **Budget Option** **Description** + --------------------- --------------------- + ``GPU_COUNT`` No. of GPUs to allocate for inference, across all trials. Defaults to 0. + ===================== ===================== + ''' + + # Have defaults for budget + budget = { + InferenceBudgetOption.GPU_COUNT: 0, + **(budget or {}) + } + + data = self._post('/inference_jobs/checkpoint', json={ + 'model_name': model_name, + 'budget': budget, + 'description': description + }) + return data + + def get_inference_jobs_by_user(self, user_id: str) -> List[Dict[str, Any]]: + ''' + Lists all of user's inference jobs on SINGA-Auto. + + :param user_id: ID of the user + :returns: Inference jobs as list of dictionaries + ''' + data = self._get('/inference_jobs', params={'user_id': user_id}) + return data + + def get_inference_jobs_of_app(self, app: str) -> List[Dict[str, Any]]: + ''' + Lists all inference jobs associated to an app on SINGA-Auto. 
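A minimal sketch of starting inference once the train job has stopped, using the `InferenceBudgetOption` constants imported at the top of this file; the app name is a placeholder and `predictor_host` is the response key named in the docstring above:

    from singa_auto.constants import InferenceBudgetOption

    inference_job = client.create_inference_job(
        app='my_app',
        budget={InferenceBudgetOption.GPU_COUNT: 1})
    print(inference_job.get('predictor_host'))   # host of this inference job's predictor
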
+ + :param app: Name of the app + :returns: Inference jobs as list of dictionaries + ''' + data = self._get('/inference_jobs/app', params={'app': app}) + return data + + def get_running_inference_job(self, + app: str, + app_version: int = -1) -> Dict[str, Any]: + ''' + Retrieves details of the *running* inference job identified by an app and an app version, + including workers' details. + + :param app: Name of the app + :param app_version: Version of the app (-1 for latest version) + :returns: Inference job as dictionary + ''' + data = self._get('/inference_jobs/app/app_version', params={'app': app, 'app_version': app_version}) + return data + + def stop_inference_job(self, + app: str, + app_version: int = -1) -> Dict[str, Any]: + ''' + Stops the inference job identified by an app and an app version. + + :param app: Name of the app + :param app_version: Version of the app (-1 for latest version) + :returns: Stopped inference job as dictionary + ''' + data = self._post('/inference_jobs/app/app_version/stop', json={'app': app, 'app_version': app_version}) + return data + + # TODO: Add predict method? + + #################################### + # Administrative + #################################### + + def stop_all_jobs(self): + ''' + Stops all train and inference jobs on SINGA-Auto. + + Only the superadmin can call this. + ''' + data = self._post('/actions/stop_all_jobs') + return data + + #################################### + # SINGA-Auto Internal + #################################### + + def send_event(self, name, **params): + data = self._post('/event/{}'.format(name), json=params) + return data + + #################################### + # Private + #################################### + + def _get(self, path, params=None): + url = self._make_url(path) + headers = self._get_headers() + res = requests.get(url, headers=headers, params=params or {}) + return self._parse_response(res) + + def _post(self, path, params=None, files=None, form_data=None, json=None): + url = self._make_url(path) + headers = self._get_headers() + res = requests.post(url, + headers=headers, + params=params or {}, + data=form_data, + json=json, + files=files or {}) + return self._parse_response(res) + + def _post_stream(self, path, files=None, form_data=None): + from requests_toolbelt import MultipartEncoder, MultipartEncoderMonitor + + def my_callback(monitor): + progress = (monitor.bytes_read / monitor.len) * 100 + print("\r uploading...:%d%%(%d/%d)" % + (progress, monitor.bytes_read, monitor.len), + end=" ") + + url = self._make_url(path) + headers = self._get_headers() + m = MultipartEncoderMonitor( + MultipartEncoder(fields={ + **files, + **form_data + }), my_callback) + res = requests.post(url, + data=m, + headers={ + **{ + 'Content-Type': m.content_type + }, + **headers + }) + return self._parse_response(res) + + def _delete(self, path, params=None, files=None, form_data=None, json=None): + url = self._make_url(path) + headers = self._get_headers() + res = requests.delete(url, + headers=headers, + params=params or {}, + data=form_data or {}, + json=json, + files=files) + return self._parse_response(res) + + def _make_url(self, path): + url = 'http://{}:{}{}'.format(self._admin_host, self._admin_port, path) + return url + + def _parse_response(self, res): + if res.status_code != 200 and res.status_code != 400: + raise SingaAutoConnectionError(res.text) + + content_type = res.headers.get('content-type') + if content_type == 'application/json': + res = singa_auto_response_handler(res.json()) + return res + elif 
content_type == 'application/octet-stream': + return res.content + else: + raise SingaAutoConnectionError( + 'Invalid response content type: {}'.format(content_type)) + + def _get_headers(self): + if self._token is not None: + return {'Authorization': 'Bearer ' + self._token} + else: + return {} + + +def _warn(msg): + print(f'\033[93mWARNING: {msg}\033[0m') + + +def _note(msg): + print(f'\033[94m{msg}\033[0m') diff --git a/singa_auto/container/docker_swarm.py b/singa_auto/container/docker_swarm.py index 582b94a6..8bcd63be 100644 --- a/singa_auto/container/docker_swarm.py +++ b/singa_auto/container/docker_swarm.py @@ -56,6 +56,7 @@ def __init__( def create_service(self, service_name, + service_type, docker_image, replicas, args, @@ -63,7 +64,8 @@ def create_service(self, mounts=None, publish_port=None, gpus=0, - dist_workers=0) -> ContainerService: + dist_workers=0, + gpu_allocated=None) -> ContainerService: if mounts is None: mounts = {} diff --git a/singa_auto/container/kubernetes_operation.py b/singa_auto/container/kubernetes_operation.py index 1c9bbcce..59949da6 100644 --- a/singa_auto/container/kubernetes_operation.py +++ b/singa_auto/container/kubernetes_operation.py @@ -34,6 +34,10 @@ logger = logging.getLogger(__name__) +ENVIRONMENT_VARIABLES_AUTOFORWARD = [ + 'KUBERNETES_ADVERTISE_ADDR',# 'DB_PATH_ON_MASTER', +] + class KubernetesContainerManager(ContainerManager): @@ -50,6 +54,9 @@ def __init__(self, **kwargs): os.getenv('KUBERNETES_SERVICE_HOST'), os.getenv('KUBERNETES_SERVICE_PORT')) + # self._params_root_path = os.environ['DB_PATH_ON_MASTER'] + self._kubernetes_advertise_addr = os.environ['KUBERNETES_ADVERTISE_ADDR'] + # Security part. # In this simple example we are not going to verify the SSL certificate of # the remote cluster (for simplicity reason) @@ -67,6 +74,7 @@ def __init__(self, **kwargs): self._client_deployment = client.AppsV1Api(aApiClient) self._client_service = client.CoreV1Api(aApiClient) self.api_instance = client.NetworkingV1beta1Api(aApiClient) + self._client_networkpolicy = client.NetworkingV1Api(aApiClient) def update_ingress(self, ingress_name: str, ingress_body: dict): paths = self._update_ingress_paths(ingress_body) @@ -125,11 +133,24 @@ def _update_ingress_paths(self, ingress_body: dict) -> list: return paths def destroy_service(self, service: ContainerService): - self._client_deployment.delete_namespaced_deployment(service.id, namespace='default') - self._client_service.delete_namespaced_service(service.id, namespace='default') + try: + self._client_deployment.delete_namespaced_deployment(service.id, namespace='default') + except(Exception): + logger.error('Error while stopping kubernetes deployment {}.'.format(service.id)) + + try: + self._client_networkpolicy.delete_namespaced_network_policy(name=service.id, namespace='default') + except(Exception): + logger.error('Error while stopping kubernetes network policy {}.'.format(service.id)) + + try: + self._client_service.delete_namespaced_service(service.id, namespace='default') + except(Exception): + logger.error('Error while stopping kubernetes service {}.'.format(service.id)) def create_service(self, service_name, + service_type, docker_image, replicas, args, @@ -137,10 +158,14 @@ def create_service(self, mounts=None, publish_port=None, gpus=0, - dist_workers=0) -> ContainerService: + dist_workers=0, + gpu_allocated=None) -> ContainerService: if mounts is None: mounts = {} hostname = service_name + node_name = 'default' + gpu_list = "" + if publish_port is not None: service_config = 
self._create_service_config(service_name, docker_image, replicas, args, environment_vars, mounts, publish_port, @@ -167,7 +192,7 @@ def create_service(self, if gpus > 0: # run the scheduler algorithm, choose the gpu and node for few pods. - node_gpuid = self._get_top_gpus(dist_workers) + node_gpuid = self._get_dist_top_gpus(dist_workers) for index in range(dist_workers): environment_vars["RANK"] = str(index) @@ -209,19 +234,24 @@ def create_service(self, environment_vars, mounts, select_gpu, select_node_name) print("pod_config", pod_config) _retry(self._client_service.create_namespaced_pod)(namespace='default', body=pod_config) - else: - deployment_config = self._create_deployment_config(service_name, docker_image, replicas, - environment_vars, mounts, gpus - ) + list_hostname = [] + list_gpu_selected = [] + deployment_config = self._create_deployment_config(hostname, service_name, service_type, docker_image, replicas, + environment_vars, mounts, gpus, gpu_allocated, list_gpu_selected, list_hostname) + if len(list_hostname) > 0: + node_name = list_hostname[0] + if len(list_gpu_selected) > 0: + gpu_list = list_gpu_selected[0] _retry(self._client_deployment.create_namespaced_deployment)(namespace='default', body=deployment_config) info = { - 'node_id': 'default', + 'node_id': node_name, 'gpu_nos': gpus, 'service_name': service_name, - 'replicas': replicas + 'replicas': replicas, + 'gpu_list': gpu_list, } service = ContainerService( @@ -247,7 +277,8 @@ def _create_pod_config(self, }) volumes.append({ 'name': 'v' + str(mounts_count), - 'hostPath': { + 'nfs':{ + 'server': self._kubernetes_advertise_addr, 'path': k } }) @@ -297,12 +328,17 @@ def _create_pod_config(self, return content def _create_deployment_config(self, + hostname, service_name, + service_type, docker_image, replicas, environment_vars, mounts, - gpus=0 + gpus=0, + gpu_allocated=None, + list_gpu_selected=[], + list_hostname=[] ): content = {} content.setdefault('apiVersion', 'apps/v1') @@ -330,38 +366,50 @@ def _create_deployment_config(self, }) volumes.append({ 'name': 'v' + str(mounts_count), - 'hostPath': { + 'nfs':{ + 'server': self._kubernetes_advertise_addr, 'path': k } }) mounts_count += 1 - template.setdefault('spec', { - 'containers': [container], - 'volumes': volumes - }) env = [{'name': k, 'value': v} for (k, v) in environment_vars.items()] + select_node_name = hostname + if gpus > 0: - node_gpuid = self._get_top_gpus(1) - if node_gpuid and node_gpuid[0]: - select_node_name, select_gpu = node_gpuid[0]["nodeName"], node_gpuid[0]["GPUID"] - # nodeSelector can be used to bind a pod to a node - nodeSelector = {NodeLabes.NodeName: select_node_name} - template["spec"]["nodeSelector"] = nodeSelector - - # NVIDIA_VISIBLE_DEVICES is used to expose a specific gpu to this pod - env.append({"name": "NVIDIA_VISIBLE_DEVICES", "value": select_gpu}) - - container.setdefault('resources', - {'limits': { - 'nvidia.com/gpu': gpus - }}) + node_gpuid = self._get_gpus_on_node(gpus, gpu_allocated) + + if node_gpuid and "max_min_free_node" in node_gpuid and "max_gpu_free_ratio" in node_gpuid: + select_node_name = node_gpuid["max_gpu_free_ratio"] + + list_hostname.append(select_node_name) + + env.append({"name": "NVIDIA_VISIBLE_DEVICES", "value": ', '.join(node_gpuid[select_node_name]["gpu_id"])}) + list_gpu_selected.append(', '.join(node_gpuid[select_node_name]["gpu_id"])) + container.setdefault('env', env) - return content - def _get_top_gpus(self, n) -> List[dict]: + if gpus > 0: + template.setdefault('spec', { + 'nodeName': 
select_node_name, + 'containers': [container], + 'volumes': volumes + }) + container.setdefault('resources', { + 'limits': { + 'nvidia.com/gpu': gpus + }, + }) + else: + template.setdefault('spec', { + # 'nodeName': select_node_name, + 'containers': [container], + 'volumes': volumes + }) + return content + def _get_dist_top_gpus(self, n) -> List[dict]: """ This method is used to find the top n gpus, the one with most free memory nodeInfo is format of following: @@ -476,6 +524,59 @@ def _get_top_gpus(self, n) -> List[dict]: print("node_gpuid: ", node_gpuid) return node_gpuid + def _get_gpus_on_node(self, n, gpu_allocated=None) -> List[dict]: + node_infos = self._client_service.list_node() + + node_gpuid = dict() + + max_min_free_memory = 0 + max_gpu_used_ratio = 0 + + + for node_info in node_infos.items: + # if the node doesnt have gpu label or gpu is false, skip this node + if NodeLabes.Gpu not in node_info.metadata.labels or not node_info.metadata.labels[NodeLabes.Gpu]: + continue + + gpu_summary = node_info.metadata.labels[NodeLabes.GpuSummary] + + node_name = node_info.metadata.labels[NodeLabes.NodeName] + + num_gpu = int(node_info.status.allocatable["nvidia.com/gpu"]) + if num_gpu < 1: + continue + + gpu_used_on_node = [] + if node_name in gpu_allocated: + gpu_used_on_node = gpu_allocated[node_name] + + node_gpus = dict() + for gpu_info in gpu_summary.split("."): + gpu_device_id = gpu_info.split("_")[0] + if gpu_device_id not in gpu_used_on_node: + free_memory = gpu_info.split("_")[1] + node_gpus[gpu_device_id] = free_memory + + if len(node_gpus) < n: + continue + + top_n = sorted(node_gpus.items(), key=lambda d: d[1], reverse=False)[:n] + logger.info("top_n: {}".format(top_n)) + node_gpuid[node_name] = { + "gpu_id": [ele[0] for ele in top_n], + "min_free_memory": top_n[n-1][1], + "gpu_free_ratio": len(node_gpus) / num_gpu, + } + node_min_free_memory = int(node_gpuid[node_name]["min_free_memory"]) + if max_min_free_memory < node_min_free_memory: + max_min_free_memory = node_min_free_memory + node_gpuid["max_min_free_node"] = node_name + node_gpu_used_ratio = len(node_gpus) / num_gpu + if max_gpu_used_ratio < node_gpu_used_ratio: + max_gpu_used_ratio = node_gpu_used_ratio + node_gpuid["max_gpu_free_ratio"] = node_name + return node_gpuid + def _create_clusterip_service_config(self, service_name, publish_port): content = \ {'apiVersion': 'v1', diff --git a/singa_auto/container/requirements.txt b/singa_auto/container/requirements.txt index 8f21ead2..10095770 100644 --- a/singa_auto/container/requirements.txt +++ b/singa_auto/container/requirements.txt @@ -1 +1,2 @@ -docker==3.5.0 +docker==4.4.0 +kubernetes==10.0.1 \ No newline at end of file diff --git a/singa_auto/darknet/model.py b/singa_auto/darknet/model.py new file mode 100644 index 00000000..22ac19d3 --- /dev/null +++ b/singa_auto/darknet/model.py @@ -0,0 +1,375 @@ + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from singa_auto.darknet.utils import build_targets, to_cpu + + +class Upsample(nn.Module): + """ nn.Upsample is deprecated """ + + def __init__(self, scale_factor, mode="nearest"): + super(Upsample, self).__init__() + self.scale_factor = scale_factor + self.mode = mode + + def forward(self, x): + x = F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode) + return x + + +class EmptyLayer(nn.Module): + """Placeholder for 'route' and 'shortcut' layers""" + + def __init__(self): + super(EmptyLayer, self).__init__() + + +class YOLOLayer(nn.Module): + """Detection layer""" + 
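+    # Descriptive note on this layer: per grid cell, the sigmoid-activated centre
+    # offsets are added to that cell's (grid_x, grid_y) coordinates, widths/heights
+    # come from exp(w) * anchor_w and exp(h) * anchor_h with anchors pre-scaled by
+    # the stride, and the decoded boxes are multiplied back by the stride. When
+    # targets are supplied, forward() additionally returns the YOLOv3 loss
+    # (MSE on the box terms, BCE on objectness and class scores).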
+ def __init__(self, anchors, num_classes, img_dim=416): + super(YOLOLayer, self).__init__() + self.anchors = anchors + self.num_anchors = len(anchors) + self.num_classes = num_classes + self.ignore_thresh = 0.5 + self.mse_loss = nn.MSELoss() + self.bce_loss = nn.BCELoss() + self.obj_scale = 1 + self.noobj_scale = 100 + self.metrics = {} + self.img_dim = img_dim + self.grid_size = 0 # grid size + + def compute_grid_offsets(self, grid_size, cuda=True): + self.grid_size = grid_size + g = self.grid_size + FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor + self.stride = self.img_dim / self.grid_size + # Calculate offsets for each grid + self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor) + self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor) + self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors]) + self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1)) + self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1)) + + def forward(self, x, targets=None, img_dim=None): + + # Tensors for cuda support + FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor + LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor + ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor + + self.img_dim = img_dim + num_samples = x.size(0) + grid_size = x.size(2) + + prediction = ( + x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size) + .permute(0, 1, 3, 4, 2) + .contiguous() + ) + + # Get outputs + x = torch.sigmoid(prediction[..., 0]) # Center x + y = torch.sigmoid(prediction[..., 1]) # Center y + w = prediction[..., 2] # Width + h = prediction[..., 3] # Height + pred_conf = torch.sigmoid(prediction[..., 4]) # Conf + pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. + + # If grid size does not match current we compute new offsets + if grid_size != self.grid_size: + self.compute_grid_offsets(grid_size, cuda=x.is_cuda) + + # Add offset and scale with anchors + pred_boxes = FloatTensor(prediction[..., :4].shape) + pred_boxes[..., 0] = x.data + self.grid_x + pred_boxes[..., 1] = y.data + self.grid_y + pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w + pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h + + output = torch.cat( + ( + pred_boxes.view(num_samples, -1, 4) * self.stride, + pred_conf.view(num_samples, -1, 1), + pred_cls.view(num_samples, -1, self.num_classes), + ), + -1, + ) + + if targets is None: + return output, 0 + else: + iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets( + pred_boxes=pred_boxes, + pred_cls=pred_cls, + target=targets, + anchors=self.scaled_anchors, + ignore_thresh=self.ignore_thresh, + ) + + # Loss : Mask outputs to ignore non-existing objects (except with conf. 
loss) + loss_x = self.mse_loss(x[obj_mask], tx[obj_mask]) + loss_y = self.mse_loss(y[obj_mask], ty[obj_mask]) + loss_w = self.mse_loss(w[obj_mask], tw[obj_mask]) + loss_h = self.mse_loss(h[obj_mask], th[obj_mask]) + loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask]) + loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask]) + loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj + loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask]) + total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + + # Metrics + cls_acc = 100 * class_mask[obj_mask].mean() + conf_obj = pred_conf[obj_mask].mean() + conf_noobj = pred_conf[noobj_mask].mean() + conf50 = (pred_conf > 0.5).float() + iou50 = (iou_scores > 0.5).float() + iou75 = (iou_scores > 0.75).float() + detected_mask = conf50 * class_mask * tconf + precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16) + recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16) + recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16) + + self.metrics = { + "loss": to_cpu(total_loss).item(), + "x": to_cpu(loss_x).item(), + "y": to_cpu(loss_y).item(), + "w": to_cpu(loss_w).item(), + "h": to_cpu(loss_h).item(), + "conf": to_cpu(loss_conf).item(), + "cls": to_cpu(loss_cls).item(), + "cls_acc": to_cpu(cls_acc).item(), + "recall50": to_cpu(recall50).item(), + "recall75": to_cpu(recall75).item(), + "precision": to_cpu(precision).item(), + "conf_obj": to_cpu(conf_obj).item(), + "conf_noobj": to_cpu(conf_noobj).item(), + "grid_size": grid_size, + } + + return output, total_loss + +class DarkNet(nn.Module): + """YOLOv3 object detection model""" + + def __init__(self, config_path=None, model_cfg=None, img_size=416): + """ + one of config_path and model_cfg is valid, not both + """ + super(DarkNet, self).__init__() + self._model_cfg = None + + self.module_defs = self.parse_model_config(path=config_path, model_cfg=model_cfg) + self.hyperparams, self.module_list = self.create_modules(self.module_defs) + self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")] + self.img_size = img_size + self.seen = 0 + self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32) + + @property + def model_cfg(self): + return self._model_cfg + + def create_modules(self, module_defs): + """ + Constructs module list of layer blocks from module configuration in module_defs + """ + + hyperparams = module_defs.pop(0) + output_filters = [int(hyperparams["channels"])] + module_list = nn.ModuleList() + + for module_i, module_def in enumerate(module_defs): + modules = nn.Sequential() + + if module_def["type"] == "convolutional": + bn = int(module_def["batch_normalize"]) + filters = int(module_def["filters"]) + kernel_size = int(module_def["size"]) + pad = (kernel_size - 1) // 2 + modules.add_module( + f"conv_{module_i}", + nn.Conv2d( + in_channels=output_filters[-1], + out_channels=filters, + kernel_size=kernel_size, + stride=int(module_def["stride"]), + padding=pad, + bias=not bn, + ), + ) + if bn: + modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5)) + if module_def["activation"] == "leaky": + modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1)) + + elif module_def["type"] == "maxpool": + kernel_size = int(module_def["size"]) + stride = int(module_def["stride"]) + if kernel_size == 2 and stride == 1: + modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1))) + maxpool = 
nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2)) + modules.add_module(f"maxpool_{module_i}", maxpool) + + elif module_def["type"] == "upsample": + upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest") + modules.add_module(f"upsample_{module_i}", upsample) + + elif module_def["type"] == "route": + layers = [int(x) for x in module_def["layers"].split(",")] + filters = sum([output_filters[1:][i] for i in layers]) + modules.add_module(f"route_{module_i}", EmptyLayer()) + + elif module_def["type"] == "shortcut": + filters = output_filters[1:][int(module_def["from"])] + modules.add_module(f"shortcut_{module_i}", EmptyLayer()) + + elif module_def["type"] == "yolo": + anchor_idxs = [int(x) for x in module_def["mask"].split(",")] + # Extract anchors + anchors = [int(x) for x in module_def["anchors"].split(",")] + anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] + anchors = [anchors[i] for i in anchor_idxs] + num_classes = int(module_def["classes"]) + img_size = int(hyperparams["height"]) + # Define detection layer + yolo_layer = YOLOLayer(anchors, num_classes, img_size) + modules.add_module(f"yolo_{module_i}", yolo_layer) + # Register module list and number of output filters + module_list.append(modules) + output_filters.append(filters) + + return hyperparams, module_list + + def forward(self, x, targets=None): + img_dim = x.shape[2] + loss = 0 + layer_outputs, yolo_outputs = [], [] + for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + if module_def["type"] in ["convolutional", "upsample", "maxpool"]: + x = module(x) + elif module_def["type"] == "route": + x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1) + elif module_def["type"] == "shortcut": + layer_i = int(module_def["from"]) + x = layer_outputs[-1] + layer_outputs[layer_i] + elif module_def["type"] == "yolo": + x, layer_loss = module[0](x, targets, img_dim) + loss += layer_loss + yolo_outputs.append(x) + layer_outputs.append(x) + yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1)) + return yolo_outputs if targets is None else (loss, yolo_outputs) + + def load_darknet_weights(self, weights_path): + """Parses and loads the weights stored in 'weights_path'""" + + # Open the weights file + with open(weights_path, "rb") as f: + header = np.fromfile(f, dtype=np.int32, count=5) # First five are header values + self.header_info = header # Needed to write header when saving weights + self.seen = header[3] # number of images seen during training + weights = np.fromfile(f, dtype=np.float32) # The rest are weights + + # Establish cutoff for loading backbone weights + cutoff = None + if "darknet53.conv.74" in weights_path: + cutoff = 75 + + ptr = 0 + for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): + if i == cutoff: + break + if module_def["type"] == "convolutional": + conv_layer = module[0] + if module_def["batch_normalize"]: + # Load BN bias, weights, running mean and running variance + bn_layer = module[1] + num_b = bn_layer.bias.numel() # Number of biases + # Bias + bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias) + bn_layer.bias.data.copy_(bn_b) + ptr += num_b + # Weight + bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight) + bn_layer.weight.data.copy_(bn_w) + ptr += num_b + # Running Mean + bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean) + 
bn_layer.running_mean.data.copy_(bn_rm) + ptr += num_b + # Running Var + bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var) + bn_layer.running_var.data.copy_(bn_rv) + ptr += num_b + else: + # Load conv. bias + num_b = conv_layer.bias.numel() + conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias) + conv_layer.bias.data.copy_(conv_b) + ptr += num_b + # Load conv. weights + num_w = conv_layer.weight.numel() + conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight) + conv_layer.weight.data.copy_(conv_w) + ptr += num_w + + def parse_model_config(self, path=None, model_cfg=None): + """Parses the yolo-v3 layer configuration file and returns module definitions""" + if model_cfg is None: + if path is None: + raise ValueError("path and model_cfg should not both be None") + + file = open(path, 'r') + lines = file.read().split('\n') + lines = [x for x in lines if x and not x.startswith('#')] + self._model_cfg = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces + else: + self._model_cfg = model_cfg + + module_defs = [] + for line in self._model_cfg: + if line.startswith('['): # This marks the start of a new block + module_defs.append({}) + module_defs[-1]['type'] = line[1:-1].rstrip() + if module_defs[-1]['type'] == 'convolutional': + module_defs[-1]['batch_normalize'] = 0 + else: + key, value = line.split("=") + value = value.strip() + module_defs[-1][key.rstrip()] = value.strip() + + return module_defs + + def save_darknet_weights(self, path, cutoff=-1): + """ + @:param path - path of the new weights file + @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) + """ + fp = open(path, "wb") + self.header_info[3] = self.seen + self.header_info.tofile(fp) + + # Iterate through layers + for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): + if module_def["type"] == "convolutional": + conv_layer = module[0] + # If batch norm, load bn first + if module_def["batch_normalize"]: + bn_layer = module[1] + bn_layer.bias.data.cpu().numpy().tofile(fp) + bn_layer.weight.data.cpu().numpy().tofile(fp) + bn_layer.running_mean.data.cpu().numpy().tofile(fp) + bn_layer.running_var.data.cpu().numpy().tofile(fp) + # Load conv bias + else: + conv_layer.bias.data.cpu().numpy().tofile(fp) + # Load conv weights + conv_layer.weight.data.cpu().numpy().tofile(fp) + + fp.close() \ No newline at end of file diff --git a/singa_auto/darknet/utils.py b/singa_auto/darknet/utils.py new file mode 100644 index 00000000..9a2ef11c --- /dev/null +++ b/singa_auto/darknet/utils.py @@ -0,0 +1,352 @@ +import numpy as np +import torch +import torch.nn.functional as F + + +def ap_per_class(tp, conf, pred_cls, target_cls): + """ Compute the average precision, given the recall and precision curves. + Source: https://github.com/rafaelpadilla/Object-Detection-Metrics. + # Arguments + tp: True positives (list). + conf: Objectness value from 0-1 (list). + pred_cls: Predicted object classes (list). + target_cls: True object classes (list). + # Returns + The average precision as computed in py-faster-rcnn. 
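+        Predictions are first sorted by descending objectness; for each class, cumulative
+        TP/FP counts yield the recall and precision curves, AP is computed from that curve,
+        and F1 is the harmonic mean of the final precision and recall.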
+ """ + + # Sort by objectness + i = np.argsort(-conf) + tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] + + # Find unique classes + unique_classes = np.unique(target_cls) + + # Create Precision-Recall curve and compute AP for each class + ap, p, r = [], [], [] + for c in unique_classes: + i = pred_cls == c + n_gt = (target_cls == c).sum() # Number of ground truth objects + n_p = i.sum() # Number of predicted objects + + if n_p == 0 and n_gt == 0: + continue + elif n_p == 0 or n_gt == 0: + ap.append(0) + r.append(0) + p.append(0) + else: + # Accumulate FPs and TPs + fpc = (1 - tp[i]).cumsum() + tpc = (tp[i]).cumsum() + + # Recall + recall_curve = tpc / (n_gt + 1e-16) + r.append(recall_curve[-1]) + + # Precision + precision_curve = tpc / (tpc + fpc) + p.append(precision_curve[-1]) + + # AP from recall-precision curve + ap.append(compute_ap(recall_curve, precision_curve)) + + # Compute F1 score (harmonic mean of precision and recall) + p, r, ap = np.array(p), np.array(r), np.array(ap) + f1 = 2 * p * r / (p + r + 1e-16) + + return p, r, ap, f1, unique_classes.astype("int32") + + +def bbox_iou(box1, box2, x1y1x2y2=True): + """ + Returns the IoU of two bounding boxes + """ + if not x1y1x2y2: + # Transform from center and width to exact coordinates + b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 + b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 + b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 + b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 + else: + # Get the coordinates of bounding boxes + b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] + b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] + + # get the corrdinates of the intersection rectangle + inter_rect_x1 = torch.max(b1_x1, b2_x1) + inter_rect_y1 = torch.max(b1_y1, b2_y1) + inter_rect_x2 = torch.min(b1_x2, b2_x2) + inter_rect_y2 = torch.min(b1_y2, b2_y2) + + # Intersection area + inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * torch.clamp( + inter_rect_y2 - inter_rect_y1 + 1, min=0 + ) + + # Union Area + b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) + b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) + + iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) + + return iou + + +def bbox_wh_iou(wh1, wh2): + wh2 = wh2.t() + w1, h1 = wh1[0], wh1[1] + w2, h2 = wh2[0], wh2[1] + inter_area = torch.min(w1, w2) * torch.min(h1, h2) + union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area + return inter_area / union_area + + +def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thresh): + + # ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor + ByteTensor = torch.cuda.BoolTensor if pred_boxes.is_cuda else torch.BoolTensor + FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor + + nB = pred_boxes.size(0) + nA = pred_boxes.size(1) + nC = pred_cls.size(-1) + nG = pred_boxes.size(2) + + # Output tensors + obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0) + noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1) + class_mask = FloatTensor(nB, nA, nG, nG).fill_(0) + iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0) + tx = FloatTensor(nB, nA, nG, nG).fill_(0) + ty = FloatTensor(nB, nA, nG, nG).fill_(0) + tw = FloatTensor(nB, nA, nG, nG).fill_(0) + th = FloatTensor(nB, nA, nG, nG).fill_(0) + tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0) + + + # # note: solver multi gpu problem + # target = target[target.sum(dim=1) 
!= 0] + + # Convert to position relative to box + target_boxes = target[:, 2:6] * nG + gxy = target_boxes[:, :2] + gwh = target_boxes[:, 2:] + # Get anchors with best iou + ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors]) + best_ious, best_n = ious.max(0) + # Separate target values + b, target_labels = target[:, :2].long().t() + gx, gy = gxy.t() + gw, gh = gwh.t() + gi, gj = gxy.long().t() + + # prevent index out of boundary + gi = gi.clamp(0, nG - 1) + gj = gj.clamp(0, nG - 1) + + # Set masks + obj_mask[b, best_n, gj, gi] = 1 + noobj_mask[b, best_n, gj, gi] = 0 + + # Set noobj mask to zero where iou exceeds ignore threshold + for i, anchor_ious in enumerate(ious.t()): + noobj_mask[b[i], anchor_ious > ignore_thresh, gj[i], gi[i]] = 0 + + # Coordinates + tx[b, best_n, gj, gi] = gx - gx.floor() + ty[b, best_n, gj, gi] = gy - gy.floor() + # Width and height + tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16) + th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16) + # One-hot encoding of label + tcls[b, best_n, gj, gi, target_labels] = 1 + # Compute label correctness and iou at best anchor + class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float() + iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False) + + tconf = obj_mask.float() + return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf + + +def compute_ap(recall, precision): + """ + Compute the average precision, given the recall and precision curves. + Code originally from https://github.com/rbgirshick/py-faster-rcnn. + + # Arguments + recall: The recall curve (list). + precision: The precision curve (list). + # Returns + The average precision as computed in py-faster-rcnn. 
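+        AP is computed with all-point interpolation: the precision curve is first made
+        monotonically non-increasing (its upper envelope), then AP is the sum of
+        (delta recall) * precision over the points where recall changes.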
+ """ + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.0], recall, [1.0])) + mpre = np.concatenate(([0.0], precision, [0.0])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def get_batch_statistics(outputs, targets, iou_thresh): + """ Compute true positives, predicted scores and predicted labels per sample """ + batch_metrics = [] + for sample_i in range(len(outputs)): + + if outputs[sample_i] is None: + continue + + output = outputs[sample_i] + pred_boxes = output[:, :4] + pred_scores = output[:, 4] + pred_labels = output[:, -1] + + true_positives = np.zeros(pred_boxes.shape[0]) + + annotations = targets[targets[:, 0] == sample_i][:, 1:] + target_labels = annotations[:, 0] if len(annotations) else [] + if len(annotations): + detected_boxes = [] + target_boxes = annotations[:, 1:] + + for pred_i, (pred_box, pred_label) in enumerate(zip(pred_boxes, pred_labels)): + + # If targets are found break + if len(detected_boxes) == len(annotations): + break + + # Ignore if label is not one of the target labels + if pred_label not in target_labels: + continue + + iou, box_index = bbox_iou(pred_box.unsqueeze(0), target_boxes).max(0) + if iou >= iou_thresh and box_index not in detected_boxes: + true_positives[pred_i] = 1 + detected_boxes += [box_index] + batch_metrics.append([true_positives, pred_scores, pred_labels]) + return batch_metrics + + +def non_max_suppression(prediction, conf_thresh=0.5, nms_thresh=0.4): + """ + Removes detections with lower object confidence score than 'conf_thresh' and performs + Non-Maximum Suppression to further filter detections. 
+ Returns detections with shape: + (x1, y1, x2, y2, object_conf, class_score, class_pred) + """ + + # From (center x, center y, width, height) to (x1, y1, x2, y2) + prediction[..., :4] = xywh2xyxy(prediction[..., :4]) + output = [None for _ in range(len(prediction))] + for image_i, image_pred in enumerate(prediction): + # Filter out confidence scores below threshold + image_pred = image_pred[image_pred[:, 4] >= conf_thresh] + + # If none are remaining => process next image + if not image_pred.size(0): + continue + + # Object confidence times class confidence + score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0] + + # Sort by it + image_pred = image_pred[(-score).argsort()] + class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True) + detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1) + + # Perform non-maximum suppression + keep_boxes = [] + try_round = 1000 # avoiding infinite loop + while detections.size(0) and try_round >= 0: + try_round -= 1 + + large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thresh + label_match = detections[0, -1] == detections[:, -1] + + # Indices of boxes with lower confidence scores, large IOUs and matching labels + invalid = large_overlap & label_match + weights = detections[invalid, 4:5] + + # Merge overlapping bboxes by order of confidence + detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum() + keep_boxes += [detections[0]] + detections = detections[~invalid] + + if keep_boxes: + output[image_i] = torch.stack(keep_boxes) + + return output + + +def pad_to_square(img, pad_value): + c, h, w = img.shape + dim_diff = np.abs(h - w) + + # (upper / left) padding and (lower / right) padding + pad1, pad2 = dim_diff // 2, dim_diff - dim_diff // 2 + + # Determine padding + pad = (0, 0, pad1, pad2) if h <= w else (pad1, pad2, 0, 0) + + # Add padding + img = F.pad(img, pad, "constant", value=pad_value) + + return img, pad + + +def rescale_boxes(boxes, current_dim, original_shape): + """ Rescales bounding boxes to the original shape """ + orig_h, orig_w = original_shape + + # The amount of padding that was added + pad_x = max(orig_h - orig_w, 0) * (current_dim / max(original_shape)) + pad_y = max(orig_w - orig_h, 0) * (current_dim / max(original_shape)) + + # Image height and width after padding is removed + unpad_h = current_dim - pad_y + unpad_w = current_dim - pad_x + + # Rescale bounding boxes to dimension of original image + boxes[:, 0] = ((boxes[:, 0] - pad_x // 2) / unpad_w) * orig_w + boxes[:, 1] = ((boxes[:, 1] - pad_y // 2) / unpad_h) * orig_h + boxes[:, 2] = ((boxes[:, 2] - pad_x // 2) / unpad_w) * orig_w + boxes[:, 3] = ((boxes[:, 3] - pad_y // 2) / unpad_h) * orig_h + + return boxes + + +def resize(image, size): + image = F.interpolate(image.unsqueeze(0), size=size, mode="nearest").squeeze(0) + return image + + +def to_cpu(tensor): + return tensor.detach().cpu() + + +def weights_init_normal(m): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + torch.nn.init.normal_(m.weight.data, 0.0, 0.02) + elif classname.find("BatchNorm2d") != -1: + torch.nn.init.normal_(m.weight.data, 1.0, 0.02) + torch.nn.init.constant_(m.bias.data, 0.0) + + +def xywh2xyxy(x): + y = x.new(x.shape) + y[..., 0] = x[..., 0] - x[..., 2] / 2 + y[..., 1] = x[..., 1] - x[..., 3] / 2 + y[..., 2] = x[..., 0] + x[..., 2] / 2 + y[..., 3] = x[..., 1] + x[..., 3] / 2 + return y \ No newline at end of file diff --git a/singa_auto/darknet/yolov3.cfg 
b/singa_auto/darknet/yolov3.cfg new file mode 100644 index 00000000..946e0154 --- /dev/null +++ b/singa_auto/darknet/yolov3.cfg @@ -0,0 +1,788 @@ +[net] +# Testing +#batch=1 +#subdivisions=1 +# Training +batch=16 +subdivisions=1 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 
+filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[shortcut] +from=-3 +activation=linear + +###################### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 6,7,8 +anchors = 
10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 61 + + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 3,4,5 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 + + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 36 + + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=255 +activation=linear + + +[yolo] +mask = 0,1,2 +anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 +classes=80 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 diff --git a/singa_auto/datasets/dataset.py b/singa_auto/datasets/dataset.py index 5155a42c..38669207 100644 --- a/singa_auto/datasets/dataset.py +++ b/singa_auto/datasets/dataset.py @@ -202,6 +202,7 @@ def _(image_bytes): pil_images = [] for image_path in image_paths: pil_images.append(load(image_path)) + images = np.array([np.asarray(x) for x in pil_images]) return pil_images @@ -220,6 +221,7 @@ class CorpusDataset(ModelDataset): ''' def __init__(self, dataset_path, tags, split_by): + super().__init__(dataset_path) self.tags = tags (self.size, self.tag_num_classes, self.max_token_len, self.max_sent_len, self._sents) = \ self._load(dataset_path, self.tags, split_by) @@ -289,10 +291,13 @@ class AudioFilesDataset(ModelDataset): ''' def __init__(self, dataset_path, dataset_dir): + super().__init__(dataset_path) self._dataset_dir = dataset_dir self.df = self._load(dataset_path) + def __getitem__(self, idx): return self.df.iloc[idx] + def _load(self, dataset_path): ''' Loading the dataset into a pandas dataframe. Called in the class __init__ method. 
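For reference, a minimal sketch of how each [yolo] block's "mask" selects its anchor pairs. It mirrors the yolo branch of DarkNet.create_modules in singa_auto/darknet/model.py above, with the mask/anchors values copied from the first [yolo] block of yolov3.cfg; it is only an illustration, not part of the patch.

    # One [yolo] block as create_modules would see it after parse_model_config.
    module_def = {
        "mask": "6,7,8",
        "anchors": "10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326",
        "classes": "80",
    }

    # Same parsing steps as the yolo branch of DarkNet.create_modules:
    anchor_idxs = [int(x) for x in module_def["mask"].split(",")]        # [6, 7, 8]
    anchors = [int(x) for x in module_def["anchors"].split(",")]
    anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
    anchors = [anchors[i] for i in anchor_idxs]

    print(anchors)  # [(116, 90), (156, 198), (373, 326)] -- the largest anchors, used by the coarsest grid

Each of the three detection heads therefore works with its own three anchor pairs, selected from the shared nine-anchor list by its mask.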
diff --git a/singa_auto/datasets/dataset_base.py b/singa_auto/datasets/dataset_base.py index 4df12f41..ec1b7f63 100644 --- a/singa_auto/datasets/dataset_base.py +++ b/singa_auto/datasets/dataset_base.py @@ -12,8 +12,8 @@ def _load_pil_image(image_path, mode='RGB'): try: with open(image_path, 'rb') as f: - encoded = io.BytesIO(f.read()) - pil_image = Image.open(encoded).convert(mode) + #encoded = io.BytesIO(f.read()) + pil_image = Image.open(image_path).convert(mode) except: print('error accurs when handling : ', image_path) raise diff --git a/singa_auto/datasets/image_classification_dataset.py b/singa_auto/datasets/image_classification_dataset.py index cf72e779..95a3d03f 100644 --- a/singa_auto/datasets/image_classification_dataset.py +++ b/singa_auto/datasets/image_classification_dataset.py @@ -9,6 +9,9 @@ import numpy as np from singa_auto.datasets.dataset_base import _load_pil_image, ClfModelDataset import pandas as pd +import logging +logger = logging.getLogger(__name__) + class ImageDataset4Clf(ClfModelDataset): @@ -29,10 +32,12 @@ def __init__(self, self.mode = mode self.path = dataset_path self.dataset_zipfile = None + self.label_mapper = dict() (self._image_names, self._image_classes, self.size, self.classes) = self._extract_zip(self.path) + self.min_image_size = min_image_size self.max_image_size = max_image_size - self.label_mapper = dict() + self.image_size = None if if_shuffle: (self._image_names, @@ -44,6 +49,7 @@ def __getitem__(self, index): raise StopIteration try: pil_image = self._extract_item(item_path=self._image_names[index]) + (image, image_size) = self._preprocess(pil_image, self.min_image_size, self.max_image_size) @@ -53,6 +59,8 @@ def __getitem__(self, index): return (image, image_class) except: + logging.error('getitem') + logging.error(self._image_names[index]) raise def _preprocess(self, pil_image, min_image_size, max_image_size): @@ -78,46 +86,69 @@ def _extract_item(self, item_path): with tempfile.TemporaryDirectory() as d: extracted_item_path = self.dataset_zipfile.extract(item_path, path=d) + + pil_image = _load_pil_image(extracted_item_path, mode=self.mode) return pil_image def _extract_zip(self, dataset_path): + + flag=0 self.dataset_zipfile = zipfile.ZipFile(dataset_path, 'r') - if 'images.csv' in self.dataset_zipfile.namelist(): + print(self.dataset_zipfile.namelist()) + with tempfile.TemporaryDirectory() as d: + for fileName in self.dataset_zipfile.namelist(): + if fileName.endswith('class_name.csv'): + class_csv_path = self.dataset_zipfile.extract(fileName, + path=d) + + csv = pd.read_csv(class_csv_path) + name = csv[csv.columns[1]] + label = csv[csv.columns[0]] + for single_name,single_label in zip(name,label): + self.label_mapper[str(single_label)]=single_name + print('label_mapper') + print(self.label_mapper) + + for fileName in self.dataset_zipfile.namelist(): + if fileName.endswith('images.csv'): + flag=1 # Create temp directory to unzip to extract paths/classes/numbers only, # no actual images would be extracted - with tempfile.TemporaryDirectory() as d: - # obtain csv file - for fileName in self.dataset_zipfile.namelist(): - if fileName.endswith('.csv'): - # Extract a single csv file from zip - images_csv_path = self.dataset_zipfile.extract(fileName, - path=d) - break - try: - csv = pd.read_csv(images_csv_path) - image_classes = csv[csv.columns[1:]] - image_paths = csv[csv.columns[0]] - except: - traceback.print_stack() - raise - num_classes = len(csv[csv.columns[1]].unique()) - num_labeled_samples = len(csv[csv.columns[0]].unique()) - 
image_classes = tuple(np.array(image_classes).squeeze().tolist()) - image_paths = tuple(image_paths) - - else: - # make image name list and remove dir from list - image_paths = [ - x for x in self.dataset_zipfile.namelist() - if x.endswith('/') == False - ] - num_labeled_samples = len(image_paths) - str_labels = [os.path.dirname(x) for x in image_paths] - self.str_labels_set = list(set(str_labels)) - num_classes = len(self.str_labels_set) - image_classes = [self.str_labels_set.index(x) for x in str_labels] + if flag==1: + with tempfile.TemporaryDirectory() as d: + # obtain csv file + for fileName in self.dataset_zipfile.namelist(): + if fileName.endswith('images.csv'): + # Extract a single csv file from zip + images_csv_path = self.dataset_zipfile.extract(fileName, + path=d) + break + try: + csv = pd.read_csv(images_csv_path) + image_classes = csv[csv.columns[1]] + image_paths = csv[csv.columns[0]] + print(image_classes) + except: + traceback.print_stack() + raise + num_classes = len(csv[csv.columns[1]].unique()) + num_labeled_samples = len(csv[csv.columns[0]].unique()) + image_classes = tuple(np.array(image_classes).squeeze().tolist()) + image_paths = tuple(image_paths) + + else: + # make image name list and remove dir from list + image_paths = [ + x for x in self.dataset_zipfile.namelist() + if x.endswith('/') == False + ] + num_labeled_samples = len(image_paths) + str_labels = [os.path.dirname(x) for x in image_paths] + self.str_labels_set = list(set(str_labels)) + num_classes = len(self.str_labels_set) + image_classes = [self.str_labels_set.index(x) for x in str_labels] return (image_paths, image_classes, num_labeled_samples, num_classes) def _shuffle(self, images, classes): diff --git a/singa_auto/datasets/image_detection_dataset.py b/singa_auto/datasets/image_detection_dataset.py index 5a700f24..9b20c67b 100644 --- a/singa_auto/datasets/image_detection_dataset.py +++ b/singa_auto/datasets/image_detection_dataset.py @@ -1,15 +1,29 @@ import copy +import cv2 +import itertools +import json import numpy as np import os -import zipfile +import random import tempfile -from singa_auto.datasets.dataset_base import DetectionModelDataset +import time import torch import torch.utils.data +import zipfile + from PIL import Image +from PIL import ImageFile +ImageFile.LOAD_TRUNCATED_IMAGES = True + +from collections import defaultdict from pycocotools.coco import COCO +from torchvision.transforms import transforms + +from singa_auto.darknet.utils import pad_to_square, resize +from singa_auto.datasets.dataset_base import DetectionModelDataset from singa_auto.datasets.torch_utils import get_transform + COCO_INSTANCE_CATEGORY_NAMES = [ '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign', @@ -26,6 +40,487 @@ ] +def fetch_from_train_set(root_path, split_ratio=0.8): + image_train_folder = os.path.join(root_path, "train", "image") + image_val_folder = os.path.join(root_path, "val", "image") + annotation_train_folder = os.path.join(root_path, "train", "annotation") + annotation_val_folder = os.path.join(root_path, "val", "annotation") + + os.makedirs(image_val_folder, exist_ok=True) + os.makedirs(annotation_val_folder, exist_ok=True) + + list_image = list(sorted(os.listdir(image_train_folder))) + list_annotation = list(sorted(os.listdir(annotation_train_folder))) + + union_list = [] + for image_name in list_image: + base_name, _ = os.path.splitext(image_name) + + if base_name + ".json" in 
list_annotation: + union_list.append(image_name) + + disordered_index = np.random.permutation(range(len(union_list))) + val_list = disordered_index[np.int(len(union_list) * split_ratio):] + import shutil + + for image_idx in val_list: + image_name = union_list[image_idx] + annotation_name = os.path.splitext(image_name)[0] + ".json" + + shutil.move(os.path.join(image_train_folder, image_name), os.path.join(image_val_folder, image_name)) + shutil.move(os.path.join(annotation_train_folder, annotation_name), os.path.join(annotation_val_folder, annotation_name)) + + +def split_dataset(root_path, split_ratio=0.8): + image_path = os.path.join(root_path, "image") + annotation_path = os.path.join(root_path, "annotation") + + image_train_folder = os.path.join(root_path, "train", "image") + image_val_folder = os.path.join(root_path, "val", "image") + annotation_train_folder = os.path.join(root_path, "train", "annotation") + annotation_val_folder = os.path.join(root_path, "val", "annotation") + + os.makedirs(image_train_folder, exist_ok=True) + os.makedirs(image_val_folder, exist_ok=True) + os.makedirs(annotation_train_folder, exist_ok=True) + os.makedirs(annotation_val_folder, exist_ok=True) + + list_image = list(sorted(os.listdir(image_path))) + list_annotation = list(sorted(os.listdir(annotation_path))) + + union_list = [] + for image_name in list_image: + base_name, _ = os.path.splitext(image_name) + + if base_name + ".json" in list_annotation: + union_list.append(image_name) + + disordered_index = np.random.permutation(range(len(union_list))) + train_list = disordered_index[:np.int(len(union_list) * split_ratio)] + val_list = disordered_index[np.int(len(union_list) * split_ratio):] + + import shutil + for image_idx, image_name in enumerate(union_list): + annotation_name = os.path.splitext(image_name)[0] + ".json" + + if image_idx in train_list: + shutil.copy(os.path.join(image_path, image_name), os.path.join(image_train_folder, image_name)) + shutil.copy(os.path.join(annotation_path, annotation_name), os.path.join(annotation_train_folder, annotation_name)) + else: + shutil.copy(os.path.join(image_path, image_name), os.path.join(image_val_folder, image_name)) + shutil.copy(os.path.join(annotation_path, annotation_name), os.path.join(annotation_val_folder, annotation_name)) + + +class YoloCoco(object): + def __init__(self, annotation_path=None, is_single_json_file=False): + """ + dataset for YOLO, according with coco + @ annotation_path: annotation path, filename if a single json, folder path is multiple jsons + """ + self.dataset, self.anns, self.cats, self.imgs = dict(), dict(), dict(), dict() + self.img_to_ann, self.cat_to_img = defaultdict(list), defaultdict(list) + + if annotation_path is not None: + print("loading annotations into memory") + tic = time.time() + + if is_single_json_file: + # load annotations from single json + with open(annotation_path, 'r') as f: + dataset = json.load(f) + else: + # load annotations from json files + dataset = self.load_scattered_json(annotation_path) + + assert type(dataset)==dict, "annotation file format {} not supported".format(type(dataset)) + print("Done (t={:0.2f}s)".format(time.time()- tic)) + self.dataset = dataset + else: + raise ValueError("annotation_path should not be None") + + self.create_index() + + def _is_array_like(self, obj): + return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + + def load_scattered_json(self, annotation_path): + """ + merge annotation into a dataset, in accordancy with pycocotool + """ + list_annotation = 
list(sorted(os.listdir(annotation_path))) + + dataset = { + "images": list(), + "annotations": list(), + "categories": list(), + } + + dict_category = dict() + dict_image = dict() + last_category_id = 0 + last_annotation_id = 0 + last_image_id = 0 + + # for all json files + for annotation_idx, annotation_filename in enumerate(list_annotation): + with open(os.path.join(annotation_path, annotation_filename), 'r') as f: + json_info = json.load(f) + + # process image info + image_id = int(json_info["imagePath"][15:-4]) + if image_id not in dict_image: + dict_image[image_id] = last_image_id + last_image_id += 1 + + image_info = { + "file_name": json_info["imagePath"], + "height": json_info["imageHeight"], + "width": json_info["imageWidth"], + "id": image_id, + } + + dataset["images"].append(image_info) + + # process bounding box information + for bounding_box_info in json_info["shapes"]: + if bounding_box_info["label"] not in dict_category: + dict_category[bounding_box_info["label"]] = last_category_id + + category_info = { + "id": last_category_id, + "name":bounding_box_info["label"], + } + + dataset["categories"].append(category_info) + last_category_id += 1 + + annotation_info = { + "image_id": image_id, + "bbox": list(np.array(np.concatenate((bounding_box_info["points"][0], bounding_box_info["points"][1]), axis=0), dtype=np.int)), + "category_id": dict_category[bounding_box_info["label"]], + "id": last_annotation_id, + } + last_annotation_id += 1 + + dataset["annotations"].append(annotation_info) + return dataset + + def create_index(self): + print("creating index") + anns, cats, imgs = dict(), dict(), dict() + img_to_ann, cat_to_img = defaultdict(list), defaultdict(list) + + if "annotations" in self.dataset: + for ann in self.dataset["annotations"]: + img_to_ann[ann["image_id"]].append(ann) + anns[ann["id"]] = ann + + if "images" in self.dataset: + for img in self.dataset["images"]: + imgs[img["id"]] = img + + if "categories" in self.dataset: + for cat in self.dataset["categories"]: + cats[cat["id"]] = cat + + if "annotations" in self.dataset and "categories" in self.dataset: + for ann in self.dataset["annotations"]: + cat_to_img[ann["category_id"]].append(ann["image_id"]) + + print("index created") + + # create class member + self.anns = anns + self.cats = cats + self.imgs = imgs + self.cat_to_img = cat_to_img + self.img_to_ann = img_to_ann + + def info(self): + """ + Print information about the annotation file. + :return: + """ + for key, value in self.dataset['info'].items(): + print('{}: {}'.format(key, value)) + + def get_ann_id(self, img_id=[], cat_id=[], area_rng=[], is_crowd=None): + """ + Get ann ids that satisfy given filter conditions. default skips that filter + :param: img_id (int array) get anns for given imgs + :param: cat_id (int array) get anns for given cats + :param: area_rng (float array) get anns for given area range (e.g. 
[0 inf]) + :param: is_crowd (boolean) get anns for given crowd label (False or True) + :return: ids (int array) integer array of ann ids + """ + img_id = img_id if self._is_array_like(img_id) else [img_id] + cat_id = cat_id if self._is_array_like(cat_id) else [cat_id] + + if len(img_id) == len(cat_id) == len(area_rng) == 0: + anns = self.dataset['annotations'] + else: + if not len(img_id) == 0: + lists = [self.img_to_ann[imgId] for imgId in img_id if imgId in self.img_to_ann] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.dataset['annotations'] + anns = anns if len(cat_id) == 0 else [ann for ann in anns if ann['category_id'] in cat_id] + anns = anns if len(area_rng) == 0 else [ann for ann in anns if ann['area'] > area_rng[0] and ann['area'] < area_rng[1]] + if not is_crowd is None: + ids = [ann['id'] for ann in anns if ann['is_crowd'] == is_crowd] + else: + ids = [ann['id'] for ann in anns] + return ids + + def get_cat_id(self, cat_nms=[], sup_nms=[], cat_id=[]): + """ + filtering parameters. default skips that filter. + :param: cat_nms (str array) : get cats for given cat names + :param: sup_nms (str array) : get cats for given supercategory names + :param: cat_id (int array) : get cats for given cat ids + :return: ids (int array) : integer array of cat ids + """ + cat_nms = cat_nms if self._is_array_like(cat_nms) else [cat_nms] + sup_nms = sup_nms if self._is_array_like(sup_nms) else [sup_nms] + cat_id = cat_id if self._is_array_like(cat_id) else [cat_id] + + if len(cat_nms) == len(sup_nms) == len(cat_id) == 0: + cats = self.dataset['categories'] + else: + cats = self.dataset['categories'] + cats = cats if len(cat_nms) == 0 else [cat for cat in cats if cat['name'] in cat_nms] + cats = cats if len(sup_nms) == 0 else [cat for cat in cats if cat['supercategory'] in sup_nms] + cats = cats if len(cat_id) == 0 else [cat for cat in cats if cat['id'] in cat_id] + ids = [cat['id'] for cat in cats] + return ids + + def get_img_id(self, img_id=[], cat_id=[]): + """ + Get img ids that satisfy given filter conditions. + :param: img_id (int array) get imgs for given ids + :param: cat_id (int array) : get imgs with all given cats + :return: ids (int array) : integer array of img ids + """ + img_id = img_id if self._is_array_like(img_id) else [img_id] + cat_id = cat_id if self._is_array_like(cat_id) else [cat_id] + + if len(img_id) == len(cat_id) == 0: + ids = self.imgs.keys() + else: + ids = set(img_id) + for i, cat_id in enumerate(cat_id): + if i == 0 and len(ids) == 0: + ids = set(self.cat_to_img[cat_id]) + else: + # original &=, but should be |= + ids |= set(self.cat_to_img[cat_id]) + return list(ids) + + def load_ann(self, ids=[]): + """ + Load anns with the specified ids. + :param: ids (int array) : integer ids specifying anns + :return: anns (object array) : loaded ann objects + """ + if self._is_array_like(ids): + return [self.anns[id] for id in ids] + elif type(ids) == int: + return [self.anns[ids]] + + def load_cat(self, ids=[]): + """ + Load cats with the specified ids. + :param: ids (int array) : integer ids specifying cats + :return: cats (object array) : loaded cat objects + """ + if self._is_array_like(ids): + return [self.cats[id] for id in ids] + elif type(ids) == int: + return [self.cats[ids]] + + def load_imgs(self, ids=[]): + """ + Load anns with the specified ids. 
+ :param: ids (int array) : integer ids specifying img + :return: imgs (object array) : loaded img objects + """ + if self._is_array_like(ids): + return [self.imgs[id] for id in ids] + elif type(ids) == int: + return [self.imgs[ids]] + + def load_numpy_annotation(self, data): + """ + Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} + :param: data (numpy.ndarray) + :return: annotations (python nested list) + """ + print('Converting ndarray to lists...') + assert(type(data) == np.ndarray) + print(data.shape) + assert(data.shape[1] == 7) + N = data.shape[0] + ann = [] + for i in range(N): + if i % 1000000 == 0: + print('{}/{}'.format(i,N)) + ann += [{ + 'image_id' : int(data[i, 0]), + 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], + 'score' : data[i, 5], + 'category_id': int(data[i, 6]), + }] + return ann + + + +class YoloDataset(DetectionModelDataset, torch.utils.data.Dataset): + """ + dataset of yolo + """ + def __init__(self, image_path, annotation_path, is_single_json_file, filter_classes, is_train, img_size=416, augment=True, multiscale=True, normalized_labels=True): + self.root_path = image_path + self.imgs = list(sorted(os.listdir(image_path))) + self.annotation_path = annotation_path + self.coco = YoloCoco(self.annotation_path, is_single_json_file=is_single_json_file) + # eg: filter_classes: ['person', 'dog'] + self.cat_ids = self.coco.get_cat_id(cat_nms=filter_classes) + self.ids = self.coco.get_img_id(cat_id=self.cat_ids) + + self.cat_to_label = {v: key+1 for key, v in enumerate(self.cat_ids)} + self.label_to_cat = {key+1: v for key, v in enumerate(self.cat_ids)} + + self.img_size = img_size + self.max_objects = 100 + self.augment = augment if is_train else False + self.multiscale = multiscale if is_train else False + self.normalized_labels = normalized_labels + self.min_size = self.img_size - 3 * 32 + self.max_size = self.img_size + 3 * 32 + self.batch_count = 0 + + # if os.path.exists(r"./rectangle_images/"): + # import shutil + # shutil.rmtree(r"./rectangle_images/") + # os.makedirs(r"./rectangle_images/", exist_ok=True) + + def __getitem__(self, index): + img_id = self.ids[index % len(self.ids)] + ann_id = self.coco.get_ann_id(img_id=img_id) + + img_path = os.path.join(self.root_path, self.coco.load_imgs(img_id)[0]["file_name"]) + + # Extract image as PyTorch tensor + img = transforms.ToTensor()(Image.open(img_path).convert('RGB')) + + # Handle images with less than three channels + if len(img.shape) != 3: + img = img.unsqueeze(0) + img = img.expand((3, img.shape[1:])) + + _, h, w = img.shape + h_factor, w_factor = (h, w) if self.normalized_labels else (1, 1) + # Pad to square resolution + img, pad = pad_to_square(img, 0) + _, padded_h, padded_w = img.shape + + # --------- + # Label + # --------- + coco_annotation = self.coco.load_ann(ann_id) + + tmp_label = [] + box_info = [] + for ann in coco_annotation: + if ann["category_id"] not in self.cat_ids: + continue + boxes = torch.zeros((1, 6), dtype=torch.float32) + x1 = round(max(ann['bbox'][0], 0)) + y1 = round(max(ann['bbox'][1], 0)) + x2 = round(min(x1 + ann['bbox'][2], w - 1)) + y2 = round(min(y1 + ann['bbox'][3], h - 1)) + + # Adjust for added padding + x1 += pad[0] + y1 += pad[2] + x2 += pad[1] + y2 += pad[3] + + box_info.append(((x1, y1), (x2, y2))) + + # print(x1, x2, y1, y2, padded_h, padded_w) + # Returns (x, y, w, h) + boxes[0, 2] = (x2 + x1) / 2 / padded_w + boxes[0, 3] = (y2 + y1) / 2 / padded_h + boxes[0, 4] = (x2 - x1) / padded_w + boxes[0, 5] = (y2 - 
y1) / padded_h + boxes[0, 1] = self.cat_to_label[ann["category_id"]] + tmp_label.append(boxes) + + # self.get_bounding_box(img, os.path.basename(img_path), box_info) + + # targets from list to tensor + targets = torch.cat(tmp_label, dim=0) + + # Apply augmentations + if self.augment: + if np.random.random() < 0.5: + img, targets = self.horisontal_flip(img, targets) + + return img_path, img, targets + + def __len__(self): + return len(self.ids) + + def _extract_zip(self, dataset_path, annotation_path): + dataset_zipfile = zipfile.ZipFile(dataset_path, 'r') + annotation_zipfile = zipfile.ZipFile(annotation_path, 'r') + + # create temp dir + self.root_path = tempfile.TemporaryDirectory() + + # extract images and annotations + dataset_zipfile.extractall(path=self.root_path.name) + annotation_zipfile.extractall(path=self.root_path.name) + imgs = list(sorted(os.listdir(os.path.join(self.root_path.name, self.img_folder_name)))) + annotation_file = os.path.join(self.root_path.name, "annotations", self.annotation_file_name) + + return imgs, annotation_file + + def collate_fn(self, batch): + paths, imgs, targets = list(zip(*batch)) + # Remove empty placeholder targets + targets = [boxes for boxes in targets if boxes is not None] + # Add sample index to targets + for i, boxes in enumerate(targets): + boxes[:, 0] = i + targets = torch.cat(targets, 0) + # Selects new image size every tenth batch + if self.multiscale and self.batch_count % 10 == 0: + self.img_size = random.choice(range(self.min_size, self.max_size + 1, 32)) + # Resize images to input shape + imgs = torch.stack([resize(img, self.img_size) for img in imgs]) + self.batch_count += 1 + return paths, imgs, targets + + # def get_bounding_box(self, img, basename, boxes, rect_th=3): + # """ + # draw the bounding box on img + # """ + # tmp = img.squeeze().detach().permute((1, 2, 0)).mul(255).clamp(0, 255).numpy() + # tmp = cv2.cvtColor(tmp, cv2.COLOR_RGB2BGR) + # + # for rect_info in boxes: + # cv2.rectangle(tmp, rect_info[0], rect_info[1], (0, 255, 0), rect_th) + # + # cv2.imwrite('./rectangle_images/{}'.format(basename), tmp) + + def horisontal_flip(self, images, targets): + images = torch.flip(images, [-1]) + targets[:, 2] = 1 - targets[:, 2] + return images, targets + + class PennFudanDataset(DetectionModelDataset, torch.utils.data.Dataset): def __init__(self, dataset_path, is_train): self.root_path = None diff --git a/singa_auto/datasets/image_segmentation_dataset.py b/singa_auto/datasets/image_segmentation_dataset.py new file mode 100644 index 00000000..45065657 --- /dev/null +++ b/singa_auto/datasets/image_segmentation_dataset.py @@ -0,0 +1,140 @@ +from torch.utils.data import Dataset +import numpy as np +from tqdm import tqdm +import os +from copy import deepcopy +from PIL import Image +import torch +from glob import glob + + +def ImageFetch(train_data_path, split_rate=0.9): + ''' + load image as PIL.Image into a list for dataloader, split train/val subsets automatically + train_data_path: already unzipped dataset folder path + split_rate: ratio of train/val data + ''' + folder_name = train_data_path + + image_train = [] + mask_train = [] + image_val = [] + mask_val = [] + + # split train and val subsets + images_folder = os.path.join(folder_name, "image") + masks_folder = os.path.join(folder_name, "mask") + + if not os.path.isdir(images_folder) or not os.path.isdir(masks_folder): + print("imges folder or mask folder does not exist, please check the upload file") + + image_list = sorted(glob(os.path.join(images_folder, '*'))) # use sorted 
list to control train/val split + num_img = len(image_list) + + train_num = int(num_img * split_rate) + train_list = image_list[0:train_num] + val_list = image_list[train_num:] + + # load images and masks from their folders + for idx, image_name in tqdm(enumerate(train_list), total=len(train_list), desc="load train images......"): + image_name = image_name.split('/')[-1] + + image_path = os.path.join(images_folder, image_name) + mask_path = os.path.join(masks_folder, os.path.splitext(image_name)[0] + ".png") # use image name to find the corresponding mask + + image = Image.open(image_path) + image_train.append(image) + + mask = Image.open(mask_path) + mask_train.append(mask) + + for idx, image_name in tqdm(enumerate(val_list), total=len(val_list), desc="load validation images......"): + image_name = image_name.split('/')[-1] + + image_path = os.path.join(images_folder, image_name) + mask_path = os.path.join(masks_folder, os.path.splitext(image_name)[0] + ".png") + + image = Image.open(image_path) + image_val.append(image) + + mask = Image.open(mask_path) + mask_val.append(mask) + + return image_train, mask_train, image_val, mask_val + + +def trainImageFetch(folder_name): + ''' + load train image as PIL.Image into a list for dataloader, need train/val subsets split before execution + folder_name: already unzipped train dataset folder path + ''' + image_train = [] + mask_train = [] + + # load images and masks from their folders + images_folder = os.path.join(folder_name, "train", "image") + masks_folder = os.path.join(folder_name, "train", "mask") + image_list = os.listdir(images_folder) + for idx, image_name in tqdm(enumerate(image_list), total=len(image_list), desc="load train images......"): + image_path = os.path.join(images_folder, image_name) + mask_path = os.path.join(masks_folder, os.path.splitext(image_name)[0] + ".png") + + image = Image.open(image_path) + image_train.append(image) + + mask = Image.open(mask_path) + mask_train.append(mask) + + return image_train, mask_train + + +def valImageFetch(folder_name): + ''' + load validation image as PIL.Image into a list for dataloader, need train/val subsets split before execution + folder_name: already unzipped validation dataset folder path + ''' + image_val = [] + mask_val = [] + + images_folder = os.path.join(folder_name, "val", "image") + masks_folder = os.path.join(folder_name, "val", "mask") + + image_list = os.listdir(images_folder) + for idx, image_name in tqdm(enumerate(image_list), total=len(image_list), desc="load validation images......"): + image_path = os.path.join(images_folder, image_name) + mask_path = os.path.join(masks_folder, os.path.splitext(image_name)[0] + ".png") + + image = Image.open(image_path) + image_val.append(image) + + mask = Image.open(mask_path) + mask_val.append(mask) + + return image_val, mask_val + + +class SegDataset(Dataset): + ''' + prepare image dataset with certain transforms + ''' + def __init__(self, image_list, mask_list, transform_img, transform_mask): + self.transform_img = transform_img + self.transform_mask = transform_mask + self.imagelist = image_list + self.masklist = mask_list + + + def __len__(self): + return len(self.imagelist) + + + def __getitem__(self, idx): + image = deepcopy(self.imagelist[idx]) + mask = deepcopy(self.masklist[idx]) + + image = self.transform_img(image) # apply transform + + mask = self.transform_mask(mask) + mask = torch.as_tensor(np.array(mask), dtype=torch.int64) # mask transform does not contain to_tensor function + + return image, mask \ No newline at end 
of file diff --git a/singa_auto/meta_store/meta_store.py b/singa_auto/meta_store/meta_store.py index 1f84729a..91be3c78 100644 --- a/singa_auto/meta_store/meta_store.py +++ b/singa_auto/meta_store/meta_store.py @@ -422,13 +422,16 @@ def get_service(self, service_id): service = self._session.query(Service).get(service_id) return service - def get_services(self, status=None): + def get_services(self, status=None, service_type=None): query = self._session.query(Service) if status is not None: # pylint: disable=E1111 query = query.filter(Service.status == status) - + + if service_type is not None: + query = query.filter(Service.service_type == service_type) + services = query.all() return services diff --git a/singa_auto/meta_store/requirements.txt b/singa_auto/meta_store/requirements.txt index 0bb6f98e..760235e8 100644 --- a/singa_auto/meta_store/requirements.txt +++ b/singa_auto/meta_store/requirements.txt @@ -1,2 +1,2 @@ SQLAlchemy==1.3.0 -psycopg2==2.7.5 \ No newline at end of file +psycopg2-binary==2.8.6 \ No newline at end of file diff --git a/singa_auto/model/__init__.py b/singa_auto/model/__init__.py index 172baed3..0d644c3d 100644 --- a/singa_auto/model/__init__.py +++ b/singa_auto/model/__init__.py @@ -20,6 +20,7 @@ from .image_classification import ImageClfBase from .model import BaseModel, Params, KnobConfig, Knobs from .object_detection import ObjtDetModel +from .image_segmentation import SegmentationModel from .post_tagging import PosTagModel from .tabular_classification import TabularClfModel from .log import LoggerUtils diff --git a/singa_auto/model/dev.py b/singa_auto/model/dev.py index ec16955c..bb483e85 100644 --- a/singa_auto/model/dev.py +++ b/singa_auto/model/dev.py @@ -209,8 +209,9 @@ def tune_model( return (best_proposal, best_model_test_score, best_params) -def make_predictions(queries: List[Any], task: str, - py_model_class: Type[BaseModel], proposal: Proposal, fine_tune_dataset_path, +def make_predictions_json(queries: List[Any], task: str, + py_model_class: Type[BaseModel], proposal: Proposal, + fine_tune_dataset_path, params: Params) -> List[Any]: inference_cache: InferenceCache = InferenceCache() worker_id = 'local' @@ -249,8 +250,8 @@ def make_predictions(queries: List[Any], task: str, _print_header('Making predictions with trained model...') predictions = model_inst.predict([x.query for x in queries_at_worker]) - - predictions = [Prediction(x, query.id, worker_id) + predictions = [ + Prediction(x, query.id, worker_id) for (x, query) in zip(predictions, queries_at_worker) ] @@ -265,6 +266,7 @@ def make_predictions(queries: List[Any], task: str, assert prediction is not None predictions_at_predictor.append(prediction) + # Predictor ensembles predictions ensemble_method = get_ensemble_method(task) print(f'Ensemble method: {ensemble_method}') out_predictions = [] @@ -341,7 +343,7 @@ def test_model_class(model_file_path: str, model_inst = None predictions = None if best_proposal is not None and best_params is not None and queries is not None: - (predictions, model_inst) = make_predictions(queries, task, + (predictions, model_inst) = make_predictions_json(queries, task, py_model_class, best_proposal, fine_tune_dataset_path, best_params) @@ -546,4 +548,4 @@ class DeprecatedModelUtils(): def _print_header(msg): print('-' * (len(msg) + 4)) print('| {} |'.format(msg)) - print('-' * (len(msg) + 4)) \ No newline at end of file + print('-' * (len(msg) + 4)) diff --git a/singa_auto/model/image_segmentation.py b/singa_auto/model/image_segmentation.py new file mode 100644 index 
00000000..c235f8b9 --- /dev/null +++ b/singa_auto/model/image_segmentation.py @@ -0,0 +1,26 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +from .model import BaseModel + + +class SegmentationModel(BaseModel): + # TODO Find some thing in common for this task, and and abstract them to here + pass + diff --git a/singa_auto/model/model.py b/singa_auto/model/model.py index 47f62187..cb157fb3 100644 --- a/singa_auto/model/model.py +++ b/singa_auto/model/model.py @@ -1,158 +1,157 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -import abc -import numpy as np -from typing import Union, Dict, Optional, Any, List - -from .knob import BaseKnob - -KnobConfig = Dict[str, BaseKnob] -Knobs = Dict[str, Any] -Params = Dict[str, Union[str, int, float, np.ndarray]] - - -class BaseModel(abc.ABC): - ''' - SINGA-Auto's base model class that SINGA-Auto models must extend. - - SINGA-Auto models must implement all abstract methods below, according to the specification of its associated task (see :ref:`tasks`). - They configure how this model template will be trained, evaluated, tuned, serialized and served on SINGA-Auto. - - In the model's ``__init__`` method, call ``super().__init__(**knobs)`` as the first line, - followed by the model's initialization logic. The model should be initialize itself with ``knobs``, - a set of generated knob values for the created model instance. - - These knob values are chosen by SINGA-Auto based on the model's knob configuration (defined in :meth:`singa_auto.model.BaseModel.get_knob_config`). - - For example: - - :: - - def __init__(self, **knobs): - self.__dict__.update(knobs) - ... - self._build_model(self.knob1, self.knob2) - - :param knobs: Dictionary mapping knob names to knob values - :type knobs: :obj:`singa_auto.model.Knobs` - ''' - - def __init__(self, **knobs: Knobs): - pass - - - @staticmethod - @abc.abstractmethod - def get_knob_config() -> KnobConfig: - ''' - Return a dictionary that defines the search space for this model template's knobs - (i.e. 
knobs' names, their types & their ranges). - - Over the course of training, your model will be initialized with different values of knobs within this search space - to maximize this model’s performance. - - Refer to :ref:`model-tuning` to understand more about how this works. - - :returns: Dictionary mapping knob names to knob specifications - ''' - raise NotImplementedError() - - @abc.abstractmethod - def train(self, - dataset_path: str, - shared_params: Optional[Params] = None, - **train_args): - ''' - Train this model instance with the given traing dataset and initialized knob values. - Additional keyword arguments could be passed depending on the task's specification. - - Additionally, trained parameters shared from previous trials could be passed, - as part of the ``SHARE_PARAMS`` policy (see :ref:`model-policies`). - - Subsequently, the model is considered *trained*. - - :param dataset_path: File path of the train dataset file in the *local filesystem*, in a format specified by the task - :param shared_params: Dictionary mapping parameter names to values, as produced by your model's :meth:`singa_auto.model.BaseModel.dump_parameters`. - ''' - raise NotImplementedError() - - @abc.abstractmethod - def evaluate(self, dataset_path: str, **kargs) -> float: - ''' - Evaluate this model instance with the given validation dataset after training. - - This will be called only when model is *trained*. - - :param dataset_path: File path of the validation dataset file in the *local filesystem*, in a format specified by the task - :returns: A score associated with the validation performance for the trained model instance, the higher the better e.g. classification accuracy. - ''' - raise NotImplementedError() - - @abc.abstractmethod - def predict(self, queries: List[Any]) -> List[Any]: - ''' - Make predictions on a batch of queries after training. - - This will be called only when model is *trained*. - - :param queries: List of queries, where a query is in the format specified by the task - :returns: List of predictions, in an order corresponding to the queries, where a prediction is in the format specified by the task - ''' - raise NotImplementedError() - - @abc.abstractmethod - def dump_parameters(self) -> Params: - ''' - Returns a dictionary of model parameters that *fully define the trained state of the model*. - This dictionary must conform to the format :obj:`singa_auto.model.Params`. - This will be used to save the trained model in SINGA-Auto. - - Additionally, trained parameters produced by this method could be shared with future trials, as - part of the ``SHARE_PARAMS`` policy (see :ref:`model-policies`). - - This will be called only when model is *trained*. - - :returns: Dictionary mapping parameter names to values - ''' - raise NotImplementedError() - - @abc.abstractmethod - def load_parameters(self, params: Params): - ''' - Loads this model instance with previously trained model parameters produced by your model's :meth:`singa_auto.model.BaseModel.dump_parameters`. - *This model instance's initialized knob values will match those during training*. - - Subsequently, the model is considered *trained*. - ''' - raise NotImplementedError() - - def destroy(self): - ''' - Destroy this model instance, freeing any resources held by this model instance. - No other instance methods will be called subsequently. - ''' - pass - - @staticmethod - def teardown(): - ''' - Runs class-wide teardown logic (e.g. close a training session shared across trials). 
-    '''
-    pass
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+import abc
+import numpy as np
+from typing import Union, Dict, Optional, Any, List
+
+from .knob import BaseKnob
+
+KnobConfig = Dict[str, BaseKnob]
+Knobs = Dict[str, Any]
+Params = Dict[str, Union[str, int, float, np.ndarray]]
+
+
+class BaseModel(abc.ABC):
+    '''
+    SINGA-Auto's base model class that SINGA-Auto models must extend.
+
+    SINGA-Auto models must implement all abstract methods below, according to the specification of its associated task (see :ref:`tasks`).
+    They configure how this model template will be trained, evaluated, tuned, serialized and served on SINGA-Auto.
+
+    In the model's ``__init__`` method, call ``super().__init__(**knobs)`` as the first line,
+    followed by the model's initialization logic. The model should initialize itself with ``knobs``,
+    a set of generated knob values for the created model instance.
+
+    These knob values are chosen by SINGA-Auto based on the model's knob configuration (defined in :meth:`singa_auto.model.BaseModel.get_knob_config`).
+
+    For example:
+
+    ::
+
+        def __init__(self, **knobs):
+            self.__dict__.update(knobs)
+            ...
+            self._build_model(self.knob1, self.knob2)
+
+    :param knobs: Dictionary mapping knob names to knob values
+    :type knobs: :obj:`singa_auto.model.Knobs`
+    '''
+
+    def __init__(self, **knobs: Knobs):
+        pass
+
+    @staticmethod
+    @abc.abstractmethod
+    def get_knob_config() -> KnobConfig:
+        '''
+        Return a dictionary that defines the search space for this model template's knobs
+        (i.e. knobs' names, their types & their ranges).
+
+        Over the course of training, your model will be initialized with different values of knobs within this search space
+        to maximize this model's performance.
+
+        Refer to :ref:`model-tuning` to understand more about how this works.
+
+        :returns: Dictionary mapping knob names to knob specifications
+        '''
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def train(self,
+              dataset_path: str,
+              shared_params: Optional[Params] = None,
+              **train_args):
+        '''
+        Train this model instance with the given training dataset and initialized knob values.
+        Additional keyword arguments could be passed depending on the task's specification.
+
+        Additionally, trained parameters shared from previous trials could be passed,
+        as part of the ``SHARE_PARAMS`` policy (see :ref:`model-policies`).
+
+        Subsequently, the model is considered *trained*.
+
+        :param dataset_path: File path of the train dataset file in the *local filesystem*, in a format specified by the task
+        :param shared_params: Dictionary mapping parameter names to values, as produced by your model's :meth:`singa_auto.model.BaseModel.dump_parameters`.
+        '''
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def evaluate(self, dataset_path: str, **kargs) -> float:
+        '''
+        Evaluate this model instance with the given validation dataset after training.
+
+        This will be called only when model is *trained*.
+
+        :param dataset_path: File path of the validation dataset file in the *local filesystem*, in a format specified by the task
+        :returns: A score associated with the validation performance for the trained model instance, the higher the better, e.g. classification accuracy.
+        '''
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def predict(self, queries: List[Any]) -> List[Any]:
+        '''
+        Make predictions on a batch of queries after training.
+
+        This will be called only when model is *trained*.
+
+        :param queries: List of queries, where a query is in the format specified by the task
+        :returns: List of predictions, in an order corresponding to the queries, where a prediction is in the format specified by the task
+        '''
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def dump_parameters(self) -> Params:
+        '''
+        Returns a dictionary of model parameters that *fully define the trained state of the model*.
+        This dictionary must conform to the format :obj:`singa_auto.model.Params`.
+        This will be used to save the trained model in SINGA-Auto.
+
+        Additionally, trained parameters produced by this method could be shared with future trials, as
+        part of the ``SHARE_PARAMS`` policy (see :ref:`model-policies`).
+
+        This will be called only when model is *trained*.
+
+        :returns: Dictionary mapping parameter names to values
+        '''
+        raise NotImplementedError()
+
+    @abc.abstractmethod
+    def load_parameters(self, params: Params):
+        '''
+        Loads this model instance with previously trained model parameters produced by your model's :meth:`singa_auto.model.BaseModel.dump_parameters`.
+        *This model instance's initialized knob values will match those during training*.
+
+        Subsequently, the model is considered *trained*.
+        '''
+        raise NotImplementedError()
+
+    def destroy(self):
+        '''
+        Destroy this model instance, freeing any resources held by this model instance.
+        No other instance methods will be called subsequently.
+        '''
+        pass
+
+    @staticmethod
+    def teardown():
+        '''
+        Runs class-wide teardown logic (e.g. close a training session shared across trials).
+        '''
+        pass
diff --git a/singa_auto/model/utils.py b/singa_auto/model/utils.py
index 914e249b..5e6014d5 100644
--- a/singa_auto/model/utils.py
+++ b/singa_auto/model/utils.py
@@ -98,7 +98,7 @@ def parse_model_install_command(dependencies, enable_gpu=False):
             commands.append(
                 'pip --no-cache-dir install scikit-learn=={}'.format(ver))
         elif dep == ModelDependency.TENSORFLOW:
-            if enable_gpu:
+            if enable_gpu and ver.split('.')[0] == '1':
                 commands.append(
                     'pip --no-cache-dir install tensorflow-gpu=={}'.format(ver))
             else:
diff --git a/singa_auto/predictor/app.py b/singa_auto/predictor/app.py
index 896bdc15..515a54d8 100644
--- a/singa_auto/predictor/app.py
+++ b/singa_auto/predictor/app.py
@@ -1,74 +1,74 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -import os -import logging -from typing import Any, List -from flask import Flask, jsonify, g, request -from flask_cors import CORS -from .predictor import Predictor -from singa_auto.model import utils -import traceback - -service_id = os.environ['SINGA_AUTO_SERVICE_ID'] - -logger = logging.getLogger(__name__) -app = Flask(__name__) -CORS(app) - - -def get_predictor() -> Predictor: - # Allow multiple threads to each have their own instance of predictor - if not hasattr(g, 'predictor'): - g.predictor = Predictor(service_id) - - return g.predictor - - -@app.route('/') -def index(): - return 'Predictor is up.' - - -@app.route('/', methods=['POST']) -def predict(): - try: - if request.files.getlist('img'): - img_stores = request.files.getlist('img') - img_bytes = [ - img for img in [img_store.read() for img_store in img_stores] if img - ] - if not img_bytes: - return jsonify({'ErrorMsg': 'No image provided'}), 400 - queries = utils.dataset.load_images(img_bytes) - print("img_bytes_first 10 bytes", img_bytes[0][:10]) - print("queries_sizes", len(queries)) - elif request.get_json(): - data = request.get_json() - queries = [data] - else: - return jsonify({'ErrorMsg': 'data should be either at files (set "img" as key) or json payload'}), 400 - predictor = get_predictor() - predictions: List[Any] = predictor.predict(queries) - return jsonify(predictions), 200 - except: - # for debug,print the error - traceback.print_exc() - logging.error(traceback.format_exc()) - return jsonify({'ErrorMsg': 'Server Error:{}'.format(traceback.format_exc())} - ), 500 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +import os +import logging +from typing import Any, List +from flask import Flask, jsonify, g, request +from flask_cors import CORS +from .predictor import Predictor +from singa_auto.model import utils +import traceback + +service_id = os.environ['SINGA_AUTO_SERVICE_ID'] + +logger = logging.getLogger(__name__) +app = Flask(__name__) +CORS(app) + + +def get_predictor() -> Predictor: + # Allow multiple threads to each have their own instance of predictor + if not hasattr(g, 'predictor'): + g.predictor = Predictor(service_id) + + return g.predictor + + +@app.route('/') +def index(): + return 'Predictor is up.' 
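+
+# Note: the POST handler below accepts either multipart file uploads under the
+# key 'img' (one or more images) or a single JSON payload; the parsed queries
+# are forwarded to the per-thread Predictor instance returned by get_predictor().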
+
+
+@app.route('/', methods=['POST'])
+def predict():
+    try:
+        if request.files.getlist('img'):
+            img_stores = request.files.getlist('img')
+            img_bytes = [
+                img for img in [img_store.read() for img_store in img_stores] if img
+            ]
+            if not img_bytes:
+                return jsonify({'ErrorMsg': 'No image provided'}), 400
+            queries = utils.dataset.load_images(img_bytes)
+            print("img_bytes_first 10 bytes", img_bytes[0][:10])
+            print("queries_sizes", len(queries))
+        elif request.get_json():
+            data = request.get_json()
+            queries = [data]
+        else:
+            return jsonify({'ErrorMsg': 'Data should be provided either as files (with "img" as the key) or as a JSON payload'}), 400
+
+        predictor = get_predictor()
+        predictions: List[Any] = predictor.predict(queries)
+        return jsonify(predictions), 200
+    except:
+        # For debugging, print the error
+        traceback.print_exc()
+        logging.error(traceback.format_exc())
+        return jsonify({'ErrorMsg': 'Server Error:{}'.format(traceback.format_exc())}), 500
diff --git a/singa_auto/worker/inference.py b/singa_auto/worker/inference.py
index 271a3127..f0430fd1 100644
--- a/singa_auto/worker/inference.py
+++ b/singa_auto/worker/inference.py
@@ -188,6 +188,14 @@ def _predict(self, queries: List[Query]) -> List[Prediction]:
         try:
             predictions = self._model_inst.predict([x.query for x in queries])
         except:
+            logger.error('queries')
+            logger.error(queries)
+            logger.error('x.query')
+            query_list = [x.query for x in queries]
+            logger.error(query_list)
+            logger.error(len(queries))
+            logger.error(type(query_list[0]))
+            logger.error(len(query_list[0]))
             logger.error('Error while making predictions:')
             logger.error(traceback.format_exc())
             predictions = [None for x in range(len(queries))]
diff --git a/singa_auto_scheduler/deploy/singa-auto-monitor.yaml b/singa_auto_scheduler/deploy/singa-auto-monitor.yaml
index 28559fc6..6d01d388 100644
--- a/singa_auto_scheduler/deploy/singa-auto-monitor.yaml
+++ b/singa_auto_scheduler/deploy/singa-auto-monitor.yaml
@@ -58,7 +58,7 @@ spec:
       serviceAccountName: sasche-sa
       containers:
       - name: sasche
-        image: singaauto/singa_auto_nodegpumonitor:dev
+        image: singa_auto/singa_auto_nodegpumonitor:dev
         imagePullPolicy: Always
         env:
         - name: NODE_NAME
diff --git a/singa_auto_scheduler/deploy/singa-auto-scheduler.yaml b/singa_auto_scheduler/deploy/singa-auto-scheduler.yaml
index e7b56731..049c9f9e 100644
--- a/singa_auto_scheduler/deploy/singa-auto-scheduler.yaml
+++ b/singa_auto_scheduler/deploy/singa-auto-scheduler.yaml
@@ -54,7 +54,7 @@ spec:
         name: scheduler-config
      containers:
      - name: sascheduler
-        image: singaauto/singa_auto_scheduler:dev
+        image: singa_auto/singa_auto_scheduler:dev
         imagePullPolicy: Always
         args:
         - sascheduler
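For reference, a minimal sketch of how a client might exercise the predictor route added in singa_auto/predictor/app.py above. The endpoint path ('/'), the multipart key 'img', and the JSON fallback follow the route's code; the host/port in PREDICTOR_URL and the file name example.jpg are hypothetical, and the requests library is assumed to be installed.

import requests

# Hypothetical predictor address; substitute the host/port of your deployed predictor service
PREDICTOR_URL = 'http://localhost:3005/'

# Option 1: upload one or more images as multipart files under the key 'img'
with open('example.jpg', 'rb') as f:
    resp = requests.post(PREDICTOR_URL, files=[('img', f)])
print(resp.status_code, resp.json())

# Option 2: send a single JSON query instead of image files
resp = requests.post(PREDICTOR_URL, json={'query': 'example payload'})
print(resp.status_code, resp.json())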