feat: add support for YOLO11 segmentation model

Seeed-Studio · Nov 22, 2024 · 509970a · 509970a
1 parent d737124
commit 509970a
Show file tree

Hide file tree

Showing 7 changed files with 377 additions and 1 deletion.
diff --git a/sscma/core/ma_types.h b/sscma/core/ma_types.h
@@ -212,6 +212,17 @@ struct ma_keypoint4f_t {
     ma_bbox_t box;
     std::vector<ma_pt4f_t> pts;
 };
+
+// One segmentation result: a bounding box together with a mask.
+struct ma_segm2f_t {
+    ma_bbox_t box;  // box, score and target of the segmented instance
+    struct {
+        uint16_t width;   // mask width in mask cells
+        uint16_t height;  // mask height in mask cells
+        // Mask payload. Yolo11Seg::postProcessF32 fills this as a packed bitmap:
+        // width * height / 8 bytes, one bit per cell, where cell (row i, column j)
+        // is bit (j % 8) of byte (i * width / 8 + j / 8).
+        std::vector<uint8_t> data;
+    } mask;
+};
+
+
 #endif
 
 typedef enum {
@@ -271,6 +282,7 @@ typedef enum {
     MA_MODEL_TYPE_YOLO_WORLD  = 8u,
     MA_MODEL_TYPE_YOLO11      = 9u,
     MA_MODEL_TYPE_YOLO11_POSE = 10u,
+    MA_MODEL_TYPE_YOLO11_SEG = 11u,
 } ma_model_type_t;
 
 typedef struct {

diff --git a/sscma/core/model/ma_model_factory.cpp b/sscma/core/model/ma_model_factory.cpp
@@ -59,6 +59,10 @@ Model* ModelFactory::create(Engine* engine, size_t algorithm_id) {
         if (Yolo11Pose::isValid(engine)) {
             return new Yolo11Pose(engine);
         }
+    case MA_MODEL_TYPE_YOLO11_SEG:
+        if (Yolo11Seg::isValid(engine)) {
+            return new Yolo11Seg(engine);
+        }
     }
 
     return nullptr;

diff --git a/sscma/core/model/ma_model_factory.h b/sscma/core/model/ma_model_factory.h
@@ -14,11 +14,12 @@
 #include "ma_model_nvidia_det.h"
 #include "ma_model_pfld.h"
 #include "ma_model_yolo11.h"
+#include "ma_model_yolo11_pose.h"
+#include "ma_model_yolo11_seg.h"
 #include "ma_model_yolo_world.h"
 #include "ma_model_yolov5.h"
 #include "ma_model_yolov8.h"
 #include "ma_model_yolov8_pose.h"
-#include "ma_model_yolo11_pose.h"
 
 namespace ma {
 

diff --git a/sscma/core/model/ma_model_segmenter.cpp b/sscma/core/model/ma_model_segmenter.cpp
@@ -0,0 +1,108 @@
+#include "ma_model_segmenter.h"
+
+#include "core/cv/ma_cv.h"
+
+namespace ma::model {
+
+// Tag string for this translation unit. It is not referenced by the code visible
+// in this file; presumably consumed by logging macros - TODO confirm.
+constexpr char TAG[] = "ma::model::segmenter";
+
+// Builds the common segmenter state from the engine's first input tensor.
+//
+// p_engine  engine that owns the model tensors
+// name      model name forwarded to Model
+// type      model type, OR-ed with the image-input / segmentation-output flags
+Segmenter::Segmenter(Engine* p_engine, const char* name, ma_model_type_t type) : Model(p_engine, name, MA_INPUT_TYPE_IMAGE | MA_OUTPUT_TYPE_SEGMENTATION | type) {
+    // Default thresholds; both can be changed later through setConfig().
+    threshold_score_ = 0.25;
+    threshold_nms_   = 0.45;
+
+    input_ = p_engine_->getInput(0);
+
+    const auto& dims = input_.shape.dims;
+
+    // A trailing dimension of 3 or 1 is taken to be the channel count (NHWC);
+    // anything else is treated as NCHW.
+    is_nhwc_ = (dims[3] == 3) || (dims[3] == 1);
+
+    const int height_axis  = is_nhwc_ ? 1 : 2;
+    const int width_axis   = is_nhwc_ ? 2 : 3;
+    const int channel_axis = is_nhwc_ ? 3 : 1;
+
+    img_.height = dims[height_axis];
+    img_.width  = dims[width_axis];
+    img_.size   = dims[1] * dims[2] * dims[3];
+    img_.format = (dims[channel_axis] == 3) ? MA_PIXEL_FORMAT_RGB888 : MA_PIXEL_FORMAT_GRAYSCALE;
+
+    // img_ shares the input tensor's buffer, so writing img_ fills the model input.
+    img_.data = input_.data.u8;
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+Segmenter::~Segmenter() = default;
+// Converts the image passed to run() into the model input buffer.
+//
+// Returns the error from ma::cv::convert if it fails, MA_OK otherwise.
+ma_err_t Segmenter::preprocess() {
+    const ma_err_t status = ma::cv::convert(input_img_, &img_);
+    if (status != MA_OK) {
+        return status;
+    }
+
+    // For signed 8-bit inputs every byte is shifted down by 128 (modulo 256),
+    // which maps the unsigned range 0..255 onto the int8 range -128..127.
+    if (input_.type == MA_TENSOR_TYPE_S8) {
+        auto* bytes = input_.data.u8;
+        for (int idx = 0; idx < input_.size; ++idx) {
+            bytes[idx] -= 128;
+        }
+    }
+
+    return MA_OK;
+}
+
+// Returns the model input image descriptor (an ma_img_t) as an untyped pointer.
+const void* Segmenter::getInput() {
+    const ma_img_t* descriptor = &img_;
+    return descriptor;
+}
+
+// Read-only access to the stored segmentation results.
+const std::forward_list<ma_segm2f_t>& Segmenter::getResults() const {
+    return this->results_;
+}
+
+// Runs the model on `img`.
+//
+// img  source image; must not be null (asserted). Only the pointer is stored,
+//      so the image has to stay valid for the duration of the call.
+// Returns whatever underlyingRun() reports.
+ma_err_t Segmenter::run(const ma_img_t* img) {
+    MA_ASSERT(img != nullptr);
+
+    // Remember the source image so preprocess() can read it.
+    input_img_ = img;
+
+    const ma_err_t status = underlyingRun();
+    return status;
+}
+
+// Sets a configuration value.
+//
+// opt  MA_MODEL_CFG_OPT_THRESHOLD (score threshold) or MA_MODEL_CFG_OPT_NMS (NMS threshold)
+// ...  one double carrying the new value
+// Returns MA_OK on success, MA_EINVAL for any other option.
+ma_err_t Segmenter::setConfig(ma_model_cfg_opt_t opt, ...) {
+    ma_err_t status = MA_EINVAL;
+
+    va_list args;
+    va_start(args, opt);
+
+    // The value is fetched as double and then narrowed to the float members.
+    if (opt == MA_MODEL_CFG_OPT_THRESHOLD) {
+        threshold_score_ = va_arg(args, double);
+        status           = MA_OK;
+    } else if (opt == MA_MODEL_CFG_OPT_NMS) {
+        threshold_nms_ = va_arg(args, double);
+        status         = MA_OK;
+    }
+
+    va_end(args);
+    return status;
+}
+
+// Reads a configuration value.
+//
+// opt  MA_MODEL_CFG_OPT_THRESHOLD (score threshold) or MA_MODEL_CFG_OPT_NMS (NMS threshold)
+// ...  one pointer to a double that receives the value
+// Returns MA_OK on success, MA_EINVAL for an unsupported option or a null
+// output pointer (a null pointer was previously dereferenced).
+ma_err_t Segmenter::getConfig(ma_model_cfg_opt_t opt, ...) {
+    // Pick the member to report; stays null for unsupported options.
+    const float* source = nullptr;
+    switch (opt) {
+        case MA_MODEL_CFG_OPT_THRESHOLD:
+            source = &threshold_score_;
+            break;
+        case MA_MODEL_CFG_OPT_NMS:
+            source = &threshold_nms_;
+            break;
+        default:
+            break;
+    }
+
+    ma_err_t ret = MA_EINVAL;
+    va_list args;
+    va_start(args, opt);
+    if (source != nullptr) {
+        // The variadic argument is only consumed for supported options.
+        auto* p_out = static_cast<double*>(va_arg(args, void*));
+        if (p_out != nullptr) {
+            *p_out = *source;
+            ret    = MA_OK;
+        }
+    }
+    va_end(args);
+    return ret;
+}
+
+}  // namespace ma::model
diff --git a/sscma/core/model/ma_model_segmenter.h b/sscma/core/model/ma_model_segmenter.h
@@ -0,0 +1,43 @@
+#ifndef _MA_MODEL_SEGMENTER_H_
+#define _MA_MODEL_SEGMENTER_H_
+
+#include <forward_list>
+#include <vector>
+
+#include "ma_model_base.h"
+
+namespace ma::model {
+
+// Base class for segmentation models: it owns the input image handling, the
+// score/NMS thresholds and the result list. Derived classes are expected to
+// fill results_ (the base class itself never writes to it).
+class Segmenter : public Model {
+protected:
+    ma_tensor_t input_;                    // engine input tensor 0
+    ma_img_t img_;                         // model input image; its data pointer aliases input_'s buffer
+    const ma_img_t* input_img_ = nullptr;  // source image of the current run(); not owned
+
+    float threshold_nms_;    // NMS threshold, 0.45 after construction
+    float threshold_score_;  // score threshold, 0.25 after construction
+
+    bool is_nhwc_;  // true when the last input dimension is 3 or 1 (channels last)
+
+    std::forward_list<ma_segm2f_t> results_;  // segmentation results exposed by getResults()
+
+protected:
+    // Converts input_img_ into the model input buffer.
+    ma_err_t preprocess() override;
+
+public:
+    // engine  engine that owns the model tensors
+    // name    model name forwarded to Model
+    // type    model type flag
+    Segmenter(Engine* engine, const char* name, ma_model_type_t type);
+    virtual ~Segmenter();
+
+    // Read-only access to the stored segmentation results.
+    const std::forward_list<ma_segm2f_t>& getResults() const;
+
+    // Runs the model on `img`, which must not be null.
+    ma_err_t run(const ma_img_t* img);
+
+    // Returns a pointer to the model input image descriptor (ma_img_t).
+    const void* getInput() override;
+
+    // Sets the score or NMS threshold; the variadic argument is one double.
+    ma_err_t setConfig(ma_model_cfg_opt_t opt, ...) override;
+
+    // Reads the score or NMS threshold; the variadic argument is one double*.
+    ma_err_t getConfig(ma_model_cfg_opt_t opt, ...) override;
+};
+
+}  // namespace ma::model
+
+#endif  // _MA_MODEL_SEGMENTER_H_
diff --git a/sscma/core/model/ma_model_yolo11_seg.cpp b/sscma/core/model/ma_model_yolo11_seg.cpp
@@ -0,0 +1,174 @@
+#include "ma_model_yolo11_seg.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <forward_list>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "core/math/ma_math.h"
+#include "core/utils/ma_nms.h"
+
+// Tag string for this translation unit. It is not referenced by the code visible
+// in this file; presumably consumed by logging macros - TODO confirm.
+constexpr char TAG[] = "ma::model::yolo11_seg";
+
+namespace ma::model {
+
+// Binds the two engine outputs and derives the tensor bookkeeping.
+//
+// Output 0 (bboxes_) carries, per candidate, 4 box values, one score per class
+// and the mask coefficients; output 1 (protos_) carries the mask prototypes.
+Yolo11Seg::Yolo11Seg(Engine* p_engine_) : Segmenter(p_engine_, "yolo11_seg", MA_MODEL_TYPE_YOLO11_SEG) {
+    MA_ASSERT(p_engine_ != nullptr);
+
+    bboxes_ = p_engine_->getOutput(0);
+    protos_ = p_engine_->getOutput(1);
+
+    const auto& box_dims = bboxes_.shape.dims;
+
+    // dims[2] is the number of candidate records.
+    num_record_ = box_dims[2];
+
+    // dims[1] = 4 box values + num_class_ scores + 32 mask coefficients,
+    // so the class count is what remains after removing 4 + 32 = 36.
+    num_class_ = box_dims[1] - (4 + 32);
+}
+
+// Nothing to release explicitly.
+Yolo11Seg::~Yolo11Seg() = default;
+
+// Checks whether the model loaded in `engine` has the tensor layout this class expects:
+//   - exactly one 4-D input with batch 1, 1 or 3 channels and a side that is a
+//     positive multiple of 32;
+//   - output 0 of rank 3 or 4 shaped [1, >= 37, N], where N is the sum of the
+//     squared grid sizes side/32, side/16 and side/8;
+//   - output 1 shaped [1, 32, side/4, side/4].
+// Returns true only when every check passes.
+bool Yolo11Seg::isValid(Engine* engine) {
+    if (engine->getInputSize() != 1 || engine->getOutputSize() != 2) {
+        return false;
+    }
+
+    const auto& in_shape    = engine->getInputShape(0);
+    const auto& box_shape   = engine->getOutputShape(0);
+    const auto& proto_shape = engine->getOutputShape(1);
+
+    if (in_shape.size != 4) {
+        return false;
+    }
+
+    const int batch     = in_shape.dims[0];
+    int checked_side    = in_shape.dims[1];
+    const int grid_side = in_shape.dims[2];
+    int channels        = in_shape.dims[3];
+
+    // If the last dimension is not a channel count the layout is taken to be NCHW:
+    // dims[1] then holds the channels and dims[3] becomes the side that is checked.
+    // grid_side stays dims[2], so both sides only agree for square inputs.
+    const bool channels_last = (channels == 3) || (channels == 1);
+    if (!channels_last) {
+        std::swap(checked_side, channels);
+    }
+
+    const bool input_ok = batch == 1 && checked_side >= 32 && checked_side % 32 == 0 && (channels == 3 || channels == 1);
+    if (!input_ok) {
+        return false;
+    }
+
+    // Expected number of candidate records: one per cell of the three square grids.
+    const int grid_32        = grid_side >> 5;
+    const int grid_16        = grid_side >> 4;
+    const int grid_8         = grid_side >> 3;
+    const int expected_boxes = grid_32 * grid_32 + grid_16 * grid_16 + grid_8 * grid_8;
+
+    const bool ranks_ok = (box_shape.size == 3 || box_shape.size == 4) && proto_shape.size == 4;
+    if (!ranks_ok) {
+        return false;
+    }
+
+    // 37 = 4 box values + 32 mask coefficients + at least one class score.
+    const bool boxes_ok = box_shape.dims[0] == 1 && box_shape.dims[2] == expected_boxes && box_shape.dims[1] >= 37;
+    if (!boxes_ok) {
+        return false;
+    }
+
+    // 32 prototype planes, each a quarter of the input side in both directions.
+    const int proto_side = grid_side >> 2;
+    return proto_shape.dims[0] == 1 && proto_shape.dims[1] == 32 && proto_shape.dims[2] == proto_side && proto_shape.dims[3] == proto_side;
+}
+
+// Clears the previous results and decodes the current outputs.
+// Only float32 box tensors are handled; any other type yields MA_ENOTSUP.
+ma_err_t Yolo11Seg::postprocess() {
+    results_.clear();
+
+    if (bboxes_.type != MA_TENSOR_TYPE_F32) {
+        return MA_ENOTSUP;
+    }
+    return postProcessF32();
+}
+
+// Decodes the float32 outputs into results_.
+//
+// Steps:
+//   1. For every candidate record pick the best class whose score reaches
+//      threshold_score_ and store its box, normalised by the input image size.
+//   2. Run NMS over the candidates.
+//   3. For every surviving box build a bit-packed mask: inside the box, a cell is
+//      set when the coefficient-weighted sum of the prototype planes exceeds 0.5.
+//
+// Tensor layout as indexed below: value k of record i in bboxes_ lives at
+// data[i + num_record_ * k]; rows 0-3 are x, y, w, h (x, y is the box centre),
+// followed by num_class_ score rows and then one coefficient row per prototype.
+// protos_ is indexed as [1, planes, rows, columns].
+//
+// Always returns MA_OK.
+ma_err_t Yolo11Seg::postProcessF32() {
+
+    std::forward_list<ma_bbox_ext_t> multi_level_bboxes;
+    const auto* data = bboxes_.data.f32;
+
+    for (decltype(num_record_) i = 0; i < num_record_; ++i) {
+
+        // Best class for this record; starting at the threshold rejects weaker scores.
+        float best_score = threshold_score_;
+        int best_class   = -1;
+
+        for (int c = 0; c < num_class_; c++) {
+            const float score = data[i + num_record_ * (4 + c)];
+            if (score < best_score) [[likely]] {
+                continue;
+            }
+            best_score = score;
+            best_class = c;
+        }
+
+        if (best_class < 0)
+            continue;
+
+        ma_bbox_ext_t bbox;
+        bbox.level  = 0;
+        bbox.index  = i;  // kept so the mask coefficients can be looked up after NMS
+        bbox.x      = data[i] / img_.width;
+        bbox.y      = data[i + num_record_] / img_.height;
+        bbox.w      = data[i + num_record_ * 2] / img_.width;
+        bbox.h      = data[i + num_record_ * 3] / img_.height;
+        bbox.score  = best_score;
+        bbox.target = best_class;
+
+        multi_level_bboxes.emplace_front(std::move(bbox));
+    }
+
+    ma::utils::nms(multi_level_bboxes, threshold_nms_, threshold_score_, false, true);
+
+    // Prototype geometry. dims[3] is the row stride of a plane, i.e. its width, and
+    // dims[2] its height; x is therefore scaled by dims[3] and y by dims[2]. (The
+    // two are equal for every model accepted by isValid().)
+    const int proto_count = protos_.shape.dims[1];
+    const int proto_h     = protos_.shape.dims[2];
+    const int proto_w     = protos_.shape.dims[3];
+    const int plane_size  = proto_h * proto_w;
+    const float* protos   = protos_.data.f32;
+
+    // Coefficient scratch buffer, reused for every detection.
+    std::vector<float> coeffs(proto_count > 0 ? proto_count : 0);
+
+    for (const auto& bbox : multi_level_bboxes) {
+        ma_segm2f_t seg;
+        seg.box         = {.x = bbox.x, .y = bbox.y, .w = bbox.w, .h = bbox.h, .score = bbox.score, .target = bbox.target};
+        seg.mask.width  = proto_w;
+        seg.mask.height = proto_h;
+        seg.mask.data.resize((plane_size + 7) / 8, 0);  // bitwise, rounded up to whole bytes
+
+        for (int k = 0; k < proto_count; ++k) {
+            coeffs[k] = data[bbox.index + num_record_ * (4 + num_class_ + k)];
+        }
+
+        // Box corners in prototype cells (truncated), clamped to the plane.
+        const int x1 = static_cast<int>((bbox.x - bbox.w / 2) * proto_w);
+        const int y1 = static_cast<int>((bbox.y - bbox.h / 2) * proto_h);
+        const int x2 = static_cast<int>((bbox.x + bbox.w / 2) * proto_w);
+        const int y2 = static_cast<int>((bbox.y + bbox.h / 2) * proto_h);
+
+        const int col_begin = std::max(x1, 0);
+        const int col_end   = std::min(x2, proto_w);
+        const int row_begin = std::max(y1, 0);
+        const int row_end   = std::min(y2, proto_h);
+
+        // Only cells inside the box can be set, so the mask value is evaluated for
+        // those cells only instead of for the whole plane.
+        // TODO: parallel for
+        for (int row = row_begin; row < row_end; ++row) {
+            for (int col = col_begin; col < col_end; ++col) {
+                const int cell = row * proto_w + col;
+
+                float value = 0.0f;
+                for (int k = 0; k < proto_count; ++k) {
+                    value += coeffs[k] * protos[k * plane_size + cell];
+                }
+
+                // NOTE(review): the threshold is applied to the raw linear combination
+                // (no sigmoid) - confirm this matches the exported model.
+                if (value > 0.5) {
+                    seg.mask.data[cell / 8] |= (1 << (cell % 8));
+                }
+            }
+        }
+
+        results_.emplace_front(std::move(seg));
+    }
+
+    return MA_OK;
+}
+
+}  // namespace ma::model