From 161d341d4678cc6b47430b5a4289c78bd50141a9 Mon Sep 17 00:00:00 2001
From: Celina Hanouti
Date: Tue, 8 Oct 2024 09:14:29 +0200
Subject: [PATCH] [Tasks] Add `Llama-3.2-11B-Vision-Instruct` as the first
 recommended model for `image-text-to-text` (#956)

This PR proposes adding `meta-llama/Llama-3.2-11B-Vision-Instruct` as the first recommended model for the [`image-text-to-text`](https://huggingface.co/tasks/image-text-to-text) task. This would be nice to have, as we also want to show in the [Inference API documentation](https://huggingface.co/docs/api-inference/index) how to perform inference with this model, since it is both supported by TGI and a popular conversational VLM.

Feel free to suggest a better description or another ranking for this model.

cc @merveenoyan @osanseviero
---
 packages/tasks/src/tasks/image-text-to-text/data.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/tasks/src/tasks/image-text-to-text/data.ts b/packages/tasks/src/tasks/image-text-to-text/data.ts
index 4e19cf646..41084651d 100644
--- a/packages/tasks/src/tasks/image-text-to-text/data.ts
+++ b/packages/tasks/src/tasks/image-text-to-text/data.ts
@@ -43,8 +43,8 @@ const taskData: TaskDataCustom = {
 	metrics: [],
 	models: [
 		{
-			description: "Cutting-edge vision language model that can take multiple image inputs.",
-			id: "facebook/chameleon-7b",
+			description: "Powerful vision language model with great visual understanding and reasoning capabilities.",
+			id: "meta-llama/Llama-3.2-11B-Vision-Instruct",
 		},
 		{
 			description: "Cutting-edge conversational vision language model that can take multiple image inputs.",
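
Side note (not part of the patch itself): since the description mentions showing Inference API usage for this model, here is a minimal TypeScript sketch of what that could look like with `@huggingface/inference`'s `chatCompletion`. The image URL, token handling, and generation parameters are illustrative assumptions, not taken from the actual documentation page.

```ts
// Minimal sketch: querying the newly recommended model through the
// Inference API (assumes a valid HF_TOKEN env var and accepted model
// license; the image URL below is a placeholder).
import { HfInference } from "@huggingface/inference";

const hf = new HfInference(process.env.HF_TOKEN);

const response = await hf.chatCompletion({
	model: "meta-llama/Llama-3.2-11B-Vision-Instruct",
	messages: [
		{
			role: "user",
			content: [
				// TGI's OpenAI-compatible messages format: one image plus a text prompt.
				{ type: "image_url", image_url: { url: "https://example.com/cat.png" } },
				{ type: "text", text: "Describe this image in one sentence." },
			],
		},
	],
	max_tokens: 128,
});

console.log(response.choices[0].message.content);
```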