Merge pull request #281 from priyanshuverma-dev/feat-image-captioning-magic-tool

feat: Image Captioning Magic tool
kom-senapati · Nov 7, 2024 · 64e92ca · 64e92ca
2 parents 4c79db5 + 24bfd56
commit 64e92ca
Show file tree

Hide file tree

Showing 8 changed files with 190 additions and 1 deletion.
diff --git a/app/ai.py b/app/ai.py
@@ -11,6 +11,9 @@
 from bs4 import BeautifulSoup
 import markdown
 from translate import Translator
+from transformers import BlipProcessor, BlipForConditionalGeneration
+from PIL import Image
+import io
 
 load_dotenv()
 
@@ -21,6 +24,24 @@
 logger = logging.getLogger(__name__)
 
 
+# Hugging Face checkpoint used for image captioning.
+_BLIP_MODEL_ID = "Salesforce/blip-image-captioning-base"
+# Filled on first use with the loaded processor and model so that later calls
+# reuse them instead of loading the checkpoint again.
+_blip_cache = {}
+
+
+def _load_blip():
+    """Return the BLIP ``(processor, model)`` pair, loading it on first use."""
+    if not _blip_cache:
+        # Load both into locals first so a failure never leaves the cache
+        # half-populated.
+        processor = BlipProcessor.from_pretrained(_BLIP_MODEL_ID)
+        model = BlipForConditionalGeneration.from_pretrained(_BLIP_MODEL_ID)
+        _blip_cache.update(processor=processor, model=model)
+    return _blip_cache["processor"], _blip_cache["model"]
+
+
+def generate_image_caption(image_data: bytes) -> str:
+    """Generate a text caption for an encoded image.
+
+    Args:
+        image_data: Raw bytes of an image file in a format Pillow can open.
+
+    Returns:
+        The caption decoded from the model output, without special tokens.
+
+    Raises:
+        ValueError: If ``image_data`` is empty.
+
+    Errors raised by Pillow or by the model are propagated unchanged.
+    """
+    if not image_data:
+        raise ValueError("Image data is empty.")
+
+    processor, model = _load_blip()
+
+    # Decode the upload and normalise it to RGB; the context manager closes
+    # the source image once the converted copy exists.
+    with Image.open(io.BytesIO(image_data)) as source:
+        image = source.convert("RGB")
+    inputs = processor(images=image, return_tensors="pt")
+
+    # Generate token ids and turn the first sequence back into text.
+    outputs = model.generate(**inputs)
+    return processor.decode(outputs[0], skip_special_tokens=True)
+
+
 def chat_with_chatbot(messages: List[Dict[str, str]], apiKey: str, engine: str) -> str:
     if not apiKey:
         logger.error("API key is missing.")

diff --git a/app/api_routes.py b/app/api_routes.py
@@ -9,7 +9,7 @@
 from sqlalchemy.orm import joinedload
 from flask_login import login_user
 from typing import Union, List, Optional, Dict
-from .ai import chat_with_chatbot, text_to_mp3, translate_text
+from .ai import chat_with_chatbot, text_to_mp3, translate_text, generate_image_caption
 from .constants import BOT_AVATAR_API, USER_AVATAR_API
 from .helpers import create_default_chatbots
 from .data_fetcher import fetch_contribution_data
@@ -922,3 +922,21 @@ def remove_file():
         return response
     except Exception as e:
         return jsonify({"success": False, "message": str(e)}), 500
+
+
+@api_bp.route("/api/image-captioning", methods=["POST"])
+@jwt_required()
+def api_image_captioning():
+    """Caption an uploaded image.
+
+    Expects a multipart upload with the file under the ``image`` field.
+    Responds with ``{"success": True, "caption": ...}`` on success, with a 400
+    when the upload is missing, unnamed, empty or not a readable image, and
+    with a 500 for any other failure.
+    """
+    # Imported locally so the route does not rely on the module import block.
+    from PIL import UnidentifiedImageError
+
+    if "image" not in request.files:
+        return jsonify({"success": False, "message": "No image file provided."}), 400
+
+    file = request.files["image"]
+    # Covers both an empty string and a missing filename.
+    if not file.filename:
+        return jsonify({"success": False, "message": "No selected file"}), 400
+
+    try:
+        image_data = file.read()
+        if not image_data:
+            return jsonify({"success": False, "message": "Uploaded file is empty."}), 400
+        caption = generate_image_caption(image_data)
+        return jsonify({"success": True, "caption": caption})
+    except UnidentifiedImageError:
+        # An upload Pillow cannot identify is a client error, not a server fault.
+        return (
+            jsonify({"success": False, "message": "Uploaded file is not a valid image."}),
+            400,
+        )
+    except Exception as e:
+        return jsonify({"success": False, "message": str(e)}), 500
diff --git a/client/src/components/modals/command-modal.tsx b/client/src/components/modals/command-modal.tsx
@@ -15,6 +15,7 @@ import {
 import {
   AudioWaveform,
   Bot,
+  Captions,
   ChartColumn,
   Image,
   Languages,
@@ -25,6 +26,7 @@ import {
 } from "lucide-react";
 import {
   useCreateChatbotModal,
+  useImageCaptioningStore,
   useImagineModal,
   useOcrMagic,
   useSettingsModal,
@@ -45,6 +47,7 @@ export function CommandModal() {
   const ocrModal = useOcrMagic();
   const ttHModal = usettHMagic();
   const translateModal = useTranslateMagicModal();
+  const imageCaptioningModal = useImageCaptioningStore();
   const navigate = useNavigate();
 
   React.useEffect(() => {
@@ -90,6 +93,10 @@ export function CommandModal() {
               <Image />
               <span>{t("commandbox.image_generation")}</span>
             </CommandItem>
+            <CommandItem onSelect={() => imageCaptioningModal.onOpen()}>
+              <Captions />
+              <span>Image Captioning</span>
+            </CommandItem>
             <CommandItem
               onSelect={() => {
                 setOpen(false);

diff --git a/client/src/components/modals/image-captioning-magic-modal.tsx b/client/src/components/modals/image-captioning-magic-modal.tsx
@@ -0,0 +1,137 @@
+import {
+  AlertDialog,
+  AlertDialogContent,
+  AlertDialogDescription,
+  AlertDialogFooter,
+  AlertDialogHeader,
+  AlertDialogTitle,
+} from "@/components/ui/alert-dialog";
+import { SERVER_URL } from "@/lib/utils";
+import { useImageCaptioningStore } from "@/stores/modal-store";
+import axios from "axios";
+import { useState } from "react";
+import { Button } from "../ui/button";
+import toast from "react-hot-toast";
+
+import { X } from "lucide-react";
+
+import { Input } from "../ui/input";
+import { Skeleton } from "../ui/skeleton";
+
+/**
+ * Modal that uploads an image to the `/api/image-captioning` endpoint and
+ * shows the returned caption next to a local preview of the chosen file.
+ */
+export default function ImageCaptioningModal() {
+  const modal = useImageCaptioningStore();
+  const [loading, setLoading] = useState(false);
+  const [caption, setCaption] = useState("");
+
+  const [selectedFile, setSelectedFile] = useState<File | null>(null);
+  const [imagePreview, setImagePreview] = useState<string | null>(null);
+
+  // Remember the chosen file and point the preview at a fresh object URL.
+  const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
+    const file = event.target.files ? event.target.files[0] : null;
+    if (!file) return;
+
+    // Release the previous preview so replaced blobs are not kept alive.
+    if (imagePreview) URL.revokeObjectURL(imagePreview);
+    setSelectedFile(file);
+    setImagePreview(URL.createObjectURL(file));
+    // A caption generated for the previous image no longer applies.
+    setCaption("");
+  };
+
+  // Send the selected file to the server and display the caption it returns.
+  const handleCaptionSubmit = async (
+    event: React.FormEvent<HTMLFormElement>
+  ) => {
+    event.preventDefault();
+    setCaption("");
+    if (!selectedFile) {
+      toast.error("Please select a file!");
+      return;
+    }
+
+    const formData = new FormData();
+    formData.append("image", selectedFile);
+    setLoading(true);
+    try {
+      const token = localStorage.getItem("token");
+
+      const authHeaders = {
+        Authorization: `Bearer ${token || ""}`,
+      };
+
+      const response = await axios.post(
+        `${SERVER_URL}/api/image-captioning`,
+        formData,
+        {
+          headers: authHeaders,
+        }
+      );
+      setCaption(response.data.caption);
+    } catch (error) {
+      console.error("Error fetching image caption:", error);
+      // Surface the failure to the user, preferring the server's message.
+      const serverMessage = axios.isAxiosError(error)
+        ? error.response?.data?.message
+        : undefined;
+      toast.error(
+        typeof serverMessage === "string" && serverMessage
+          ? serverMessage
+          : "Failed to generate caption."
+      );
+    } finally {
+      setLoading(false);
+    }
+  };
+
+  return (
+    <AlertDialog open={modal.isOpen} onOpenChange={() => modal.onClose()}>
+      <AlertDialogContent>
+        <AlertDialogHeader>
+          <AlertDialogTitle>
+            <div className="flex items-center justify-between">
+              <p>Image Captioning Tool</p>
+              <Button
+                variant={"outline"}
+                size={"icon"}
+                className="rounded-full"
+                onClick={() => modal.onClose()}
+              >
+                <X />
+              </Button>
+            </div>
+          </AlertDialogTitle>
+          <AlertDialogDescription>
+            Generate a Caption for Your Image.
+          </AlertDialogDescription>
+          <div className="grid gap-4 w-full">
+            {imagePreview && (
+              <div className="relative aspect-video">
+                <img
+                  src={imagePreview}
+                  alt="Uploaded image preview"
+                  className="object-contain w-full h-full"
+                />
+              </div>
+            )}
+            {loading && (
+              <div className="space-y-2">
+                <Skeleton className="h-4 w-full" />
+                <Skeleton className="h-4 w-[90%]" />
+                <Skeleton className="h-4 w-[75%]" />
+              </div>
+            )}
+            {caption && (
+              <div className="p-4 bg-muted rounded-md">
+                <p className="text-sm">{caption}</p>
+              </div>
+            )}
+            <div className="flex items-center gap-4">
+              <form
+                onSubmit={handleCaptionSubmit}
+                className="w-full flex items-center flex-col gap-4"
+              >
+                <Input
+                  disabled={loading}
+                  type="file"
+                  onChange={handleFileChange}
+                  accept="image/*"
+                  className="cursor-pointer"
+                />
+                <Button
+                  disabled={loading}
+                  className="w-full"
+                  variant={"outline"}
+                  type="submit"
+                >
+                  {loading ? "Generating Caption..." : "Generate Caption"}
+                </Button>
+              </form>
+            </div>
+          </div>
+        </AlertDialogHeader>
+        <AlertDialogFooter></AlertDialogFooter>
+      </AlertDialogContent>
+    </AlertDialog>
+  );
+}
diff --git a/client/src/contexts/modals.tsx b/client/src/contexts/modals.tsx
@@ -1,5 +1,6 @@
 import CreateChatbotModal from "@/components/modals/create-chatbot-modal";
 import DeleteChatbotModal from "@/components/modals/delete-chatbot-modal";
+import ImageCaptioningModal from "@/components/modals/image-captioning-magic-modal";
 import ImagineModal from "@/components/modals/imgine-modal";
 import OcrMagicModal from "@/components/modals/ocr-magic-modal";
 import SettingsModal from "@/components/modals/settings-modal";
@@ -21,6 +22,7 @@ export default function Modals() {
       <OcrMagicModal />
       <TtHMagicModal />
       <TtsMagicModal />
+      <ImageCaptioningModal />
       <TranslateMagicModal />
       <ImagineModal />
       <DeleteChatbotModal />

diff --git a/client/src/stores/modal-store.ts b/client/src/stores/modal-store.ts
@@ -12,3 +12,4 @@ export const useTranslateMagicModal = create<DefaultModal>(defaultModalValues);
 export const useImagineModal = create<DefaultModal>(defaultModalValues);
 export const useOcrMagic = create<DefaultModal>(defaultModalValues);
 export const usettHMagic = create<DefaultModal>(defaultModalValues);
+// Open/close state for the image captioning modal.
+export const useImageCaptioningStore = create<DefaultModal>(defaultModalValues);
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -21,3 +21,5 @@ translate
 pytesseract
 pillow
 fpdf
+transformers
+torch
diff --git a/requirements.txt b/requirements.txt
@@ -19,3 +19,4 @@ pytesseract
 pillow
 fpdf
 translate
+transformers
+torch