Skip to content

Commit

Permalink
Merge pull request #281 from priyanshuverma-dev/feat-image-captioning…
Browse files Browse the repository at this point in the history
…-magic-tool

feat: Image Captioning Magic tool
  • Loading branch information
kom-senapati authored Nov 7, 2024
2 parents 4c79db5 + 24bfd56 commit 64e92ca
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 1 deletion.
21 changes: 21 additions & 0 deletions app/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
from bs4 import BeautifulSoup
import markdown
from translate import Translator
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import io

load_dotenv()

Expand All @@ -21,6 +24,24 @@
logger = logging.getLogger(__name__)


_BLIP_MODEL_NAME = "Salesforce/blip-image-captioning-base"

# Lazily populated (processor, model) pair. Loading the pretrained weights is
# by far the most expensive step, so it is done once per process instead of
# once per request.
_blip_components = None


def _get_blip_components():
    """Return the cached BLIP (processor, model) pair, loading it on first use."""
    global _blip_components
    if _blip_components is None:
        processor = BlipProcessor.from_pretrained(_BLIP_MODEL_NAME)
        model = BlipForConditionalGeneration.from_pretrained(_BLIP_MODEL_NAME)
        _blip_components = (processor, model)
    return _blip_components


def generate_image_caption(image_data: bytes) -> str:
    """Generate a caption for an image using the BLIP captioning model.

    Args:
        image_data: Raw bytes of an encoded image, in any format Pillow can open.

    Returns:
        The decoded caption text, with special tokens stripped.

    Raises:
        ValueError: If ``image_data`` is empty.
        Exception: Any error raised by Pillow while decoding the image or by
            transformers while loading the model or generating is propagated
            to the caller.

    Note:
        The processor is asked for PyTorch tensors (``return_tensors="pt"``),
        so ``torch`` must be installed at runtime.
    """
    if not image_data:
        raise ValueError("image_data is empty")

    processor, model = _get_blip_components()

    # convert() returns a new image, so the handle opened on the byte buffer
    # can be released as soon as the RGB copy exists.
    with Image.open(io.BytesIO(image_data)) as raw_image:
        image = raw_image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt")

    # Generate token ids and decode the first (only) sequence into text.
    outputs = model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)

    return caption


def chat_with_chatbot(messages: List[Dict[str, str]], apiKey: str, engine: str) -> str:
if not apiKey:
logger.error("API key is missing.")
Expand Down
20 changes: 19 additions & 1 deletion app/api_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from sqlalchemy.orm import joinedload
from flask_login import login_user
from typing import Union, List, Optional, Dict
from .ai import chat_with_chatbot, text_to_mp3, translate_text
from .ai import chat_with_chatbot, text_to_mp3, translate_text, generate_image_caption
from .constants import BOT_AVATAR_API, USER_AVATAR_API
from .helpers import create_default_chatbots
from .data_fetcher import fetch_contribution_data
Expand Down Expand Up @@ -922,3 +922,21 @@ def remove_file():
return response
except Exception as e:
return jsonify({"success": False, "message": str(e)}), 500


@api_bp.route("/api/image-captioning", methods=["POST"])
@jwt_required()
def api_image_captioning():
    """Caption the image uploaded in the ``image`` multipart form field.

    Responds with ``{"success": True, "caption": ...}`` on success, a 400
    JSON error when the field is missing or has an empty filename, and a 500
    JSON error carrying the exception text when captioning fails.
    """
    uploads = request.files
    if "image" not in uploads:
        return jsonify({"success": False, "message": "No image file provided."}), 400

    upload = uploads["image"]
    if upload.filename == "":
        return jsonify({"success": False, "message": "No selected file"}), 400

    try:
        # Read the whole upload into memory and hand the bytes to the model helper.
        caption = generate_image_caption(upload.read())
        return jsonify({"success": True, "caption": caption})
    except Exception as exc:
        return jsonify({"success": False, "message": str(exc)}), 500
7 changes: 7 additions & 0 deletions client/src/components/modals/command-modal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import {
import {
AudioWaveform,
Bot,
Captions,
ChartColumn,
Image,
Languages,
Expand All @@ -25,6 +26,7 @@ import {
} from "lucide-react";
import {
useCreateChatbotModal,
useImageCaptioningStore,
useImagineModal,
useOcrMagic,
useSettingsModal,
Expand All @@ -45,6 +47,7 @@ export function CommandModal() {
const ocrModal = useOcrMagic();
const ttHModal = usettHMagic();
const translateModal = useTranslateMagicModal();
const imageCaptioningModal = useImageCaptioningStore();
const navigate = useNavigate();

React.useEffect(() => {
Expand Down Expand Up @@ -90,6 +93,10 @@ export function CommandModal() {
<Image />
<span>{t("commandbox.image_generation")}</span>
</CommandItem>
<CommandItem onSelect={() => imageCaptioningModal.onOpen()}>
<Captions />
<span>Image Captioning</span>
</CommandItem>
<CommandItem
onSelect={() => {
setOpen(false);
Expand Down
137 changes: 137 additions & 0 deletions client/src/components/modals/image-captioning-magic-modal.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import {
AlertDialog,
AlertDialogContent,
AlertDialogDescription,
AlertDialogFooter,
AlertDialogHeader,
AlertDialogTitle,
} from "@/components/ui/alert-dialog";
import { SERVER_URL } from "@/lib/utils";
import { useImageCaptioningStore } from "@/stores/modal-store";
import axios from "axios";
import { useState } from "react";
import { Button } from "../ui/button";
import toast from "react-hot-toast";

import { X } from "lucide-react";

import { Input } from "../ui/input";
import { Skeleton } from "../ui/skeleton";

/**
 * Modal that lets the user pick an image, shows a preview of it, and asks the
 * `/api/image-captioning` endpoint for a caption.
 *
 * Visibility is driven by `useImageCaptioningStore`; the selected file, its
 * preview URL, the caption and the loading flag are local component state.
 */
export default function ImageCaptioningModal() {
  const modal = useImageCaptioningStore();
  const [loading, setLoading] = useState(false);
  const [caption, setCaption] = useState("");

  const [selectedFile, setSelectedFile] = useState<File | null>(null);
  const [imagePreview, setImagePreview] = useState<string | null>(null);

  /** Stores the newly chosen file and swaps the preview to it. */
  const handleFileChange = (event: React.ChangeEvent<HTMLInputElement>) => {
    const file = event.target.files?.[0] ?? null;
    if (!file) return;

    // Object URLs stay alive until revoked, so release the previous preview
    // before creating a new one; otherwise every re-selection leaks a blob URL.
    if (imagePreview) URL.revokeObjectURL(imagePreview);

    // A caption generated for the previous image no longer matches the preview.
    setCaption("");
    setSelectedFile(file);
    setImagePreview(URL.createObjectURL(file));
  };

  /** Uploads the selected file and displays the caption returned by the server. */
  const handleCaptionSubmit = async (
    event: React.FormEvent<HTMLFormElement>
  ) => {
    event.preventDefault();
    setCaption("");
    if (!selectedFile) {
      toast.error("Please select a file!");
      return;
    }

    const formData = new FormData();
    formData.append("image", selectedFile);
    setLoading(true);
    try {
      const token = localStorage.getItem("token");

      const authHeaders = {
        Authorization: `Bearer ${token ?? ""}`,
      };

      const response = await axios.post<{ caption: string }>(
        `${SERVER_URL}/api/image-captioning`,
        formData,
        {
          headers: authHeaders,
        }
      );
      setCaption(response.data.caption);
    } catch (error) {
      console.error("Error fetching image caption:", error);
      // Surface the failure; previously the skeleton just disappeared silently.
      toast.error("Failed to generate a caption. Please try again.");
    } finally {
      setLoading(false);
    }
  };

  return (
    <AlertDialog open={modal.isOpen} onOpenChange={() => modal.onClose()}>
      <AlertDialogContent>
        <AlertDialogHeader>
          <AlertDialogTitle>
            <div className="flex items-center justify-between">
              <p>Image Captioning Tool</p>
              <Button
                variant={"outline"}
                size={"icon"}
                className="rounded-full"
                onClick={() => modal.onClose()}
              >
                <X />
              </Button>
            </div>
          </AlertDialogTitle>
          <AlertDialogDescription>
            Generate a Caption for Your Image.
          </AlertDialogDescription>
          <div className="grid gap-4 w-full">
            {imagePreview && (
              <div className="relative aspect-video">
                <img
                  src={imagePreview}
                  alt="Uploaded image preview"
                  className="object-contain w-full h-full"
                />
              </div>
            )}
            {loading && (
              <div className="space-y-2">
                <Skeleton className="h-4 w-full" />
                <Skeleton className="h-4 w-[90%]" />
                <Skeleton className="h-4 w-[75%]" />
              </div>
            )}
            {caption && (
              <div className="p-4 bg-muted rounded-md">
                <p className="text-sm">{caption}</p>
              </div>
            )}
            <div className="flex items-center gap-4">
              <form
                onSubmit={handleCaptionSubmit}
                className="w-full flex items-center flex-col gap-4"
              >
                <Input
                  disabled={loading}
                  type="file"
                  onChange={handleFileChange}
                  accept="image/*"
                  className="cursor-pointer"
                />
                <Button
                  disabled={loading}
                  className="w-full"
                  variant={"outline"}
                  type="submit"
                >
                  {loading ? "Generating Caption..." : "Generate Caption"}
                </Button>
              </form>
            </div>
          </div>
        </AlertDialogHeader>
        <AlertDialogFooter></AlertDialogFooter>
      </AlertDialogContent>
    </AlertDialog>
  );
}
2 changes: 2 additions & 0 deletions client/src/contexts/modals.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import CreateChatbotModal from "@/components/modals/create-chatbot-modal";
import DeleteChatbotModal from "@/components/modals/delete-chatbot-modal";
import ImageCaptioningModal from "@/components/modals/image-captioning-magic-modal";
import ImagineModal from "@/components/modals/imgine-modal";
import OcrMagicModal from "@/components/modals/ocr-magic-modal";
import SettingsModal from "@/components/modals/settings-modal";
Expand All @@ -21,6 +22,7 @@ export default function Modals() {
<OcrMagicModal />
<TtHMagicModal />
<TtsMagicModal />
<ImageCaptioningModal />
<TranslateMagicModal />
<ImagineModal />
<DeleteChatbotModal />
Expand Down
1 change: 1 addition & 0 deletions client/src/stores/modal-store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ export const useTranslateMagicModal = create<DefaultModal>(defaultModalValues);
// One store hook per modal, each created by passing the shared
// `defaultModalValues` initializer to `create`.
export const useImagineModal = create<DefaultModal>(defaultModalValues);
export const useOcrMagic = create<DefaultModal>(defaultModalValues);
export const usettHMagic = create<DefaultModal>(defaultModalValues);
// Controls the Image Captioning modal (opened from the command palette).
export const useImageCaptioningStore = create<DefaultModal>(defaultModalValues);
2 changes: 2 additions & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,5 @@ translate
pytesseract
pillow
fpdf
transformers
torch
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ pytesseract
pillow
fpdf
translate
transformers

0 comments on commit 64e92ca

Please sign in to comment.