From 9c3a292938d53a0ff462d4bfb3d477f4c005dea3 Mon Sep 17 00:00:00 2001
From: Kye
Date: Fri, 24 Nov 2023 16:56:05 -0800
Subject: [PATCH] gpt4vision api

---
 multi_modal_auto_agent.py                          |  17 +++
 .../multi_modal_auto_agent.py                      |  33 -----
 swarms/models/__init__.py                          |   3 +
 swarms/models/gpt4_vision_api.py                   | 121 +++++++++++++++++++++
 swarms/structs/flow.py                             |  17 ++-
 5 files changed, 153 insertions(+), 38 deletions(-)
 create mode 100644 multi_modal_auto_agent.py
 delete mode 100644 playground/demos/multi_modal_autonomous_agents/multi_modal_auto_agent.py
 create mode 100644 swarms/models/gpt4_vision_api.py

diff --git a/multi_modal_auto_agent.py b/multi_modal_auto_agent.py
new file mode 100644
index 000000000..5d27dc42e
--- /dev/null
+++ b/multi_modal_auto_agent.py
@@ -0,0 +1,17 @@
+from swarms.structs import Flow
+from swarms.models.gpt4_vision_api import GPT4VisionAPI
+
+
+llm = GPT4VisionAPI()
+
+task = "What is the color of the object?"
+img = "images/swarms.jpeg"
+
+# Initialize the workflow
+flow = Flow(
+    llm=llm,
+    max_loops="auto",
+    dashboard=True,
+)
+
+flow.run(task=task, img=img)
\ No newline at end of file
diff --git a/playground/demos/multi_modal_autonomous_agents/multi_modal_auto_agent.py b/playground/demos/multi_modal_autonomous_agents/multi_modal_auto_agent.py
deleted file mode 100644
index a2602706e..000000000
--- a/playground/demos/multi_modal_autonomous_agents/multi_modal_auto_agent.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from swarms.structs import Flow
-from swarms.models import Idefics
-
-# Multi Modality Auto Agent
-llm = Idefics(max_length=2000)
-
-task = (
-    "User: What is in this image?"
-    " https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"
-)
-
-## Initialize the workflow
-flow = Flow(
-    llm=llm,
-    max_loops=2,
-    dashboard=True,
-    # stopping_condition=None,  # You can define a stopping condition as needed.
-    # loop_interval=1,
-    # retry_attempts=3,
-    # retry_interval=1,
-    # interactive=False,  # Set to 'True' for interactive mode.
-    # dynamic_temperature=False,  # Set to 'True' for dynamic temperature handling.
-)
-
-# out = flow.load_state("flow_state.json")
-# temp = flow.dynamic_temperature()
-# filter = flow.add_response_filter("Trump")
-out = flow.run(task)
-# out = flow.validate_response(out)
-# out = flow.analyze_feedback(out)
-# out = flow.print_history_and_memory()
-# # out = flow.save_state("flow_state.json")
-# print(out)
diff --git a/swarms/models/__init__.py b/swarms/models/__init__.py
index 10bf2fab7..aa1da8f73 100644
--- a/swarms/models/__init__.py
+++ b/swarms/models/__init__.py
@@ -25,6 +25,7 @@
 from swarms.models.vilt import Vilt  # noqa: E402
 from swarms.models.nougat import Nougat  # noqa: E402
 from swarms.models.layoutlm_document_qa import LayoutLMDocumentQA  # noqa: E402
+from swarms.models.gpt4_vision_api import GPT4VisionAPI  # noqa: E402
 
 # from swarms.models.gpt4v import GPT4Vision
 # from swarms.models.dalle3 import Dalle3
@@ -49,4 +50,6 @@
     "WizardLLMStoryTeller",
     # "GPT4Vision",
     # "Dalle3",
+    # "DistilWhisperModel",
+    "GPT4VisionAPI",
 ]
diff --git a/swarms/models/gpt4_vision_api.py b/swarms/models/gpt4_vision_api.py
new file mode 100644
index 000000000..057362618
--- /dev/null
+++ b/swarms/models/gpt4_vision_api.py
@@ -0,0 +1,121 @@
+import base64
+import os
+
+import requests
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+openai_api_key = os.getenv("OPENAI_API_KEY")
+
+
+class GPT4VisionAPI:
+    """
+    GPT-4 Vision API
+
+    This class wraps the OpenAI chat completions endpoint to run the GPT-4 Vision model.
+
+    Parameters
+    ----------
+    openai_api_key : str
+        The OpenAI API key. Defaults to the OPENAI_API_KEY environment variable.
+
+    Methods
+    -------
+    encode_image(img: str)
+        Encode a local image file to base64.
+    run(task: str, img: str)
+        Run the model and return the completion text.
+    __call__(task: str, img: str)
+        Run the model and return the raw JSON response.
+
+    Examples
+    --------
+    >>> from swarms.models import GPT4VisionAPI
+    >>> llm = GPT4VisionAPI()
+    >>> task = "What is the color of the object?"
+    >>> img = "images/swarms.jpeg"
+    >>> llm.run(task, img)
+    """
+
+    def __init__(self, openai_api_key: str = openai_api_key):
+        self.openai_api_key = openai_api_key
+
+    def encode_image(self, img: str):
+        """Encode a local image file to a base64 string."""
+        with open(img, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode("utf-8")
+
+    def run(self, task: str, img: str):
+        """Run the model on a vision task and return the completion text."""
+        try:
+            base64_image = self.encode_image(img)
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.openai_api_key}",
+            }
+            payload = {
+                "model": "gpt-4-vision-preview",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": task},
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{base64_image}"
+                                },
+                            },
+                        ],
+                    }
+                ],
+                "max_tokens": 300,
+            }
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json=payload,
+            )
+            out = response.json()
+            # Chat completions nest the text under message.content, not "text"
+            return out["choices"][0]["message"]["content"]
+        except Exception as error:
+            print(f"Error with the request: {error}")
+            raise error
+
+    def __call__(self, task: str, img: str):
+        """Run the model on a vision task and return the raw JSON response."""
+        try:
+            base64_image = self.encode_image(img)
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.openai_api_key}",
+            }
+            payload = {
+                "model": "gpt-4-vision-preview",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": task},
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{base64_image}"
+                                },
+                            },
+                        ],
+                    }
+                ],
+                "max_tokens": 300,
+            }
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json=payload,
+            )
+            return response.json()
+        except Exception as error:
+            print(f"Error with the request: {error}")
+            raise error
diff --git a/swarms/structs/flow.py b/swarms/structs/flow.py
index 99a3e5873..2287273cd 100644
--- a/swarms/structs/flow.py
+++ b/swarms/structs/flow.py
@@ -496,7 +496,7 @@ def activate_autonomous_agent(self):
             )
             print(error)
 
-    def run(self, task: str, **kwargs):
+    def run(self, task: str, img: Optional[str] = None, **kwargs):
         """
         Run the autonomous agent loop
 
@@ -550,10 +550,17 @@ def run(self, task: str, **kwargs):
             attempt = 0
             while attempt < self.retry_attempts:
                 try:
-                    response = self.llm(
-                        task,
-                        **kwargs,
-                    )
+                    if img:
+                        response = self.llm(
+                            task,
+                            img,
+                            **kwargs,
+                        )
+                    else:
+                        response = self.llm(
+                            task,
+                            **kwargs,
+                        )
 
                     # If code interpreter is enabled then run the code
                     if self.code_interpreter:
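Usage sketch: with this patch applied, the vision model can be called directly as well as through a Flow. This is a minimal sketch, assuming OPENAI_API_KEY is set in the environment (or a .env file) and that images/swarms.jpeg is a local file, as in the example script above:

    from swarms.models.gpt4_vision_api import GPT4VisionAPI

    llm = GPT4VisionAPI()  # key is read from OPENAI_API_KEY via load_dotenv()

    # run() base64-encodes the local image, posts it to the chat
    # completions endpoint, and returns the completion text
    answer = llm.run("What is the color of the object?", "images/swarms.jpeg")
    print(answer)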