Skip to content

Commit

Permalink
gpt4vision api
Browse files Browse the repository at this point in the history
  • Loading branch information
Kye committed Nov 25, 2023
1 parent 399099e commit 9c3a292
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 38 deletions.
17 changes: 17 additions & 0 deletions multi_modal_auto_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from swarms.structs import Flow
from swarms.models.gpt4_vision_api import GPT4VisionAPI


# Vision-capable LLM that will back the agent.
llm = GPT4VisionAPI()

# Prompt and the local image the agent should reason about.
task = "What is the color of the object?"
img = "images/swarms.jpeg"

# Build the workflow: the agent decides when to stop ('auto'),
# with the live dashboard enabled.
flow = Flow(llm=llm, max_loops='auto', dashboard=True)

# Kick off the multi-modal run.
flow.run(task, img)

This file was deleted.

4 changes: 4 additions & 0 deletions swarms/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from swarms.models.vilt import Vilt  # noqa: E402
from swarms.models.nougat import Nougat  # noqa: E402
from swarms.models.layoutlm_document_qa import LayoutLMDocumentQA  # noqa: E402
from swarms.models.gpt4_vision_api import GPT4VisionAPI  # noqa: E402  (was the typo "E40", which suppresses nothing)

# from swarms.models.gpt4v import GPT4Vision
# from swarms.models.dalle3 import Dalle3
Expand All @@ -49,4 +50,7 @@
"WizardLLMStoryTeller",
# "GPT4Vision",
# "Dalle3",
# "DistilWhisperModel",
"GPT4VisionAPI",

]
127 changes: 127 additions & 0 deletions swarms/models/gpt4_vision_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import base64
import os
import requests
from dotenv import load_dotenv

# Load environment variables from a local .env file (no-op if absent),
# then capture the API key once at import time for use as the default
# credential in GPT4VisionAPI below.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

class GPT4VisionAPI:
    """
    Thin wrapper around the OpenAI GPT-4 Vision chat-completions endpoint.

    Parameters
    ----------
    openai_api_key : str, optional
        OpenAI API key. Falls back to the ``OPENAI_API_KEY`` environment
        variable when not given.
    model : str, optional
        Model name sent to the API. Defaults to ``"gpt-4-vision-preview"``
        (the previously hard-coded value).
    max_tokens : int, optional
        Completion token limit. Defaults to 300 (the previously hard-coded
        value).

    Methods
    -------
    encode_image(img: str)
        Base64-encode a local image file.
    run(task: str, img: str)
        Send the task + image and return the model's text answer.
    __call__(task: str, img: str)
        Send the task + image and return the raw JSON response dict.

    Examples:
    ---------
    >>> from swarms.models import GPT4VisionAPI
    >>> llm = GPT4VisionAPI()
    >>> task = "What is the color of the object?"
    >>> img = "https://i.imgur.com/2M2ZGwC.jpeg"
    >>> llm.run(task, img)
    """

    def __init__(
        self,
        openai_api_key: str = None,
        model: str = "gpt-4-vision-preview",
        max_tokens: int = 300,
    ):
        super().__init__()
        # Resolve the key at construction time instead of class-definition
        # time, so the class does not depend on a module-level lookup and
        # late changes to the environment are picked up.
        self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        self.model = model
        self.max_tokens = max_tokens

    def encode_image(self, img: str) -> str:
        """Return the base64 (UTF-8 text) encoding of the file at *img*."""
        with open(img, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def _post(self, task: str, img: str) -> dict:
        """Shared request logic for run() and __call__().

        Builds the vision payload, POSTs it, and returns the decoded JSON
        response. NOTE(review): assumes *img* is a local file path (it is
        opened and inlined as a base64 data URL), not a remote URL.
        """
        base64_image = self.encode_image(img)
        headers = {
            "Content-Type": "application/json",
            # Bug fix: use the key bound to this instance, not the module
            # global, so per-instance keys actually take effect.
            "Authorization": f"Bearer {self.openai_api_key}",
        }
        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": task},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            "max_tokens": self.max_tokens,
        }
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
        )
        return response.json()

    # Function to handle vision tasks
    def run(self, task: str, img: str):
        """Run the model and return the generated text answer.

        Logs and re-raises any request/parsing error.
        """
        try:
            out = self._post(task, img)
            # Bug fixes: chat-completions responses expose the text at
            # choices[0].message.content (not choices[0]["text"]), and the
            # original computed the value but never returned it.
            return out["choices"][0]["message"]["content"]
        except Exception as error:
            print(f"Error with the request: {error}")
            raise error

    def __call__(self, task: str, img: str):
        """Run the model and return the full JSON response dict."""
        try:
            return self._post(task, img)
        except Exception as error:
            print(f"Error with the request: {error}")
            raise error
17 changes: 12 additions & 5 deletions swarms/structs/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ def activate_autonomous_agent(self):
)
print(error)

def run(self, task: str, **kwargs):
def run(self, task: str, img: Optional[str], **kwargs):
"""
Run the autonomous agent loop
Expand Down Expand Up @@ -550,10 +550,17 @@ def run(self, task: str, **kwargs):
attempt = 0
while attempt < self.retry_attempts:
try:
response = self.llm(
task,
**kwargs,
)
if img:
response = self.llm(
task,
img,
**kwargs,
)
else:
response = self.llm(
task,
**kwargs,
)

# If code interpreter is enabled then run the code
if self.code_interpreter:
Expand Down

0 comments on commit 9c3a292

Please sign in to comment.