Skip to content

Commit

Permalink
Printing essential print statements, hiding ffmpeg prints, updated i …
Browse files Browse the repository at this point in the history
…profile
  • Loading branch information
KillianLucas committed Mar 14, 2024
1 parent e61687d commit 3d225fb
Show file tree
Hide file tree
Showing 9 changed files with 378 additions and 132 deletions.
8 changes: 4 additions & 4 deletions 01OS/01OS/clients/base_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def record_audio(self):

"""Record audio from the microphone and add it to the queue."""
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
logger.info("Recording started...")
print("Recording started...")
global RECORDING

# Create a temporary WAV file to store the audio data
Expand All @@ -171,7 +171,7 @@ def record_audio(self):
wav_file.close()
stream.stop_stream()
stream.close()
logger.info("Recording stopped.")
print("Recording stopped.")

duration = wav_file.getnframes() / RATE
if duration < 0.3:
Expand Down Expand Up @@ -255,9 +255,9 @@ async def websocket_communication(self, WS_URL):
try:
async with websockets.connect(WS_URL) as websocket:
if CAMERA_ENABLED:
logger.info("Press the spacebar to start/stop recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit.")
print("Press the spacebar to start/stop recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit.")
else:
logger.info("Press the spacebar to start/stop recording. Press CTRL-C to exit.")
print("Press the spacebar to start/stop recording. Press CTRL-C to exit.")

asyncio.create_task(self.message_sender(websocket))

Expand Down
330 changes: 306 additions & 24 deletions 01OS/01OS/server/i.py

Large diffs are not rendered by default.

153 changes: 59 additions & 94 deletions 01OS/01OS/server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,11 +220,15 @@ async def listener():
time.sleep(15)

text = stt(audio_file_path)
print(text)
print("> ", text)
message = {"role": "user", "type": "message", "content": text}

# At this point, we have only text messages

if type(message["content"]) != str:
print("This should be a string, but it's not:", message["content"])
message["content"] = message["content"].decode()

# Custom stop message will halt us
if message["content"].lower().strip(".,! ") == "stop":
continue
Expand All @@ -238,116 +242,77 @@ async def listener():

accumulated_text = ""

force_task_completion_message = """AUTOMATED MESSAGE: Proceed. You CAN run code on my machine. If you want to run code, start your message with "```"! If the entire task I asked for is done, say exactly 'The task is done.' If you need some specific information (like username or password) say EXACTLY 'Please provide more information.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going."""
interpreter.messages = [m for m in interpreter.messages if m["content"] != force_task_completion_message]
insert_force_task_completion_message = True

if any([m["type"] == "image" for m in messages]) and interpreter.llm.model.startswith("gpt-"):
interpreter.llm.model = "gpt-4-vision-preview"
interpreter.llm.supports_vision = True

for chunk in interpreter.chat(messages, stream=True, display=True):

while insert_force_task_completion_message == True:

for chunk in interpreter.chat(messages, stream=True, display=True):

if chunk["type"] == "code":
insert_force_task_completion_message = False

if any([m["type"] == "image" for m in interpreter.messages]):
interpreter.llm.model = "gpt-4-vision-preview"
if any([m["type"] == "image" for m in interpreter.messages]):
interpreter.llm.model = "gpt-4-vision-preview"

logger.debug("Got chunk:", chunk)
logger.debug("Got chunk:", chunk)

# Send it to the user
await to_device.put(chunk)
# Yield to the event loop, so you actually send it out
await asyncio.sleep(0.01)

if os.getenv('TTS_RUNNER') == "server":
# Speak full sentences out loud
if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
accumulated_text += chunk["content"]
sentences = split_into_sentences(accumulated_text)

# If we're going to speak, say we're going to stop sending text.
# This should be fixed probably, we should be able to do both in parallel, or only one.
if any(is_full_sentence(sentence) for sentence in sentences):
await to_device.put({"role": "assistant", "type": "message", "end": True})

if is_full_sentence(sentences[-1]):
for sentence in sentences:
await stream_tts_to_device(sentence)
accumulated_text = ""
else:
for sentence in sentences[:-1]:
await stream_tts_to_device(sentence)
accumulated_text = sentences[-1]

# If we're going to speak, say we're going to stop sending text.
# This should be fixed probably, we should be able to do both in parallel, or only one.
if any(is_full_sentence(sentence) for sentence in sentences):
await to_device.put({"role": "assistant", "type": "message", "start": True})
# Send it to the user
await to_device.put(chunk)
# Yield to the event loop, so you actually send it out
await asyncio.sleep(0.01)

if os.getenv('TTS_RUNNER') == "server":
# Speak full sentences out loud
if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
accumulated_text += chunk["content"]
sentences = split_into_sentences(accumulated_text)

# If we have a new message, save our progress and go back to the top
if not from_user.empty():

# Check if it's just an end flag. We ignore those.
temp_message = await from_user.get()
# If we're going to speak, say we're going to stop sending text.
# This should be fixed probably, we should be able to do both in parallel, or only one.
if any(is_full_sentence(sentence) for sentence in sentences):
await to_device.put({"role": "assistant", "type": "message", "end": True})

if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
# Yup. False alarm.
continue
if is_full_sentence(sentences[-1]):
for sentence in sentences:
await stream_tts_to_device(sentence)
accumulated_text = ""
else:
# Whoops! Put that back
await from_user.put(temp_message)
for sentence in sentences[:-1]:
await stream_tts_to_device(sentence)
accumulated_text = sentences[-1]

# If we're going to speak, say we're going to stop sending text.
# This should be fixed probably, we should be able to do both in parallel, or only one.
if any(is_full_sentence(sentence) for sentence in sentences):
await to_device.put({"role": "assistant", "type": "message", "start": True})

# If we have a new message, save our progress and go back to the top
if not from_user.empty():

with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)
# Check if it's just an end flag. We ignore those.
temp_message = await from_user.get()

if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
# Yup. False alarm.
continue
else:
# Whoops! Put that back
await from_user.put(temp_message)

# TODO: is triggering seemingly randomly
#logger.info("New user message received. Breaking.")
#break
with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)

# Also check if there's any new computer messages
if not from_computer.empty():

with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)
# TODO: is triggering seemingly randomly
#logger.info("New user message received. Breaking.")
#break

logger.info("New computer message received. Breaking.")
break
else:
# Also check if there's any new computer messages
if not from_computer.empty():
with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)

force_task_completion_responses = [
"the task is done.",
"the task is impossible.",
"let me know what you'd like to do next.",
"please provide more information.",
]

# Did the LLM respond with one of the key messages?
if (
interpreter.messages
and any(
task_status in interpreter.messages[-1].get("content", "").lower()
for task_status in force_task_completion_responses
)
):
insert_force_task_completion_message = False
break

if insert_force_task_completion_message:
interpreter.messages += [
{
"role": "user",
"type": "message",
"content": force_task_completion_message,
}
]
else:
break
logger.info("New computer message received. Breaking.")
break


async def stream_tts_to_device(sentence):
force_task_completion_responses = [
Expand Down
4 changes: 2 additions & 2 deletions 01OS/01OS/server/services/stt/local-whisper/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
f='s16le',
ar='16000',
ac=1,
).output(output_path).run()
).output(output_path, loglevel='panic').run()
else:
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k', loglevel='panic').run()

try:
yield output_path
Expand Down
5 changes: 2 additions & 3 deletions 01OS/01OS/server/services/stt/openai/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,15 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:

# Export to wav
output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
print(mime_type, input_path, output_path)
if mime_type == "audio/raw":
ffmpeg.input(
input_path,
f='s16le',
ar='16000',
ac=1,
).output(output_path).run()
).output(output_path, loglevel='panic').run()
else:
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k', loglevel='panic').run()

try:
yield output_path
Expand Down
2 changes: 1 addition & 1 deletion 01OS/01OS/server/services/tts/openai/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def tts(self, text):

# TODO: hack to format audio correctly for device
outfile = tempfile.gettempdir() + "/" + "raw.dat"
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run()
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1", loglevel='panic').run()

return outfile

Expand Down
2 changes: 1 addition & 1 deletion 01OS/01OS/server/services/tts/piper/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def tts(self, text):

# TODO: hack to format audio correctly for device
outfile = tempfile.gettempdir() + "/" + "raw.dat"
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run()
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1", loglevel='panic').run()

return outfile

Expand Down
4 changes: 2 additions & 2 deletions 01OS/01OS/server/utils/bytes_to_wav.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
f='s16le',
ar='16000',
ac=1,
).output(output_path).run()
).output(output_path, loglevel='panic').run()
else:
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k', loglevel='panic').run()

try:
yield output_path
Expand Down
2 changes: 1 addition & 1 deletion 01OS/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pydub = "^0.25.1"
ngrok = "^1.0.0"
simpleaudio = "^1.0.4"
opencv-python = "^4.9.0.80"
open-interpreter = {version = "0.2.1rc2", extras = ["os"]}
open-interpreter = { git = "https://github.com/KillianLucas/open-interpreter.git", branch = "development" }
psutil = "^5.9.8"
typer = "^0.9.0"
platformdirs = "^4.2.0"
Expand Down

0 comments on commit 3d225fb

Please sign in to comment.