Skip to content

Commit

Permalink
Printing essential print statements, hiding ffmpeg prints, updated i …
Browse files Browse the repository at this point in the history
…profile
  • Loading branch information
KillianLucas committed Mar 14, 2024
1 parent e61687d commit 3d225fb
Show file tree
Hide file tree
Showing 9 changed files with 378 additions and 132 deletions.
8 changes: 4 additions & 4 deletions 01OS/01OS/clients/base_device.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def record_audio(self):

"""Record audio from the microphone and add it to the queue."""
stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
logger.info("Recording started...")
print("Recording started...")
global RECORDING

# Create a temporary WAV file to store the audio data
Expand All @@ -171,7 +171,7 @@ def record_audio(self):
wav_file.close()
stream.stop_stream()
stream.close()
logger.info("Recording stopped.")
print("Recording stopped.")

duration = wav_file.getnframes() / RATE
if duration < 0.3:
Expand Down Expand Up @@ -255,9 +255,9 @@ async def websocket_communication(self, WS_URL):
try:
async with websockets.connect(WS_URL) as websocket:
if CAMERA_ENABLED:
logger.info("Press the spacebar to start/stop recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit.")
print("Press the spacebar to start/stop recording. Press 'c' to capture an image from the camera. Press CTRL-C to exit.")
else:
logger.info("Press the spacebar to start/stop recording. Press CTRL-C to exit.")
print("Press the spacebar to start/stop recording. Press CTRL-C to exit.")

asyncio.create_task(self.message_sender(websocket))

Expand Down
330 changes: 306 additions & 24 deletions 01OS/01OS/server/i.py

Large diffs are not rendered by default.

153 changes: 59 additions & 94 deletions 01OS/01OS/server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,11 +220,15 @@ async def listener():
time.sleep(15)

text = stt(audio_file_path)
print(text)
print("> ", text)
message = {"role": "user", "type": "message", "content": text}

# At this point, we have only text messages

if type(message["content"]) != str:
print("This should be a string, but it's not:", message["content"])
message["content"] = message["content"].decode()

# Custom stop message will halt us
if message["content"].lower().strip(".,! ") == "stop":
continue
Expand All @@ -238,116 +242,77 @@ async def listener():

accumulated_text = ""

force_task_completion_message = """AUTOMATED MESSAGE: Proceed. You CAN run code on my machine. If you want to run code, start your message with "```"! If the entire task I asked for is done, say exactly 'The task is done.' If you need some specific information (like username or password) say EXACTLY 'Please provide more information.' If it's impossible, say 'The task is impossible.' (If I haven't provided a task, say exactly 'Let me know what you'd like to do next.') Otherwise keep going."""
interpreter.messages = [m for m in interpreter.messages if m["content"] != force_task_completion_message]
insert_force_task_completion_message = True

if any([m["type"] == "image" for m in messages]) and interpreter.llm.model.startswith("gpt-"):
interpreter.llm.model = "gpt-4-vision-preview"
interpreter.llm.supports_vision = True

for chunk in interpreter.chat(messages, stream=True, display=True):

while insert_force_task_completion_message == True:

for chunk in interpreter.chat(messages, stream=True, display=True):

if chunk["type"] == "code":
insert_force_task_completion_message = False

if any([m["type"] == "image" for m in interpreter.messages]):
interpreter.llm.model = "gpt-4-vision-preview"
if any([m["type"] == "image" for m in interpreter.messages]):
interpreter.llm.model = "gpt-4-vision-preview"

logger.debug("Got chunk:", chunk)
logger.debug("Got chunk:", chunk)

# Send it to the user
await to_device.put(chunk)
# Yield to the event loop, so you actually send it out
await asyncio.sleep(0.01)

if os.getenv('TTS_RUNNER') == "server":
# Speak full sentences out loud
if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
accumulated_text += chunk["content"]
sentences = split_into_sentences(accumulated_text)

# If we're going to speak, say we're going to stop sending text.
# This should be fixed probably, we should be able to do both in parallel, or only one.
if any(is_full_sentence(sentence) for sentence in sentences):
await to_device.put({"role": "assistant", "type": "message", "end": True})

if is_full_sentence(sentences[-1]):
for sentence in sentences:
await stream_tts_to_device(sentence)
accumulated_text = ""
else:
for sentence in sentences[:-1]:
await stream_tts_to_device(sentence)
accumulated_text = sentences[-1]

# If we're going to speak, say we're going to stop sending text.
# This should be fixed probably, we should be able to do both in parallel, or only one.
if any(is_full_sentence(sentence) for sentence in sentences):
await to_device.put({"role": "assistant", "type": "message", "start": True})
# Send it to the user
await to_device.put(chunk)
# Yield to the event loop, so you actually send it out
await asyncio.sleep(0.01)

if os.getenv('TTS_RUNNER') == "server":
# Speak full sentences out loud
if chunk["role"] == "assistant" and "content" in chunk and chunk["type"] == "message":
accumulated_text += chunk["content"]
sentences = split_into_sentences(accumulated_text)

# If we have a new message, save our progress and go back to the top
if not from_user.empty():

# Check if it's just an end flag. We ignore those.
temp_message = await from_user.get()
# If we're going to speak, say we're going to stop sending text.
# This should be fixed probably, we should be able to do both in parallel, or only one.
if any(is_full_sentence(sentence) for sentence in sentences):
await to_device.put({"role": "assistant", "type": "message", "end": True})

if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
# Yup. False alarm.
continue
if is_full_sentence(sentences[-1]):
for sentence in sentences:
await stream_tts_to_device(sentence)
accumulated_text = ""
else:
# Whoops! Put that back
await from_user.put(temp_message)
for sentence in sentences[:-1]:
await stream_tts_to_device(sentence)
accumulated_text = sentences[-1]

# If we're going to speak, say we're going to stop sending text.
# This should be fixed probably, we should be able to do both in parallel, or only one.
if any(is_full_sentence(sentence) for sentence in sentences):
await to_device.put({"role": "assistant", "type": "message", "start": True})

# If we have a new message, save our progress and go back to the top
if not from_user.empty():

with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)
# Check if it's just an end flag. We ignore those.
temp_message = await from_user.get()

if type(temp_message) is dict and temp_message.get("role") == "user" and temp_message.get("end"):
# Yup. False alarm.
continue
else:
# Whoops! Put that back
await from_user.put(temp_message)

# TODO: is triggering seemingly randomly
#logger.info("New user message received. Breaking.")
#break
with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)

# Also check if there's any new computer messages
if not from_computer.empty():

with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)
# TODO: is triggering seemingly randomly
#logger.info("New user message received. Breaking.")
#break

logger.info("New computer message received. Breaking.")
break
else:
# Also check if there's any new computer messages
if not from_computer.empty():
with open(conversation_history_path, 'w') as file:
json.dump(interpreter.messages, file, indent=4)

force_task_completion_responses = [
"the task is done.",
"the task is impossible.",
"let me know what you'd like to do next.",
"please provide more information.",
]

# Did the LLM respond with one of the key messages?
if (
interpreter.messages
and any(
task_status in interpreter.messages[-1].get("content", "").lower()
for task_status in force_task_completion_responses
)
):
insert_force_task_completion_message = False
break

if insert_force_task_completion_message:
interpreter.messages += [
{
"role": "user",
"type": "message",
"content": force_task_completion_message,
}
]
else:
break
logger.info("New computer message received. Breaking.")
break


async def stream_tts_to_device(sentence):
force_task_completion_responses = [
Expand Down
4 changes: 2 additions & 2 deletions 01OS/01OS/server/services/stt/local-whisper/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,9 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
f='s16le',
ar='16000',
ac=1,
).output(output_path).run()
).output(output_path, loglevel='panic').run()
else:
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k', loglevel='panic').run()

try:
yield output_path
Expand Down
5 changes: 2 additions & 3 deletions 01OS/01OS/server/services/stt/openai/stt.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,15 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:

# Export to wav
output_path = os.path.join(temp_dir, f"output_{datetime.now().strftime('%Y%m%d%H%M%S%f')}.wav")
print(mime_type, input_path, output_path)
if mime_type == "audio/raw":
ffmpeg.input(
input_path,
f='s16le',
ar='16000',
ac=1,
).output(output_path).run()
).output(output_path, loglevel='panic').run()
else:
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k', loglevel='panic').run()

try:
yield output_path
Expand Down
2 changes: 1 addition & 1 deletion 01OS/01OS/server/services/tts/openai/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def tts(self, text):

# TODO: hack to format audio correctly for device
outfile = tempfile.gettempdir() + "/" + "raw.dat"
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run()
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1", loglevel='panic').run()

return outfile

Expand Down
2 changes: 1 addition & 1 deletion 01OS/01OS/server/services/tts/piper/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def tts(self, text):

# TODO: hack to format audio correctly for device
outfile = tempfile.gettempdir() + "/" + "raw.dat"
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1").run()
ffmpeg.input(temp_file.name).output(outfile, f="s16le", ar="16000", ac="1", loglevel='panic').run()

return outfile

Expand Down
4 changes: 2 additions & 2 deletions 01OS/01OS/server/utils/bytes_to_wav.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def export_audio_to_wav_ffmpeg(audio: bytearray, mime_type: str) -> str:
f='s16le',
ar='16000',
ac=1,
).output(output_path).run()
).output(output_path, loglevel='panic').run()
else:
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k').run()
ffmpeg.input(input_path).output(output_path, acodec='pcm_s16le', ac=1, ar='16k', loglevel='panic').run()

try:
yield output_path
Expand Down
2 changes: 1 addition & 1 deletion 01OS/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pydub = "^0.25.1"
ngrok = "^1.0.0"
simpleaudio = "^1.0.4"
opencv-python = "^4.9.0.80"
open-interpreter = {version = "0.2.1rc2", extras = ["os"]}
open-interpreter = { git = "https://github.com/KillianLucas/open-interpreter.git", branch = "development" }
psutil = "^5.9.8"
typer = "^0.9.0"
platformdirs = "^4.2.0"
Expand Down

0 comments on commit 3d225fb

Please sign in to comment.