-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
135 lines (109 loc) · 4.11 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from fastapi import FastAPI, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from openai import OpenAI
import uvicorn
from pathlib import Path
import tempfile
from typing import Optional, List, Dict
from dotenv import load_dotenv
import os
import base64
# Load environment variables
load_dotenv()
app = FastAPI()
# Enable CORS for local development
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Initialize OpenAI client
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
# Simple in-memory chat history
chat_history: List[Dict[str, str]] = []
def transcribe_audio(audio_path: str) -> Optional[str]:
"""Convert speech to text using OpenAI's Whisper"""
try:
with open(audio_path, "rb") as audio:
transcript = client.audio.transcriptions.create(
model="whisper-1",
file=audio,
response_format="text"
)
return transcript
except Exception as e:
print(f"Transcription error: {e}")
return None
def process_with_llm(text: str) -> Optional[str]:
"""Process text with LLM using conversation history"""
try:
# Build messages with history
messages = [
{"role": "system", "content": "You are a helpful voice assistant. Keep responses concise and natural."}
] + chat_history + [{"role": "user", "content": text}]
# Get response from GPT
chat_completion = client.chat.completions.create(
model="gpt-3.5-turbo-1106",
messages=messages
)
response = chat_completion.choices[0].message.content
# Update history
chat_history.append({"role": "user", "content": text})
chat_history.append({"role": "assistant", "content": response})
# Keep only last 10 exchanges (20 messages) to manage context length
while len(chat_history) > 20:
chat_history.pop(0)
return response
except Exception as e:
print(f"LLM processing error: {e}")
return None
def convert_to_speech(text: str) -> Optional[bytes]:
"""Convert text to speech using OpenAI's TTS"""
try:
speech_response = client.audio.speech.create(
model="tts-1",
voice="alloy",
input=text
)
return speech_response.content
except Exception as e:
print(f"Text-to-speech error: {e}")
return None
@app.post("/process-voice")
async def process_voice(audio_file: UploadFile = File(...)):
try:
# Save uploaded audio file temporarily
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
content = await audio_file.read()
temp_audio.write(content)
temp_audio_path = temp_audio.name
# Step 1: Speech to Text
transcript = transcribe_audio(temp_audio_path)
if not transcript:
return JSONResponse(content={"error": "Transcription failed"}, status_code=500)
# Step 2: Process with LLM
response_text = process_with_llm(transcript)
if not response_text:
return JSONResponse(content={"error": "LLM processing failed"}, status_code=500)
# Step 3: Text to Speech
audio_content = convert_to_speech(response_text)
if not audio_content:
return JSONResponse(content={"error": "Text-to-speech conversion failed"}, status_code=500)
# Cleanup temporary audio file
Path(temp_audio_path).unlink()
# Encode audio content as base64
audio_base64 = base64.b64encode(audio_content).decode('utf-8')
return JSONResponse(content={
"transcript": transcript,
"response_text": response_text,
"audio": audio_base64,
"history": chat_history
})
except Exception as e:
print(f"Error in process_voice: {e}")
return JSONResponse(content={"error": str(e)}, status_code=500)
if __name__ == "__main__":
uvicorn.run(app, host="0.0.0.0", port=8000)