diff --git a/app.py b/app.py
index 0346bfe..8d6b2bc 100644
--- a/app.py
+++ b/app.py
@@ -345,10 +345,11 @@
# if __name__ == "__main__":
# main()
+import streamlit as st
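+# set_page_config must run before any other Streamlit command, hence the early import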
+st.set_page_config(layout="wide")
from dotenv import load_dotenv
import os
import numpy as np
-import streamlit as st
import librosa
import io
import openai
@@ -358,7 +359,7 @@
from load_model import load_model
from transcribe_audio import transcribe_audio
from extract_entities import extract_entities
-from translate_text import translate_text
+from translate_text import translate_text  # OpenAI-backed helper that translates text to English
# Load environment variables
load_dotenv()
@@ -366,50 +367,72 @@
# Set OpenAI API key
openai.api_key = openai_api_key
+# Initialize session state so transcription results persist across st.rerun() calls
+if "transcription_text" not in st.session_state:
+ st.session_state.transcription_text = ""
+if "summary" not in st.session_state:
+ st.session_state.summary = ""
+if "detailed_transcription" not in st.session_state:
+ st.session_state.detailed_transcription = ""
+if "show_detailed" not in st.session_state:
+ st.session_state.show_detailed = False
+
# Main function to run the Streamlit app
def main():
- st.title("Speech to Text App")
+ st.markdown("
Speech to Text App
", unsafe_allow_html=True)
# Load the Whisper model
model = load_model(model_id, model_path)
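+    # model_id and model_path are assumed to be defined earlier in app.py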
- # Language selection dropdown
- #languages = ['hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'pa']
- #language = st.selectbox("Select the language of the audio file", languages, index=0)
-
# File uploader for audio files
st.write("Upload an audio file:")
- audio_file = st.file_uploader("Select an audio",type=["mp3", "wav"])
+ audio_file = st.file_uploader("Select an audio", type=["mp3", "wav"])
audio_data = None
if audio_file:
# Process uploaded audio file
- st.write("We are extracting these entities:\n- Name:\n- Phone Numbers:\n- Addresses:\n- Email:\n- PIN Code:\n- Occupation:\n- Gender:")
audio_bytes = audio_file.read()
st.audio(audio_bytes)
audio_file = io.BytesIO(audio_bytes)
try:
- audio_data, _ = librosa.load(audio_file, sr=16000)
+        audio_data, _ = librosa.load(audio_file, sr=16000)  # Whisper models expect 16 kHz audio
except Exception as e:
st.error(f"Error loading audio file: {e}")
# Perform transcription and other tasks on button click
if audio_data is not None and st.button("Transcribe"):
with st.spinner("Transcribing audio..."):
- transcription_text = transcribe_audio(model, audio_data)
- st.write(transcription_text)
-
- with st.spinner("Extracting entities..."):
- entities = extract_entities(transcription_text)
- st.write("Extracted Entities:")
- st.write(entities)
-
- with st.spinner("Translating to English..."):
- translated_text = translate_text(transcription_text)
- st.write("Translated Text:")
- st.write(translated_text)
+ try:
+ st.session_state.transcription_text = transcribe_audio(model, audio_data)
+ with st.spinner("Summarizing..."):
+ summary, detailed_transcription = extract_entities(st.session_state.transcription_text)
+ st.session_state.summary = summary
+ st.session_state.detailed_transcription = detailed_transcription
+ st.session_state.show_detailed = False
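+                # Rerun so the freshly stored results render in the sections below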
+ st.rerun()
+ except Exception as e:
+ st.error(f"Error during transcription or entity extraction: {e}")
+
+    # Display the summary
+ if st.session_state.summary:
+ st.write("**Summary:**")
+ translated_summary = translate_text(st.session_state.summary)
+        st.markdown(translated_summary.replace("\n", "  \n"))  # two trailing spaces force a Markdown line break
+
+    # Button to reveal the detailed transcription
+ if st.session_state.summary and st.button("View detailed transcription"):
+ st.session_state.show_detailed = True
+ st.rerun()
+
+    # Display the detailed transcription
+ if st.session_state.show_detailed:
+ st.write("Detailed view:")
+ st.write("**Original language:**")
+        st.markdown(st.session_state.detailed_transcription.replace("\n", "  \n"))
+ st.write("**Translated to English:**")
+ translated_detailed = translate_text(st.session_state.detailed_transcription)
+        st.markdown(translated_detailed.replace("\n", "  \n"))
-# Entry point of the script
if __name__ == "__main__":
main()
diff --git a/download_whisper.py b/download_whisper.py
index 888246b..82e1c65 100644
--- a/download_whisper.py
+++ b/download_whisper.py
@@ -3,7 +3,6 @@
model_path = "whisper_model"
model_id = 'large-v3'
-s
os.makedirs(model_path, exist_ok=True)
# Download model
diff --git a/extract_entities.py b/extract_entities.py
index f7b699c..85224c6 100644
--- a/extract_entities.py
+++ b/extract_entities.py
@@ -6,27 +6,79 @@
def extract_entities(text):
prompt = f"""
The following entities are present in Indian Languages.
- Please extract the following entities from the text.
- Provide entities for both in English and the original language of the audio in well-structured format:
+ Please extract the following entities from the text:
+ Name, pin code, phone number, gender, occupation, and address.
+
+        Provide a summary of the text in exactly the format below:
+ Name is ......., pin code is ........, phone number is ........, gender is ........, occupation is ........, Address is ............ .
+
Text: "{text}"
+
+ Summary:
+
+ Detailed view:
+
+ Original language: {text}
- - Name:
- - Phone Numbers:
- - Addresses:
- - Email:
- - PIN Code:
- - Occupation:
- - Gender:
"""
- response = openai.chat.completions.create(
- model="gpt-3.5-turbo",
- messages=[
- {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
- {"role": "user", "content": prompt}
- ],
- max_tokens=200
- )
- entities_text = response.choices[0].message.content
-
- return entities_text
+
+ try:
+ response = openai.chat.completions.create(
+ model="gpt-4o",
+ messages=[
+ {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
+ {"role": "user", "content": prompt}
+ ],
+ max_tokens=500
+ )
+ response_text = response.choices[0].message.content
+ except Exception as e:
+ return f"Error during OpenAI API call: {e}", "Detailed view not available."
+
+ # Process the response to extract summary and detailed transcription
+ if "Detailed view:" in response_text:
+ parts = response_text.split("Detailed view:")
+ summary_part = parts[0].strip()
+ detailed_transcription_part = parts[1].strip()
+ else:
+ summary_part = response_text.strip()
+ detailed_transcription_part = "Detailed view not provided."
+
+ # Format the summary and detailed transcription
+ formatted_summary = format_summary(summary_part)
+ formatted_detailed_transcription = format_detailed_transcription(detailed_transcription_part)
+
+ return formatted_summary, formatted_detailed_transcription
+
+def format_summary(summary):
+    # Keep only the lines that follow the "Summary:" header
+ lines = summary.split('\n')
+ summary_lines = []
+ is_summary_section = False
+
+ for line in lines:
+ line = line.strip()
+ if line.startswith("Summary:"):
+ is_summary_section = True
+ continue
+ if is_summary_section:
+ summary_lines.append(line)
+
+ formatted_summary = ' '.join(summary_lines)
+ return formatted_summary
+
+def format_detailed_transcription(detailed_transcription):
+    # Drop blank lines and trim surrounding whitespace from the detailed transcription
+ lines = detailed_transcription.split('\n')
+ detailed_lines = [line.strip() for line in lines if line.strip()]
+ formatted_detailed_transcription = '\n'.join(detailed_lines)
+ return formatted_detailed_transcription