From dfe9ebd8be56c263f20a09dac1d55720af7ab8c1 Mon Sep 17 00:00:00 2001
From: ayushichaurasia
Date: Fri, 2 Aug 2024 15:33:27 +0530
Subject: [PATCH] Summary and detailed transcription using Librosa with Issue #6

---
 app.py              | 69 ++++++++++++++++++++++------------
 download_whisper.py |  1 -
 extract_entities.py | 92 +++++++++++++++++++++++++++++++++++----------
 translate_text.py   |  2 +-
 4 files changed, 119 insertions(+), 45 deletions(-)

diff --git a/app.py b/app.py
index 0346bfe..8d6b2bc 100644
--- a/app.py
+++ b/app.py
@@ -345,10 +345,11 @@
 # if __name__ == "__main__":
 #     main()
 
+import streamlit as st
+st.set_page_config(layout="wide")
 from dotenv import load_dotenv
 import os
 import numpy as np
-import streamlit as st
 import librosa
 import io
 import openai
@@ -358,7 +359,7 @@
 from load_model import load_model
 from transcribe_audio import transcribe_audio
 from extract_entities import extract_entities
-from translate_text import translate_text
+from translate_text import translate_text # Assuming this is where you translate text
 
 # Load environment variables
 load_dotenv()
@@ -366,50 +367,72 @@
 # Set OpenAI API key
 openai.api_key = openai_api_key
 
+# Initialize session state variables
+if "transcription_text" not in st.session_state:
+    st.session_state.transcription_text = ""
+if "summary" not in st.session_state:
+    st.session_state.summary = ""
+if "detailed_transcription" not in st.session_state:
+    st.session_state.detailed_transcription = ""
+if "show_detailed" not in st.session_state:
+    st.session_state.show_detailed = False
+
 # Main function to run the Streamlit app
 def main():
-    st.title("Speech to Text App")
+    st.markdown("<h1 style='text-align: center;'>Speech to Text App</h1>", unsafe_allow_html=True)
 
     # Load the Whisper model
     model = load_model(model_id, model_path)
 
-    # Language selection dropdown
-    #languages = ['hi', 'bn', 'te', 'mr', 'ta', 'ur', 'gu', 'kn', 'ml', 'pa']
-    #language = st.selectbox("Select the language of the audio file", languages, index=0)
-
     # File uploader for audio files
     st.write("Upload an audio file:")
-    audio_file = st.file_uploader("Select an audio",type=["mp3", "wav"])
+    audio_file = st.file_uploader("Select an audio", type=["mp3", "wav"])
 
     audio_data = None
     if audio_file:
         # Process uploaded audio file
-        st.write("We are extracting these entities:\n- Name:\n- Phone Numbers:\n- Addresses:\n- Email:\n- PIN Code:\n- Occupation:\n- Gender:")
         audio_bytes = audio_file.read()
         st.audio(audio_bytes)
         audio_file = io.BytesIO(audio_bytes)
         try:
             audio_data, _ = librosa.load(audio_file, sr=16000)
         except Exception as e:
             st.error(f"Error loading audio file: {e}")
 
     # Perform transcription and other tasks on button click
     if audio_data is not None and st.button("Transcribe"):
         with st.spinner("Transcribing audio..."):
-            transcription_text = transcribe_audio(model, audio_data)
-            st.write(transcription_text)
-
-        with st.spinner("Extracting entities..."):
-            entities = extract_entities(transcription_text)
-            st.write("Extracted Entities:")
-            st.write(entities)
-
-        with st.spinner("Translating to English..."):
-            translated_text = translate_text(transcription_text)
-            st.write("Translated Text:")
-            st.write(translated_text)
+            try:
+                st.session_state.transcription_text = transcribe_audio(model, audio_data)
+                with st.spinner("Summarizing..."):
+                    summary, detailed_transcription = extract_entities(st.session_state.transcription_text)
+                    st.session_state.summary = summary
+                    st.session_state.detailed_transcription = detailed_transcription
+                    st.session_state.show_detailed = False
+                    st.rerun()
+            except Exception as e:
+                st.error(f"Error during transcription or entity extraction: {e}")
+
+    # Display the summary
+    if st.session_state.summary:
+        st.write("**Summary:**")
+        translated_summary = translate_text(st.session_state.summary)
+        st.markdown(translated_summary.replace("\n", "  \n"))
+
+    # Button to reveal the detailed transcription
+    if st.session_state.summary and st.button("View detailed transcription"):
+        st.session_state.show_detailed = True
+        st.rerun()
+
+    # Display the detailed transcription
+    if st.session_state.show_detailed:
+        st.write("Detailed view:")
+        st.write("**Original language:**")
+        st.markdown(st.session_state.detailed_transcription.replace("\n", "  \n"))
+        st.write("**Translated to English:**")
+        translated_detailed = translate_text(st.session_state.detailed_transcription)
+        st.markdown(translated_detailed.replace("\n", "  \n"))
 
 
-# Entry point of the script
 if __name__ == "__main__":
     main()
diff --git a/download_whisper.py b/download_whisper.py
index 888246b..82e1c65 100644
--- a/download_whisper.py
+++ b/download_whisper.py
@@ -3,7 +3,6 @@
 
 model_path = "whisper_model"
 model_id = 'large-v3'
-s
 os.makedirs(model_path, exist_ok=True)
 
 # Download model
diff --git a/extract_entities.py b/extract_entities.py
index f7b699c..85224c6 100644
--- a/extract_entities.py
+++ b/extract_entities.py
@@ -6,27 +6,79 @@
 def extract_entities(text):
     prompt = f"""
     The following entities are present in Indian Languages.
-    Please extract the following entities from the text. 
-    Provide entities for both in English and the original language of the audio in well-structured format: 
+    Please extract the following entities from the text:
+    Name, pin code, phone number, gender, occupation, and address.
+
+    Provide the summary of the text in exactly the format below:
+    Name is ......., pin code is ........, phone number is ........, gender is ........, occupation is ........, Address is ............ .
+
+    Text: "{text}"
+
+    Summary:
+
+
+    Detailed view:
+
+    Original language: {text}
 
     Text: "{text}"
+
+    Summary:
+
+    Detailed view:
+
+    Original language: {text}
 
-    - Name: 
-    - Phone Numbers: 
-    - Addresses: 
-    - Email: 
-    - PIN Code: 
-    - Occupation: 
-    - Gender: 
     """
-    response = openai.chat.completions.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
-            {"role": "user", "content": prompt}
-        ],
-        max_tokens=200
-    )
-    entities_text = response.choices[0].message.content
-
-    return entities_text
+
+    try:
+        response = openai.chat.completions.create(
+            model="gpt-4o",
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant that extracts information from Indian multilingual text."},
+                {"role": "user", "content": prompt}
+            ],
+            max_tokens=500
+        )
+        response_text = response.choices[0].message.content
+    except Exception as e:
+        return f"Error during OpenAI API call: {e}", "Detailed view not available."
+
+    # Process the response to extract summary and detailed transcription
+    if "Detailed view:" in response_text:
+        parts = response_text.split("Detailed view:")
+        summary_part = parts[0].strip()
+        detailed_transcription_part = parts[1].strip()
+    else:
+        summary_part = response_text.strip()
+        detailed_transcription_part = "Detailed view not provided."
+
+    # Format the summary and detailed transcription
+    formatted_summary = format_summary(summary_part)
+    formatted_detailed_transcription = format_detailed_transcription(detailed_transcription_part)
+
+    return formatted_summary, formatted_detailed_transcription
+
+def format_summary(summary):
+    # Keep only the lines that follow the "Summary:" heading
+    lines = summary.split('\n')
+    summary_lines = []
+    is_summary_section = False
+
+    for line in lines:
+        line = line.strip()
+        if line.startswith("Summary:"):
+            is_summary_section = True
+            continue
+        if is_summary_section:
+            summary_lines.append(line)
+
+    formatted_summary = ' '.join(summary_lines)
+    return formatted_summary
+
+def format_detailed_transcription(detailed_transcription):
+    # Strip blank lines and surrounding whitespace from the detailed transcription
+    lines = detailed_transcription.split('\n')
+    detailed_lines = [line.strip() for line in lines if line.strip()]
+    formatted_detailed_transcription = '\n'.join(detailed_lines)
+    return formatted_detailed_transcription
diff --git a/translate_text.py b/translate_text.py
index 178cfa7..1335461 100644
--- a/translate_text.py
+++ b/translate_text.py
@@ -14,4 +14,4 @@ def translate_text(text):
         max_tokens=150
     )
     translated_text = response.choices[0].message.content
-    return translated_text
+    return translated_text
\ No newline at end of file
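
Usage sketch (supplementary, not part of the diff above): a minimal example of how the new two-value return of extract_entities() is meant to be consumed together with translate_text(), mirroring what app.py does after this change. It assumes OPENAI_API_KEY is configured, the repository's helper modules are importable, and the sample transcription string is purely hypothetical.

    from extract_entities import extract_entities
    from translate_text import translate_text

    # Hypothetical Hindi transcription, used only to show the call pattern.
    sample_transcription = "मेरा नाम राहुल है, मेरा पिन कोड 110001 है और मैं शिक्षक हूँ।"

    # extract_entities() now returns (summary, detailed_transcription) instead of a single string.
    summary, detailed = extract_entities(sample_transcription)

    # The app translates the summary before display; the detailed view keeps the original language.
    print("Summary (English):", translate_text(summary))
    print("Detailed view (original language):")
    print(detailed)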