From b888db4a9af43a648f0d772c2c3dce429fb72cfa Mon Sep 17 00:00:00 2001
From: Nikita Krasnytskyi
Date: Mon, 16 Sep 2024 16:16:48 +0100
Subject: [PATCH] Updated Guide: Real Time Speech Recognition (#9349)

* Update real-time-speech-recognition.md

added necessary dependency

* Update run.py

updated code to handle cases with stereo microphone

* Update real-time-speech-recognition.md

improved english

* Update run.py

updated code for streaming

* Update run.py

---
 demo/asr/run.ipynb                                        | 2 +-
 demo/asr/run.py                                           | 7 ++++++-
 demo/stream_asr/run.ipynb                                 | 2 +-
 demo/stream_asr/run.py                                    | 5 +++++
 guides/09_other-tutorials/real-time-speech-recognition.md | 8 +++-----
 5 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/demo/asr/run.ipynb b/demo/asr/run.ipynb
index e8931ecaaadfc..40220c2b225cb 100644
--- a/demo/asr/run.ipynb
+++ b/demo/asr/run.ipynb
@@ -1 +1 @@
-{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(audio):\n", "    sr, y = audio\n", "    y = y.astype(np.float32)\n", "    y /= np.max(np.abs(y))\n", "\n", "    return transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"]  # type: ignore\n", "\n", "demo = gr.Interface(\n", "    transcribe,\n", "    gr.Audio(sources=[\"microphone\"]),\n", "    \"text\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
+{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(audio):\n", "    sr, y = audio\n", "    \n", "    # Convert to mono if stereo\n", "    if y.ndim > 1:\n", "        y = y.mean(axis=1)\n", "    \n", "    y = y.astype(np.float32)\n", "    y /= np.max(np.abs(y))\n", "\n", "    return transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"]  # type: ignore\n", "\n", "demo = gr.Interface(\n", "    transcribe,\n", "    gr.Audio(sources=\"microphone\"),\n", "    \"text\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
diff --git a/demo/asr/run.py b/demo/asr/run.py
index 0d4aa4e9a05f1..9ae26ba77b219 100644
--- a/demo/asr/run.py
+++ b/demo/asr/run.py
@@ -6,6 +6,11 @@
 
 def transcribe(audio):
     sr, y = audio
+
+    # Convert to mono if stereo
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+
     y = y.astype(np.float32)
     y /= np.max(np.abs(y))
 
@@ -13,7 +18,7 @@ def transcribe(audio):
 
 demo = gr.Interface(
     transcribe,
-    gr.Audio(sources=["microphone"]),
+    gr.Audio(sources="microphone"),
     "text",
 )
 
diff --git a/demo/stream_asr/run.ipynb b/demo/stream_asr/run.ipynb
index 1473aaca86b04..f2d007e6eaacc 100644
--- a/demo/stream_asr/run.ipynb
+++ b/demo/stream_asr/run.ipynb
@@ -1 +1 @@
-{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: stream_asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(stream, new_chunk):\n", "    sr, y = new_chunk\n", "    y = y.astype(np.float32)\n", "    y /= np.max(np.abs(y))\n", "\n", "    if stream is not None:\n", "        stream = np.concatenate([stream, y])\n", "    else:\n", "        stream = y\n", "    return stream, transcriber({\"sampling_rate\": sr, \"raw\": stream})[\"text\"]  # type: ignore\n", "\n", "demo = gr.Interface(\n", "    transcribe,\n", "    [\"state\", gr.Audio(sources=[\"microphone\"], streaming=True)],\n", "    [\"state\", \"text\"],\n", "    live=True,\n", ")\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
+{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: stream_asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(stream, new_chunk):\n", "    sr, y = new_chunk\n", "    \n", "    # Convert to mono if stereo\n", "    if y.ndim > 1:\n", "        y = y.mean(axis=1)\n", "    \n", "    y = y.astype(np.float32)\n", "    y /= np.max(np.abs(y))\n", "\n", "    if stream is not None:\n", "        stream = np.concatenate([stream, y])\n", "    else:\n", "        stream = y\n", "    return stream, transcriber({\"sampling_rate\": sr, \"raw\": stream})[\"text\"]  # type: ignore\n", "\n", "demo = gr.Interface(\n", "    transcribe,\n", "    [\"state\", gr.Audio(sources=[\"microphone\"], streaming=True)],\n", "    [\"state\", \"text\"],\n", "    live=True,\n", ")\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
diff --git a/demo/stream_asr/run.py b/demo/stream_asr/run.py
index 6d617f49d6612..bed102fadd684 100644
--- a/demo/stream_asr/run.py
+++ b/demo/stream_asr/run.py
@@ -6,6 +6,11 @@
 
 def transcribe(stream, new_chunk):
     sr, y = new_chunk
+
+    # Convert to mono if stereo
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+
     y = y.astype(np.float32)
     y /= np.max(np.abs(y))
 
diff --git a/guides/09_other-tutorials/real-time-speech-recognition.md b/guides/09_other-tutorials/real-time-speech-recognition.md
index c84cfbb23f325..d0b2816344297 100644
--- a/guides/09_other-tutorials/real-time-speech-recognition.md
+++ b/guides/09_other-tutorials/real-time-speech-recognition.md
@@ -14,7 +14,7 @@ This tutorial will show how to take a pretrained speech-to-text model and deploy
 
 Make sure you have the `gradio` Python package already [installed](/getting_started). You will also need a pretrained speech recognition model. In this tutorial, we will build demos from 2 ASR libraries:
 
-- Transformers (for this, `pip install transformers` and `pip install torch`)
+- Transformers (for this, `pip install torch transformers torchaudio`)
 
 Make sure you have at least one of these installed so that you can follow along the tutorial. You will also need `ffmpeg` [installed on your system](https://www.ffmpeg.org/download.html), if you do not already have it, to process files from the microphone.
 
@@ -61,10 +61,8 @@ Take a look below.
 
 $code_stream_asr
 
-Notice now we have a state variable now, because we need to track all the audio history. `transcribe` gets called whenever there is a new small chunk of audio, but we also need to keep track of all the audio that has been spoken so far in state.
-As the interface runs, the `transcribe` function gets called, with a record of all the previously spoken audio in `stream`, as well as the new chunk of audio as `new_chunk`. We return the new full audio so that can be stored back in state, and we also return the transcription.
-Here we naively append the audio together and simply call the `transcriber` object on the entire audio. You can imagine more efficient ways of handling this, such as re-processing only the last 5 seconds of audio whenever a new chunk of audio received.
+Notice that we now have a state variable because we need to track all the audio history. `transcribe` gets called whenever there is a new small chunk of audio, but we also need to keep track of all the audio spoken so far in the state. As the interface runs, the `transcribe` function gets called, with a record of all the previously spoken audio in the `stream` and the new chunk of audio as `new_chunk`. We return the new full audio to be stored back in its current state, and we also return the transcription. Here, we naively append the audio together and call the `transcriber` object on the entire audio. You can imagine more efficient ways of handling this, such as re-processing only the last 5 seconds of audio whenever a new chunk of audio is received.
 
 $demo_stream_asr
 
-Now the ASR model will run inference as you speak!
\ No newline at end of file
+Now the ASR model will run inference as you speak!
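The rewritten guide paragraph in this patch suggests re-processing only the last 5 seconds of audio instead of the full history. A minimal sketch of that idea follows; it is not part of the patch, reuses the same `pipeline` call and streaming `Interface` as the `stream_asr` demo, treats `WINDOW_SECONDS` as an assumed tuning knob, and leaves out stitching per-window transcripts into one running transcript.

```python
import gradio as gr
from transformers import pipeline
import numpy as np

transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

WINDOW_SECONDS = 5  # assumed window length; tune for the latency/accuracy trade-off

def transcribe(stream, new_chunk):
    sr, y = new_chunk

    # Convert to mono if stereo (same handling the patch adds to the demos)
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    # Keep the full history in state, but only re-transcribe the most recent window
    stream = y if stream is None else np.concatenate([stream, y])
    window = stream[-int(WINDOW_SECONDS * sr):]

    return stream, transcriber({"sampling_rate": sr, "raw": window})["text"]

demo = gr.Interface(
    transcribe,
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    ["state", "text"],
    live=True,
)

if __name__ == "__main__":
    demo.launch()
```

With this sketch the displayed text covers only the latest window, so a real application would append each window's result to a running transcript rather than replacing it.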