Updated Guide: Real Time Speech Recognition (#9349)
* Update real-time-speech-recognition.md

added necessary dependency

* Update run.py

updated code to handle cases with stereo microphone

* Update real-time-speech-recognition.md

improved English

* Update run.py

updated code for streaming

* Update run.py
Nik-Kras authored Sep 16, 2024
1 parent 9f0fe39 commit b888db4
Showing 5 changed files with 16 additions and 8 deletions.
2 changes: 1 addition & 1 deletion demo/asr/run.ipynb
@@ -1 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(audio):\n", " sr, y = audio\n", " y = y.astype(np.float32)\n", " y /= np.max(np.abs(y))\n", "\n", " return transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"] # type: ignore\n", "\n", "demo = gr.Interface(\n", " transcribe,\n", " gr.Audio(sources=[\"microphone\"]),\n", " \"text\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(audio):\n", " sr, y = audio\n", " \n", " # Convert to mono if stereo\n", " if y.ndim > 1:\n", " y = y.mean(axis=1)\n", " \n", " y = y.astype(np.float32)\n", " y /= np.max(np.abs(y))\n", "\n", " return transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"] # type: ignore\n", "\n", "demo = gr.Interface(\n", " transcribe,\n", " gr.Audio(sources=\"microphone\"),\n", " \"text\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
7 changes: 6 additions & 1 deletion demo/asr/run.py
@@ -6,14 +6,19 @@

def transcribe(audio):
    sr, y = audio

    # Convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    return transcriber({"sampling_rate": sr, "raw": y})["text"]  # type: ignore

demo = gr.Interface(
    transcribe,
    gr.Audio(sources=["microphone"]),  # removed
    gr.Audio(sources="microphone"),    # added
    "text",
)
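For context, Gradio's microphone input typically delivers stereo audio as a 2-D array of shape (n_samples, 2), which is why the new branch checks y.ndim. A minimal sketch (not part of the commit) showing what mean(axis=1) does to such an array:

import numpy as np

# Hypothetical stereo chunk: 4 samples, 2 channels
stereo = np.array([[1, 3], [2, 4], [0, 2], [5, 7]], dtype=np.int16)

# Average the two channels to get a mono signal
mono = stereo.mean(axis=1) if stereo.ndim > 1 else stereo
print(mono)  # [2. 3. 1. 6.]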

2 changes: 1 addition & 1 deletion demo/stream_asr/run.ipynb
@@ -1 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: stream_asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(stream, new_chunk):\n", " sr, y = new_chunk\n", " y = y.astype(np.float32)\n", " y /= np.max(np.abs(y))\n", "\n", " if stream is not None:\n", " stream = np.concatenate([stream, y])\n", " else:\n", " stream = y\n", " return stream, transcriber({\"sampling_rate\": sr, \"raw\": stream})[\"text\"] # type: ignore\n", "\n", "demo = gr.Interface(\n", " transcribe,\n", " [\"state\", gr.Audio(sources=[\"microphone\"], streaming=True)],\n", " [\"state\", \"text\"],\n", " live=True,\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: stream_asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(stream, new_chunk):\n", " sr, y = new_chunk\n", " \n", " # Convert to mono if stereo\n", " if y.ndim > 1:\n", " y = y.mean(axis=1)\n", " \n", " y = y.astype(np.float32)\n", " y /= np.max(np.abs(y))\n", "\n", " if stream is not None:\n", " stream = np.concatenate([stream, y])\n", " else:\n", " stream = y\n", " return stream, transcriber({\"sampling_rate\": sr, \"raw\": stream})[\"text\"] # type: ignore\n", "\n", "demo = gr.Interface(\n", " transcribe,\n", " [\"state\", gr.Audio(sources=[\"microphone\"], streaming=True)],\n", " [\"state\", \"text\"],\n", " live=True,\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
5 changes: 5 additions & 0 deletions demo/stream_asr/run.py
@@ -6,6 +6,11 @@

def transcribe(stream, new_chunk):
    sr, y = new_chunk

    # Convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)
    y /= np.max(np.abs(y))
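One edge case the demo code leaves to the reader: for a perfectly silent chunk, np.max(np.abs(y)) is zero, so the in-place division yields NaNs and a runtime warning. A hedged sketch of a guard a caller might add (normalize is a hypothetical helper, not part of this commit):

import numpy as np

def normalize(y: np.ndarray) -> np.ndarray:
    # Scale to [-1, 1]; leave an all-zero (silent) chunk untouched
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    return y / peak if peak > 0 else y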

8 changes: 3 additions & 5 deletions guides/09_other-tutorials/real-time-speech-recognition.md
@@ -14,7 +14,7 @@ This tutorial will show how to take a pretrained speech-to-text model and deploy

Make sure you have the `gradio` Python package already [installed](/getting_started). You will also need a pretrained speech recognition model. In this tutorial, we will build demos from 2 ASR libraries:

- Transformers (for this, `pip install transformers` and `pip install torch`)
- Transformers (for this, `pip install torch transformers torchaudio`)

Make sure you have at least one of these installed so that you can follow along with the tutorial. You will also need `ffmpeg` [installed on your system](https://www.ffmpeg.org/download.html), if you do not already have it, to process files from the microphone.
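A quick sanity check (a sketch, not part of the guide) that the environment is ready; it assumes the pip packages above are installed and that ffmpeg is on the PATH:

import shutil
from transformers import pipeline

assert shutil.which("ffmpeg"), "ffmpeg not found on PATH"

# Downloads openai/whisper-base.en on first run
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
print(transcriber.model.config.model_type)  # "whisper"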

@@ -61,10 +61,8 @@ Take a look below.

$code_stream_asr

Notice now we have a state variable now, because we need to track all the audio history. `transcribe` gets called whenever there is a new small chunk of audio, but we also need to keep track of all the audio that has been spoken so far in state.
As the interface runs, the `transcribe` function gets called, with a record of all the previously spoken audio in `stream`, as well as the new chunk of audio as `new_chunk`. We return the new full audio so that can be stored back in state, and we also return the transcription.
Here we naively append the audio together and simply call the `transcriber` object on the entire audio. You can imagine more efficient ways of handling this, such as re-processing only the last 5 seconds of audio whenever a new chunk of audio received.
Notice that we now have a state variable because we need to track all the audio history. `transcribe` gets called whenever there is a new small chunk of audio, but we also need to keep track of all the audio spoken so far in the state. As the interface runs, the `transcribe` function gets called, with a record of all the previously spoken audio in the `stream` and the new chunk of audio as `new_chunk`. We return the new full audio to be stored back in its current state, and we also return the transcription. Here, we naively append the audio together and call the `transcriber` object on the entire audio. You can imagine more efficient ways of handling this, such as re-processing only the last 5 seconds of audio whenever a new chunk of audio is received.
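As a rough sketch of that last idea (not part of the demo; it assumes the same transcriber, numpy import, and Gradio wiring as above), you could keep the full history in state but send only the most recent few seconds to the model. Note this returns only the transcription of that window, so a complete solution would also stitch the text segments together:

MAX_SECONDS = 5

def transcribe(stream, new_chunk):
    sr, y = new_chunk
    if y.ndim > 1:                      # stereo -> mono
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    stream = y if stream is None else np.concatenate([stream, y])
    window = stream[-MAX_SECONDS * sr:]  # roughly the last 5 seconds
    return stream, transcriber({"sampling_rate": sr, "raw": window})["text"]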

$demo_stream_asr

Now the ASR model will run inference as you speak!
