From b888db4a9af43a648f0d772c2c3dce429fb72cfa Mon Sep 17 00:00:00 2001
From: Nikita Krasnytskyi
Date: Mon, 16 Sep 2024 16:16:48 +0100
Subject: [PATCH] Updated Guide: Real Time Speech Recognition (#9349)

* Update real-time-speech-recognition.md

added necessary dependency

* Update run.py

updated code to handle cases with stereo microphone

* Update real-time-speech-recognition.md

improved english

* Update run.py

updated code for streaming

* Update run.py

---
 demo/asr/run.ipynb                                        | 2 +-
 demo/asr/run.py                                           | 7 ++++++-
 demo/stream_asr/run.ipynb                                 | 2 +-
 demo/stream_asr/run.py                                    | 5 +++++
 guides/09_other-tutorials/real-time-speech-recognition.md | 8 +++-----
 5 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/demo/asr/run.ipynb b/demo/asr/run.ipynb
index e8931ecaaadfc..40220c2b225cb 100644
--- a/demo/asr/run.ipynb
+++ b/demo/asr/run.ipynb
@@ -1 +1 @@
-{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(audio):\n", "    sr, y = audio\n", "    y = y.astype(np.float32)\n", "    y /= np.max(np.abs(y))\n", "\n", "    return transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"]  # type: ignore\n", "\n", "demo = gr.Interface(\n", "    transcribe,\n", "    gr.Audio(sources=[\"microphone\"]),\n", "    \"text\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
+{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(audio):\n", "    sr, y = audio\n", "    \n", "    # Convert to mono if stereo\n", "    if y.ndim > 1:\n", "        y = y.mean(axis=1)\n", "    \n", "    y = y.astype(np.float32)\n", "    y /= np.max(np.abs(y))\n", "\n", "    return transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"]  # type: ignore\n", "\n", "demo = gr.Interface(\n", "    transcribe,\n", "    gr.Audio(sources=\"microphone\"),\n", "    \"text\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
diff --git a/demo/asr/run.py b/demo/asr/run.py
index 0d4aa4e9a05f1..9ae26ba77b219 100644
--- a/demo/asr/run.py
+++ b/demo/asr/run.py
@@ -6,6 +6,11 @@
 
 def transcribe(audio):
     sr, y = audio
+
+    # Convert to mono if stereo
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+
     y = y.astype(np.float32)
     y /= np.max(np.abs(y))
 
@@ -13,7 +18,7 @@ def transcribe(audio):
 
 demo = gr.Interface(
     transcribe,
-    gr.Audio(sources=["microphone"]),
+    gr.Audio(sources="microphone"),
     "text",
 )
 
diff --git a/demo/stream_asr/run.ipynb b/demo/stream_asr/run.ipynb
index 1473aaca86b04..f2d007e6eaacc 100644
--- a/demo/stream_asr/run.ipynb
+++ b/demo/stream_asr/run.ipynb
@@ -1 +1 @@
-{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: stream_asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(stream, new_chunk):\n", "    sr, y = new_chunk\n", "    y = y.astype(np.float32)\n", "    y /= np.max(np.abs(y))\n", "\n", "    if stream is not None:\n", "        stream = np.concatenate([stream, y])\n", "    else:\n", "        stream = y\n", "    return stream, transcriber({\"sampling_rate\": sr, \"raw\": stream})[\"text\"]  # type: ignore\n", "\n", "demo = gr.Interface(\n", "    transcribe,\n", "    [\"state\", gr.Audio(sources=[\"microphone\"], streaming=True)],\n", "    [\"state\", \"text\"],\n", "    live=True,\n", ")\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
+{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: stream_asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(stream, new_chunk):\n", "    sr, y = new_chunk\n", "    \n", "    # Convert to mono if stereo\n", "    if y.ndim > 1:\n", "        y = y.mean(axis=1)\n", "    \n", "    y = y.astype(np.float32)\n", "    y /= np.max(np.abs(y))\n", "\n", "    if stream is not None:\n", "        stream = np.concatenate([stream, y])\n", "    else:\n", "        stream = y\n", "    return stream, transcriber({\"sampling_rate\": sr, \"raw\": stream})[\"text\"]  # type: ignore\n", "\n", "demo = gr.Interface(\n", "    transcribe,\n", "    [\"state\", gr.Audio(sources=[\"microphone\"], streaming=True)],\n", "    [\"state\", \"text\"],\n", "    live=True,\n", ")\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
\ No newline at end of file
diff --git a/demo/stream_asr/run.py b/demo/stream_asr/run.py
index 6d617f49d6612..bed102fadd684 100644
--- a/demo/stream_asr/run.py
+++ b/demo/stream_asr/run.py
@@ -6,6 +6,11 @@
 
 def transcribe(stream, new_chunk):
     sr, y = new_chunk
+
+    # Convert to mono if stereo
+    if y.ndim > 1:
+        y = y.mean(axis=1)
+
     y = y.astype(np.float32)
     y /= np.max(np.abs(y))
 
diff --git a/guides/09_other-tutorials/real-time-speech-recognition.md b/guides/09_other-tutorials/real-time-speech-recognition.md
index c84cfbb23f325..d0b2816344297 100644
--- a/guides/09_other-tutorials/real-time-speech-recognition.md
+++ b/guides/09_other-tutorials/real-time-speech-recognition.md
@@ -14,7 +14,7 @@ This tutorial will show how to take a pretrained speech-to-text model and deploy
 
 Make sure you have the `gradio` Python package already [installed](/getting_started). You will also need a pretrained speech recognition model. In this tutorial, we will build demos from 2 ASR libraries:
 
-- Transformers (for this, `pip install transformers` and `pip install torch`)
+- Transformers (for this, `pip install torch transformers torchaudio`)
 
 Make sure you have at least one of these installed so that you can follow along the tutorial. You will also need `ffmpeg` [installed on your system](https://www.ffmpeg.org/download.html), if you do not already have it, to process files from the microphone.
 
@@ -61,10 +61,8 @@ Take a look below.
 
 $code_stream_asr
 
-Notice now we have a state variable now, because we need to track all the audio history. `transcribe` gets called whenever there is a new small chunk of audio, but we also need to keep track of all the audio that has been spoken so far in state.
-As the interface runs, the `transcribe` function gets called, with a record of all the previously spoken audio in `stream`, as well as the new chunk of audio as `new_chunk`. We return the new full audio so that can be stored back in state, and we also return the transcription.
-Here we naively append the audio together and simply call the `transcriber` object on the entire audio. You can imagine more efficient ways of handling this, such as re-processing only the last 5 seconds of audio whenever a new chunk of audio received.
+Notice that we now have a state variable because we need to track all the audio history. `transcribe` gets called whenever there is a new small chunk of audio, but we also need to keep track of all the audio spoken so far in the state. As the interface runs, the `transcribe` function gets called, with a record of all the previously spoken audio in the `stream` and the new chunk of audio as `new_chunk`. We return the new full audio to be stored back in its current state, and we also return the transcription. Here, we naively append the audio together and call the `transcriber` object on the entire audio. You can imagine more efficient ways of handling this, such as re-processing only the last 5 seconds of audio whenever a new chunk of audio is received.
 
 $demo_stream_asr
 
-Now the ASR model will run inference as you speak!
\ No newline at end of file
+Now the ASR model will run inference as you speak!
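The rewritten guide paragraph in this patch suggests re-processing only the last 5 seconds of audio instead of the full history. A minimal sketch of that idea follows; it is not part of the patch, reuses the same `pipeline` call and streaming `Interface` as the `stream_asr` demo, treats `WINDOW_SECONDS` as an assumed tuning knob, and leaves out stitching per-window transcripts into one running transcript.

```python
import gradio as gr
from transformers import pipeline
import numpy as np

transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

WINDOW_SECONDS = 5  # assumed window length; tune for the latency/accuracy trade-off

def transcribe(stream, new_chunk):
    sr, y = new_chunk

    # Convert to mono if stereo (same handling the patch adds to the demos)
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    # Keep the full history in state, but only re-transcribe the most recent window
    stream = y if stream is None else np.concatenate([stream, y])
    window = stream[-int(WINDOW_SECONDS * sr):]

    return stream, transcriber({"sampling_rate": sr, "raw": window})["text"]

demo = gr.Interface(
    transcribe,
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    ["state", "text"],
    live=True,
)

if __name__ == "__main__":
    demo.launch()
```

With this sketch the displayed text covers only the latest window, so a real application would append each window's result to a running transcript rather than replacing it.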