[ADD] support for large audio/voice files
- Add Telethon to support MTProto (large files)
- restructure whisper call into async call
- split files using `ffmpeg`
maschlr committed Aug 30, 2024
1 parent 447d44d commit 2e9900f
Showing 8 changed files with 245 additions and 71 deletions.
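For orientation before the per-file diffs: the Bot API refuses `getFile` downloads above 20 MB, and OpenAI's Whisper endpoint rejects uploads above 25 MB. This commit works around both limits by fetching big files over MTProto (via Telethon) and splitting long audio at silent points with ffmpeg before transcribing. A condensed sketch of the resulting pipeline follows; the wiring and both stubs are illustrative, not code from this commit:

    import asyncio
    from pathlib import Path

    BOT_API_DOWNLOAD_CAP = 20 * 1024 * 1024  # Bot API getFile limit
    WHISPER_UPLOAD_CAP = 25 * 1024 * 1024    # Whisper API upload limit


    def split_audio_stub(path: Path, workdir: Path) -> list[Path]:
        # stand-in for split_audio from summaree_bot/integrations/audio.py (added below)
        return [path]


    async def transcribe_chunk(chunk: Path) -> str:
        # stand-in for the now-async Whisper call (transcribe_file below)
        return f"<transcript of {chunk.name}>"


    async def summarize_large_audio(path: Path, workdir: Path) -> str:
        if path.stat().st_size > WHISPER_UPLOAD_CAP:
            chunks = split_audio_stub(path, workdir)  # cut at silence via ffmpeg
        else:
            chunks = [path]
        # transcribe chunks concurrently, then stitch the texts (stitching assumed)
        texts = await asyncio.gather(*(transcribe_chunk(c) for c in chunks))
        return " ".join(texts)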
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ __pycache__
 alembic.ini
 *.pyc
 .ruff_cache
+bot.session
2 changes: 2 additions & 0 deletions requirements.txt
@@ -11,3 +11,5 @@ python-magic
 prettytable
 pandas
 matplotlib
+telethon
+tqdm
4 changes: 2 additions & 2 deletions summaree_bot/bot/admin.py
@@ -67,8 +67,8 @@ def _dataset(update: Update, context: DbSessionContext) -> BotDocument:
     now = datetime.now(dt.UTC)
     filename = f"dataset-{now.isoformat()[:19]}.jsonl.bz2"
     bot_msg = BotDocument(
-        chat_id=update.message.chat_id,
-        reply_to_message_id=update.message.message_id,
+        chat_id=update.effective_message.chat_id,
+        reply_to_message_id=update.effective_message.message_id,
        filename=filename,
        document=compressed_buffer.getvalue(),
    )
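Side note on this switch: in python-telegram-bot, `Update.effective_message` falls back across `message`, `edited_message`, `channel_post`, and `edited_channel_post`, so the handler keeps working when the triggering update is not a plain message. Minimal illustration (hypothetical `update` object):

    # update.message is None for e.g. edited messages or channel posts;
    # effective_message returns whichever variant is present.
    msg = update.effective_message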
49 changes: 28 additions & 21 deletions summaree_bot/bot/audio.py
@@ -1,5 +1,6 @@
 import asyncio
 import datetime
+import os
 import tempfile
 from pathlib import Path
 from typing import Any, Coroutine, cast
@@ -10,14 +11,16 @@
 from telegram.constants import ChatAction, ParseMode
 from telegram.ext import ContextTypes
 from telegram.helpers import escape_markdown
+from telethon.sync import TelegramClient as TelethonClient
+from tqdm.asyncio import tqdm

 from ..integrations import (
     _check_existing_transcript,
     _elaborate,
     _extract_file_name,
     _summarize,
-    _transcribe_file,
     _translate_topic,
+    transcribe_file,
 )
 from ..integrations.deepl import _translate_text
 from ..logging import getLogger
@@ -51,15 +54,18 @@ async def get_summary_msg(update: Update, context: ContextTypes.DEFAULT_TYPE) ->
         # download the file to the folder
         tempdir_path = Path(tempdir_path_str)
         file_path = tempdir_path / file_name
-        file = await voice_or_audio.get_file()
-        await file.download_to_drive(file_path)
+        if voice_or_audio.file_size > 20 * 1024 * 1024:
+            await download_large_file(update.effective_chat.id, update.message.message_id, file_path)
+        else:
+            file = await voice_or_audio.get_file()
+            await file.download_to_drive(file_path)

         if not file_name.suffix:
             mime = magic.from_file(file_path, mime=True)
             _, suffix = mime.split("/")
             file_path.rename(file_path.with_suffix(f".{suffix}"))

-        transcript = _transcribe_file(update, context, file_path, voice_or_audio)
+        transcript = await transcribe_file(update, context, file_path, voice_or_audio)

     summary = _summarize(update, context, transcript)

@@ -94,6 +100,23 @@ async def get_summary_msg(update: Update, context: ContextTypes.DEFAULT_TYPE) ->
     return bot_msg


+async def download_large_file(chat_id, message_id, destination):
+    client = TelethonClient("bot", os.environ["TELEGRAM_API_ID"], os.environ["TELEGRAM_API_HASH"])
+    try:
+        await client.start(bot_token=os.environ["TELEGRAM_BOT_TOKEN"])
+        message = await client.get_messages(chat_id, ids=message_id)
+        if message.file:
+            _logger.info("Downloading large file")
+            with open(destination, "wb") as fp:
+                async for chunk in tqdm(client.iter_download(message)):
+                    fp.write(chunk)
+            _logger.info(f"File saved to {destination}")
+        else:
+            _logger.warning("This message does not contain a file")
+    finally:
+        await client.disconnect()
+
+
 @session_context
 def _get_summary_message(update: Update, context: DbSessionContext, summary: Summary) -> BotMessage:
     if update.effective_chat is None:
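Context for `download_large_file`: the Bot API's `get_file` tops out at 20 MB, while Telethon speaks MTProto directly and can fetch far larger media (Telegram caps files at 2 GB, 4 GB for premium uploads). `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` come from my.telegram.org and are needed on top of the bot token; the `"bot"` session name is what creates the `bot.session` file ignored in `.gitignore` above. A standalone usage sketch (ids and path are placeholders):

    import asyncio
    from pathlib import Path

    from summaree_bot.bot.audio import download_large_file

    # chat_id/message_id must reference a media message the bot can see
    asyncio.run(download_large_file(123456789, 42, Path("/tmp/large_voice.oga")))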
@@ -182,7 +205,6 @@ async def transcribe_and_summarize(update: Update, context: ContextTypes.DEFAULT_TYPE)
     with Session.begin() as session:
         # check how many transcripts/summaries have already been created in the current month
         chat = session.get(TelegramChat, update.effective_chat.id)
-
         file_size = cast(int, voice.file_size if voice else audio.file_size if audio else 0)
         subscription_keyboard = get_subscription_keyboard(update, context)
         if file_size > 10 * 1024 * 1024 and not chat.is_premium_active:
@@ -205,22 +227,7 @@ async def transcribe_and_summarize(update: Update, context: ContextTypes.DEFAULT_TYPE)
                 reply_markup=subscription_keyboard,
             )
             return
-        elif file_size > 25 * 1024 * 1024:
-            # TODO: openai whisper docs mention possible splitting of files >25MB -> look into/implement
-            # implement using pydub -> split audio into chunks of 25MB and process each chunk
-            # split using silence
-            lang_to_text = {
-                "en": "⚠️ Sorry, the file is too big to be processed (max. 25MB). Please send a smaller file.",
-                "de": "⚠️ Sorry, die Datei ist zu groß, um zu verarbeiten (max. 25MB). Bitte senden Sie eine kleinere Datei.",
-                "es": "⚠️ Lo sentimos, el archivo es demasiado grande para ser procesado (máximo 25MB). Envíe un archivo más pequeño.",
-                "ru": "⚠️ Извините, файл слишком большой, чтобы быть обработанным (максимум 25MB). Отправьте меньший файл.",
-            }
-            text = lang_to_text.get(update.effective_user.language_code, lang_to_text["en"])
-            await update.message.reply_text(text)
-            return

         current_month = datetime.datetime.now(tz=datetime.UTC).month
         summaries_this_month = (
             session.query(Summary)
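The hard 25 MB rejection deleted above is obsolete: oversized audio is now split before transcription (see `summaree_bot/integrations/audio.py` below). How much audio fits in one chunk follows from the bitrate; a worked example at the new module's defaults:

    max_size_mb = 24        # split_audio default, safely under Whisper's 25 MB cap
    bitrate_kbps = 96       # transcode_ffmpeg target
    max_chunk_seconds = max_size_mb * 1024 * 8 / bitrate_kbps
    print(max_chunk_seconds)  # 2048.0 s, i.e. ~34 minutes of audio per chunk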
4 changes: 2 additions & 2 deletions summaree_bot/integrations/__init__.py
@@ -5,7 +5,7 @@
     _elaborate,
     _extract_file_name,
     _summarize,
-    _transcribe_file,
+    transcribe_file,
 )

 __all__ = [
@@ -17,6 +17,6 @@
     "is_valid_email",
     "_check_existing_transcript",
     "_extract_file_name",
-    "_transcribe_file",
+    "transcribe_file",
     "_translate_topic",
 ]
111 changes: 111 additions & 0 deletions summaree_bot/integrations/audio.py
@@ -0,0 +1,111 @@
+import logging
+import re
+import shlex
+import subprocess
+import time
+from glob import glob
+from itertools import pairwise
+from pathlib import Path
+from typing import List
+
+import numpy as np
+
+_logger = logging.getLogger(__name__)
+
+SILENCE_START_PATTERN = re.compile(r".+silence_start:\s(\d+\.\d+)")
+SILENCE_END_PATTERN = re.compile(r".+silence_end:\s(\d+\.\d+)")
+BITRATE_PATTERN = re.compile(r".+?(\d+) kb\/s.*")
+
+
+def get_silent_segments(input_file: Path, min_silence_len: int = 500, noise_thresh: float = 0.001):
+    _logger.info(f"Getting silent segments from file: {input_file}")
+    cmd = f"ffmpeg -i {input_file} -af silencedetect=noise={noise_thresh}:d={min_silence_len}ms -f null -"
+    process = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    _, output = process.communicate()
+    if not process.returncode == 0:
+        msg = f"Getting silent segments failed: {output}"
+        _logger.error(msg)
+        raise ValueError(msg)
+    result = {"silent_segments": None, "bitrate": None}
+    silent_segments = []
+    found_bitrate = False
+    for line1, line2 in pairwise(output.decode("utf-8").splitlines()):
+        if not found_bitrate:
+            match_bitrate = BITRATE_PATTERN.match(line1)
+            if not match_bitrate:
+                continue
+            result["bitrate"] = int(match_bitrate.group(1))
+            found_bitrate = True
+
+        match_start = SILENCE_START_PATTERN.match(line1)
+        if match_start:
+            start = float(match_start.group(1))
+        else:
+            continue
+        match_end = SILENCE_END_PATTERN.match(line2)
+        if match_end:
+            end = float(match_end.group(1))
+            _logger.info(f"Silent segment: {start} - {end}")
+            silent_segments.append((start, end))
+
+    result["silent_segments"] = np.array(silent_segments)
+    _logger.info("Getting silent segments successfully finished")
+    return result
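The regexes above are matched against ffmpeg's `silencedetect` log, which arrives on stderr (hence `_, output = process.communicate()`). A representative log line and a quick sanity check; the filter-instance address varies per run:

    line = "[silencedetect @ 0x55d1c4c] silence_start: 12.345"
    match = SILENCE_START_PATTERN.match(line)
    assert match is not None and match.group(1) == "12.345"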


+def split_audio_ffmpeg(input_file: Path, output_dir: Path, segments: List[float]):
+    suffix = input_file.suffix
+    output_file = output_dir / f"{input_file.stem}_%03d{suffix}"
+    segment_times = ",".join(map(str, segments))
+    cmd = rf"ffmpeg -i {str(input_file)} -f segment -segment_times {segment_times} -c copy {str(output_file)}"
+    run_args = shlex.split(cmd)
+    process = subprocess.Popen(run_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    while process.poll() is None:
+        time.sleep(1)
+    stdout, stderr = process.communicate()
+    if process.returncode != 0:
+        msg = f"Splitting failed: {stderr}"
+        _logger.error(msg)
+        raise ValueError(msg)
+    _logger.info(f"Splitting successfully finished: {output_file}\n{stdout}")
+    files = glob(pathname=f"{input_file.stem}_*{suffix}", root_dir=output_dir)
+    return [output_dir / file for file in files]
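For reference, with split points at 1024.5 and 2049.0 seconds the assembled command expands to the line below (file names illustrative). `-c copy` re-muxes without re-encoding, so splitting is fast and lossless; the `glob` afterwards collects the numbered output files.

    ffmpeg -i voice.mp3 -f segment -segment_times 1024.5,2049.0 -c copy /tmp/chunks/voice_%03d.mp3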


+def transcode_ffmpeg(input_file: Path) -> Path:
+    _logger.info(f"Transcoding file: {input_file}")
+    output_file = input_file.parent / f"{input_file.stem}.mp3"
+    cmd = f"ffmpeg -i {input_file} -c:a libmp3lame -b:a 96k {output_file}"
+    run_args = shlex.split(cmd)
+    process = subprocess.Popen(run_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    while process.poll() is None:
+        time.sleep(1)
+
+    stdout, stderr = process.communicate()
+    if process.returncode != 0:
+        msg = f"Transcoding failed: {stderr}"
+        _logger.error(msg)
+        raise ValueError(msg)
+    _logger.info(f"Transcoding successfully finished: {output_file}\n{stdout}")
+    return output_file
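Transcoding to constant-bitrate MP3 does double duty: it shrinks the file and makes the duration-times-bitrate size estimate in `split_audio` below reliable (a variable-bitrate source would throw it off). At this target:

    mb_per_hour = 3600 * 96 / 8 / 1024  # one hour of 96 kb/s CBR audio ≈ 42 MB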


+def split_audio(
+    input_file: Path, output_dir: Path, max_size_mb: int = 24, min_silence_len: int = 500, noise_thresh: float = 0.001
+):
+    # get segments and bitrate
+    analysis = get_silent_segments(input_file, min_silence_len=min_silence_len, noise_thresh=noise_thresh)
+    # candidate split points: the midpoint of each silent segment
+    splits = analysis["silent_segments"].mean(axis=1)
+    segments = []
+    current_start = 0
+    for split1, split2 in pairwise(splits):
+        # fill up each chunk to the max size:
+        # if extending the chunk to split2 would exceed the limit, cut at split1
+        if (split2 - current_start) * analysis["bitrate"] / 1024 / 8 > max_size_mb:
+            segments.append(split1)
+            current_start = split1
+
+    # hand the chosen cut points to ffmpeg's segment muxer
+    split_paths = split_audio_ffmpeg(input_file, output_dir, segments)
+    return split_paths
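Putting the new module together, a minimal usage sketch (paths are placeholders; assumes ffmpeg is on PATH and the output directory exists):

    from pathlib import Path

    from summaree_bot.integrations.audio import split_audio, transcode_ffmpeg

    workdir = Path("/tmp/chunks")
    workdir.mkdir(parents=True, exist_ok=True)

    mp3 = transcode_ffmpeg(Path("/tmp/long_voice.ogg"))  # re-encode to 96 kb/s MP3
    chunks = split_audio(mp3, workdir)                   # cut at silence, ~24 MB max each
    print([chunk.name for chunk in chunks])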
[Diffs for the remaining two of the eight changed files were not loaded in this view.]
