[ADD] support for large audio/voice files
- Add Telethon to support MTProto (large files)
- restructure whisper call into async call
- split files using `ffmpeg`
maschlr committed Aug 30, 2024
1 parent 447d44d commit 2e9900f
Showing 8 changed files with 245 additions and 71 deletions.
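For orientation before the per-file diffs: the Bot API refuses `getFile` downloads above 20 MB, and OpenAI's Whisper endpoint rejects uploads above 25 MB. This commit works around both limits by fetching big files over MTProto (via Telethon) and splitting long audio at silent points with ffmpeg before transcribing. A condensed sketch of the resulting pipeline follows; the wiring and both stubs are illustrative, not code from this commit:

    import asyncio
    from pathlib import Path

    BOT_API_DOWNLOAD_CAP = 20 * 1024 * 1024  # Bot API getFile limit
    WHISPER_UPLOAD_CAP = 25 * 1024 * 1024    # Whisper API upload limit


    def split_audio_stub(path: Path, workdir: Path) -> list[Path]:
        # stand-in for split_audio from summaree_bot/integrations/audio.py (added below)
        return [path]


    async def transcribe_chunk(chunk: Path) -> str:
        # stand-in for the now-async Whisper call (transcribe_file below)
        return f"<transcript of {chunk.name}>"


    async def summarize_large_audio(path: Path, workdir: Path) -> str:
        if path.stat().st_size > WHISPER_UPLOAD_CAP:
            chunks = split_audio_stub(path, workdir)  # cut at silence via ffmpeg
        else:
            chunks = [path]
        # transcribe chunks concurrently, then stitch the texts (stitching assumed)
        texts = await asyncio.gather(*(transcribe_chunk(c) for c in chunks))
        return " ".join(texts)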
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ __pycache__
 alembic.ini
 *.pyc
 .ruff_cache
+bot.session
2 changes: 2 additions & 0 deletions requirements.txt
@@ -11,3 +11,5 @@ python-magic
 prettytable
 pandas
 matplotlib
+telethon
+tqdm
4 changes: 2 additions & 2 deletions summaree_bot/bot/admin.py
@@ -67,8 +67,8 @@ def _dataset(update: Update, context: DbSessionContext) -> BotDocument:
     now = datetime.now(dt.UTC)
     filename = f"dataset-{now.isoformat()[:19]}.jsonl.bz2"
     bot_msg = BotDocument(
-        chat_id=update.message.chat_id,
-        reply_to_message_id=update.message.message_id,
+        chat_id=update.effective_message.chat_id,
+        reply_to_message_id=update.effective_message.message_id,
        filename=filename,
        document=compressed_buffer.getvalue(),
    )
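Side note on this switch: in python-telegram-bot, `Update.effective_message` falls back across `message`, `edited_message`, `channel_post`, and `edited_channel_post`, so the handler keeps working when the triggering update is not a plain message. Minimal illustration (hypothetical `update` object):

    # update.message is None for e.g. edited messages or channel posts;
    # effective_message returns whichever variant is present.
    msg = update.effective_message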
49 changes: 28 additions & 21 deletions summaree_bot/bot/audio.py
@@ -1,5 +1,6 @@
 import asyncio
 import datetime
+import os
 import tempfile
 from pathlib import Path
 from typing import Any, Coroutine, cast
@@ -10,14 +11,16 @@
 from telegram.constants import ChatAction, ParseMode
 from telegram.ext import ContextTypes
 from telegram.helpers import escape_markdown
+from telethon.sync import TelegramClient as TelethonClient
+from tqdm.asyncio import tqdm

 from ..integrations import (
     _check_existing_transcript,
     _elaborate,
     _extract_file_name,
     _summarize,
-    _transcribe_file,
     _translate_topic,
+    transcribe_file,
 )
 from ..integrations.deepl import _translate_text
 from ..logging import getLogger
@@ -51,15 +54,18 @@ async def get_summary_msg(update: Update, context: ContextTypes.DEFAULT_TYPE) ->
         # download the file to the folder
         tempdir_path = Path(tempdir_path_str)
         file_path = tempdir_path / file_name
-        file = await voice_or_audio.get_file()
-        await file.download_to_drive(file_path)
+        if voice_or_audio.file_size > 20 * 1024 * 1024:
+            await download_large_file(update.effective_chat.id, update.message.message_id, file_path)
+        else:
+            file = await voice_or_audio.get_file()
+            await file.download_to_drive(file_path)

         if not file_name.suffix:
             mime = magic.from_file(file_path, mime=True)
             _, suffix = mime.split("/")
             file_path.rename(file_path.with_suffix(f".{suffix}"))

-        transcript = _transcribe_file(update, context, file_path, voice_or_audio)
+        transcript = await transcribe_file(update, context, file_path, voice_or_audio)

     summary = _summarize(update, context, transcript)

@@ -94,6 +100,23 @@ async def get_summary_msg(update: Update, context: ContextTypes.DEFAULT_TYPE) ->
     return bot_msg


+async def download_large_file(chat_id, message_id, destination):
+    client = TelethonClient("bot", os.environ["TELEGRAM_API_ID"], os.environ["TELEGRAM_API_HASH"])
+    try:
+        await client.start(bot_token=os.environ["TELEGRAM_BOT_TOKEN"])
+        message = await client.get_messages(chat_id, ids=message_id)
+        if message.file:
+            _logger.info("Downloading large file")
+            with open(destination, "wb") as fp:
+                async for chunk in tqdm(client.iter_download(message)):
+                    fp.write(chunk)
+            _logger.info(f"File saved to {destination}")
+        else:
+            _logger.warning("This message does not contain a file")
+    finally:
+        await client.disconnect()
+
+
 @session_context
 def _get_summary_message(update: Update, context: DbSessionContext, summary: Summary) -> BotMessage:
     if update.effective_chat is None:
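Context for `download_large_file`: the Bot API's `get_file` tops out at 20 MB, while Telethon speaks MTProto directly and can fetch far larger media (Telegram caps files at 2 GB, 4 GB for premium uploads). `TELEGRAM_API_ID` and `TELEGRAM_API_HASH` come from my.telegram.org and are needed on top of the bot token; the `"bot"` session name is what creates the `bot.session` file ignored in `.gitignore` above. A standalone usage sketch (ids and path are placeholders):

    import asyncio
    from pathlib import Path

    from summaree_bot.bot.audio import download_large_file

    # chat_id/message_id must reference a media message the bot can see
    asyncio.run(download_large_file(123456789, 42, Path("/tmp/large_voice.oga")))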
@@ -182,7 +205,6 @@ async def transcribe_and_summarize(update: Update, context: ContextTypes.DEFAULT_TYPE)
     with Session.begin() as session:
         # check how many transcripts/summaries have already been created in the current month
         chat = session.get(TelegramChat, update.effective_chat.id)
-
         file_size = cast(int, voice.file_size if voice else audio.file_size if audio else 0)
         subscription_keyboard = get_subscription_keyboard(update, context)
         if file_size > 10 * 1024 * 1024 and not chat.is_premium_active:
@@ -205,22 +227,7 @@ async def transcribe_and_summarize(update: Update, context: ContextTypes.DEFAULT_TYPE)
                 reply_markup=subscription_keyboard,
             )
             return
-        elif file_size > 25 * 1024 * 1024:
-            # TODO: openai whisper docs mention possible splitting of files >25MB -> look into/implement
-            # implement using pydub -> split audio into chunks of 25MB and process each chunk
-            # split using silence
-            lang_to_text = {
-                "en": "⚠️ Sorry, the file is too big to be processed (max. 25MB). Please send a smaller file.",
-                "de": "⚠️ Sorry, die Datei ist zu groß, um zu verarbeiten (max. 25MB). Bitte senden Sie eine kleinere Datei.",
-                "es": "⚠️ Lo sentimos, el archivo es demasiado grande para ser procesado (máximo 25MB). Envíe un archivo más pequeño.",
-                "ru": "⚠️ Извините, файл слишком большой, чтобы быть обработанным (максимум 25MB). Отправьте меньший файл.",
-            }
-            text = lang_to_text.get(update.effective_user.language_code, lang_to_text["en"])
-            await update.message.reply_text(text)
-            return

         current_month = datetime.datetime.now(tz=datetime.UTC).month
         summaries_this_month = (
             session.query(Summary)
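The hard 25 MB rejection deleted above is obsolete: oversized audio is now split before transcription (see `summaree_bot/integrations/audio.py` below). How much audio fits in one chunk follows from the bitrate; a worked example at the new module's defaults:

    max_size_mb = 24        # split_audio default, safely under Whisper's 25 MB cap
    bitrate_kbps = 96       # transcode_ffmpeg target
    max_chunk_seconds = max_size_mb * 1024 * 8 / bitrate_kbps
    print(max_chunk_seconds)  # 2048.0 s, i.e. ~34 minutes of audio per chunk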
4 changes: 2 additions & 2 deletions summaree_bot/integrations/__init__.py
@@ -5,7 +5,7 @@
     _elaborate,
     _extract_file_name,
     _summarize,
-    _transcribe_file,
+    transcribe_file,
 )

 __all__ = [
@@ -17,6 +17,6 @@
     "is_valid_email",
     "_check_existing_transcript",
     "_extract_file_name",
-    "_transcribe_file",
+    "transcribe_file",
     "_translate_topic",
 ]
111 changes: 111 additions & 0 deletions summaree_bot/integrations/audio.py
@@ -0,0 +1,111 @@
+import logging
+import re
+import shlex
+import subprocess
+import time
+from glob import glob
+from itertools import pairwise
+from pathlib import Path
+from typing import List
+
+import numpy as np
+
+_logger = logging.getLogger(__name__)
+
+SILENCE_START_PATTERN = re.compile(r".+silence_start:\s(\d+\.\d+)")
+SILENCE_END_PATTERN = re.compile(r".+silence_end:\s(\d+\.\d+)")
+BITRATE_PATTERN = re.compile(r".+?(\d+) kb\/s.*")
+
+
+def get_silent_segments(input_file: Path, min_silence_len: int = 500, noise_thresh: float = 0.001):
+    _logger.info(f"Getting silent segments from file: {input_file}")
+    cmd = f"ffmpeg -i {input_file} -af silencedetect=noise={noise_thresh}:d={min_silence_len}ms -f null -"
+    process = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    _, output = process.communicate()
+    if not process.returncode == 0:
+        msg = f"Getting silent segments failed: {output}"
+        _logger.error(msg)
+        raise ValueError(msg)
+    result = {"silent_segments": None, "bitrate": None}
+    silent_segments = []
+    found_bitrate = False
+    for line1, line2 in pairwise(output.decode("utf-8").splitlines()):
+        if not found_bitrate:
+            match_bitrate = BITRATE_PATTERN.match(line1)
+            if not match_bitrate:
+                continue
+            result["bitrate"] = int(match_bitrate.group(1))
+            found_bitrate = True
+
+        match_start = SILENCE_START_PATTERN.match(line1)
+        if match_start:
+            start = float(match_start.group(1))
+        else:
+            continue
+        match_end = SILENCE_END_PATTERN.match(line2)
+        if match_end:
+            end = float(match_end.group(1))
+            _logger.info(f"Silent segment: {start} - {end}")
+            silent_segments.append((start, end))
+
+    result["silent_segments"] = np.array(silent_segments)
+    _logger.info("Getting silent segments successfully finished")
+    return result
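The regexes above are matched against ffmpeg's `silencedetect` log, which arrives on stderr (hence `_, output = process.communicate()`). A representative log line and a quick sanity check; the filter-instance address varies per run:

    line = "[silencedetect @ 0x55d1c4c] silence_start: 12.345"
    match = SILENCE_START_PATTERN.match(line)
    assert match is not None and match.group(1) == "12.345"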


+def split_audio_ffmpeg(input_file: Path, output_dir: Path, segments: List[float]):
+    suffix = input_file.suffix
+    output_file = output_dir / f"{input_file.stem}_%03d{suffix}"
+    segment_times = ",".join(map(str, segments))
+    cmd = rf"ffmpeg -i {str(input_file)} -f segment -segment_times {segment_times} -c copy {str(output_file)}"
+    run_args = shlex.split(cmd)
+    process = subprocess.Popen(run_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    while process.poll() is None:
+        time.sleep(1)
+    stdout, stderr = process.communicate()
+    if process.returncode != 0:
+        msg = f"Splitting failed: {stderr}"
+        _logger.error(msg)
+        raise ValueError(msg)
+    _logger.info(f"Splitting successfully finished: {output_file}\n{stdout}")
+    files = glob(pathname=f"{input_file.stem}_*{suffix}", root_dir=output_dir)
+    return [output_dir / file for file in files]
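For reference, with split points at 1024.5 and 2049.0 seconds the assembled command expands to the line below (file names illustrative). `-c copy` re-muxes without re-encoding, so splitting is fast and lossless; the `glob` afterwards collects the numbered output files.

    ffmpeg -i voice.mp3 -f segment -segment_times 1024.5,2049.0 -c copy /tmp/chunks/voice_%03d.mp3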


+def transcode_ffmpeg(input_file: Path) -> Path:
+    _logger.info(f"Transcoding file: {input_file}")
+    output_file = input_file.parent / f"{input_file.stem}.mp3"
+    cmd = f"ffmpeg -i {input_file} -c:a libmp3lame -b:a 96k {output_file}"
+    run_args = shlex.split(cmd)
+    process = subprocess.Popen(run_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    while process.poll() is None:
+        time.sleep(1)
+
+    stdout, stderr = process.communicate()
+    if process.returncode != 0:
+        msg = f"Transcoding failed: {stderr}"
+        _logger.error(msg)
+        raise ValueError(msg)
+    _logger.info(f"Transcoding successfully finished: {output_file}\n{stdout}")
+    return output_file
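Transcoding to constant-bitrate MP3 does double duty: it shrinks the file and makes the duration-times-bitrate size estimate in `split_audio` below reliable (a variable-bitrate source would throw it off). At this target:

    mb_per_hour = 3600 * 96 / 8 / 1024  # one hour of 96 kb/s CBR audio ≈ 42 MB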


+def split_audio(
+    input_file: Path, output_dir: Path, max_size_mb: int = 24, min_silence_len: int = 500, noise_thresh: float = 0.001
+):
+    # get segments and bitrate
+    analysis = get_silent_segments(input_file, min_silence_len=min_silence_len, noise_thresh=noise_thresh)
+    # candidate split points: the midpoint of each silent segment
+    splits = analysis["silent_segments"].mean(axis=1)
+    segments = []
+    current_start = 0
+    for split1, split2 in pairwise(splits):
+        # fill up each chunk to the max size:
+        # if extending the chunk to split2 would exceed the limit, cut at split1
+        if (split2 - current_start) * analysis["bitrate"] / 1024 / 8 > max_size_mb:
+            segments.append(split1)
+            current_start = split1
+
+    # hand the chosen cut points to ffmpeg's segment muxer
+    split_paths = split_audio_ffmpeg(input_file, output_dir, segments)
+    return split_paths
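Putting the new module together, a minimal usage sketch (paths are placeholders; assumes ffmpeg is on PATH and the output directory exists):

    from pathlib import Path

    from summaree_bot.integrations.audio import split_audio, transcode_ffmpeg

    workdir = Path("/tmp/chunks")
    workdir.mkdir(parents=True, exist_ok=True)

    mp3 = transcode_ffmpeg(Path("/tmp/long_voice.ogg"))  # re-encode to 96 kb/s MP3
    chunks = split_audio(mp3, workdir)                   # cut at silence, ~24 MB max each
    print([chunk.name for chunk in chunks])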
[Diffs for the remaining two of the eight changed files were not loaded in this view.]
