From 3e8a175c6f6c178f3583cbb929e35134466e0997 Mon Sep 17 00:00:00 2001
From: Chitoku YATO
Date: Wed, 3 Apr 2024 10:46:24 -0700
Subject: [PATCH 1/3] Add whisper_streaming package

---
 packages/audio/whisper_streaming/Dockerfile | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 packages/audio/whisper_streaming/Dockerfile

diff --git a/packages/audio/whisper_streaming/Dockerfile b/packages/audio/whisper_streaming/Dockerfile
new file mode 100644
index 000000000..3bf228d5a
--- /dev/null
+++ b/packages/audio/whisper_streaming/Dockerfile
@@ -0,0 +1,21 @@
+#---
+# name: whisper_streaming
+# group: audio
+# depends: [pytorch, torchaudio, faster-whisper]
+# requires: '>=34.1.0'
+# docs: docs.md
+#---
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+
+WORKDIR /opt
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        ffmpeg \
+    && rm -rf /var/lib/apt/lists/* \
+    && apt-get clean
+
+RUN git clone https://github.com/ufal/whisper_streaming.git && \
+    cd whisper_streaming && \
+    pip3 install --no-cache-dir --verbose librosa soundfile
\ No newline at end of file

From 51dbca89d9eee97b9b3f70538e545a3ef66783c8 Mon Sep 17 00:00:00 2001
From: Chitoku YATO
Date: Thu, 9 May 2024 14:50:26 -0700
Subject: [PATCH 2/3] Add docs

---
 packages/audio/whisper_streaming/docs.md | 54 ++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 packages/audio/whisper_streaming/docs.md

diff --git a/packages/audio/whisper_streaming/docs.md b/packages/audio/whisper_streaming/docs.md
new file mode 100644
index 000000000..5c250d89e
--- /dev/null
+++ b/packages/audio/whisper_streaming/docs.md
@@ -0,0 +1,54 @@
+
+* whisper_streaming from https://github.com/ufal/whisper_streaming
+
+### Testing real-time simulation from audio file
+
+Once inside the container:
+
+```bash
+cd whisper_streaming/
+python3 whisper_online.py --model tiny.en --lan en --backend faster-whisper /data/audio/asr/Micro-Machine.wav
+```
+
+If you want to save all the output to a file:
+
+```bash
+time python3 whisper_online.py --model large-v3 --lan en --backend faster-whisper /data/audio/asr/Micro-Machine.wav 2>&1 | tee -a /data/audio/asr/MM_large-v3_En.logws
+```
+
+### Testing server mode -- real-time from mic
+
+#### Terminal 1: Inside the container
+
+```bash
+cd whisper_streaming/
+python3 whisper_online_server.py --port 43001 --model medium.en
+```
+
+#### Terminal 2: Outside the container
+
+In another terminal on the host (not in the container), first check whether your system can find a microphone:
+
+```bash
+arecord -l
+```
+
+The output may contain a list like this, confirming that the device is seen as `hw:2,0`:
+
+```
+card 2: Headset [Logitech USB Headset], device 0: USB Audio [USB Audio]
+  Subdevices: 1/1
+  Subdevice #0: subdevice #0
+```
+
+You can execute the following to netcat the captured audio to `localhost:43001` so that the server running in the container can process it:
+
+```bash
+arecord -f S16_LE -c1 -r 16000 -t raw -D hw:2,0 | nc localhost 43001
+```
+
+### Benchmark
+
+
+
+

From 0df27a520140db248f029f086f5f48f05bb6dbbc Mon Sep 17 00:00:00 2001
From: Chitoku YATO
Date: Thu, 9 May 2024 14:51:59 -0700
Subject: [PATCH 3/3] Copy SAM benchmark script for now

---
 packages/audio/whisper_streaming/benchmark.py | 144 ++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100644 packages/audio/whisper_streaming/benchmark.py

diff --git a/packages/audio/whisper_streaming/benchmark.py b/packages/audio/whisper_streaming/benchmark.py
new file mode 100644
index 000000000..26e864ef6
--- /dev/null
+++ b/packages/audio/whisper_streaming/benchmark.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+import os
+import time
+import datetime
+import resource
+import argparse
+import socket
+from urllib.parse import urlparse
+
+import numpy as np
+import matplotlib.pyplot as plt
+import cv2
+import PIL.Image
+from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--checkpoint", type=str, default="https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth")
+parser.add_argument('-i', '--images', action='append', nargs='*', help="Paths to images to test")
+
+parser.add_argument('-r', '--runs', type=int, default=2, help="Number of inferencing runs to do (for timing)")
+parser.add_argument('-w', '--warmup', type=int, default=1, help='the number of warmup iterations')
+
+parser.add_argument('-s', '--save', type=str, default='', help='CSV file to save benchmarking results to')
+
+args = parser.parse_args()
+
+if not args.images:
+    args.images = [
+        "https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/dog.jpg",
+        "https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/groceries.jpg",
+        "https://raw.githubusercontent.com/facebookresearch/segment-anything/main/notebooks/images/truck.jpg",
+    ]
+else:
+    args.images = [x[0] for x in args.images]
+
+print(args)
+
+import requests
+from tqdm import tqdm
+
+def download_from_url(url, filename=None):
+
+    if filename is None:
+        filename = os.path.basename(urlparse(url).path)
+
+    if not os.path.isfile(filename):
+
+        response = requests.get(url, stream=True)
+        total_size_in_bytes = int(response.headers.get('content-length', 0))
+        block_size = 1024 # 1 Kibibyte
+
+        print(f"Downloading {filename} :")
+        progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
+
+        with open(filename, 'wb') as file:
+            for data in response.iter_content(block_size):
+                progress_bar.update(len(data))
+                file.write(data)
+
+        progress_bar.close()
+        if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
+            print("ERROR, download failed!")
+
+    return os.path.abspath(filename)
+
+def get_max_rss():  # peak memory usage in MB (max RSS - https://stackoverflow.com/a/7669482)
+    return (resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss) / 1024
+
+def save_anns(cv2_image, anns):
+
+    plt.imshow(cv2_image)
+
+    if len(anns) == 0:
+        return
+    sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
+    ax = plt.gca()
+    ax.set_autoscale_on(False)
+
+    img = np.ones((sorted_anns[0]['segmentation'].shape[0], sorted_anns[0]['segmentation'].shape[1], 4))
+    img[:,:,3] = 0
+    for ann in sorted_anns:
+        m = ann['segmentation']
+        color_mask = np.concatenate([np.random.random(3), [0.35]])
+        img[m] = color_mask
+
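+    # overlay the accumulated transparent color masks on the image plotted above, then save the figure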
+    plt.imshow(img)
+    plt.axis('off')
+    plt.savefig("sam_benchmark_output.jpg")
+
+avg_encoder=0
+avg_latency=0
+cv2_image=None
+mask=None
+
+CHECKPOINT_URL = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
+FILENAME = os.path.basename(urlparse(args.checkpoint).path)
+download_from_url(args.checkpoint, FILENAME)
+
+sam_checkpoint = FILENAME   # load whatever --checkpoint was downloaded to, rather than a hardcoded filename
+model_type = "vit_h"
+device = "cuda"
+
+sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
+sam.to(device=device)
+mask_generator = SamAutomaticMaskGenerator(sam)
+
+imagepaths = []
+for imageurl in args.images:
+    imagepaths.append(download_from_url(imageurl))
+
+for run in range(args.runs + args.warmup):
+
+    for imagepath in imagepaths:
+
+        cv2_image = cv2.imread(imagepath)
+        cv2_image = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
+
+        time_begin = time.perf_counter()
+        masks = mask_generator.generate(cv2_image)
+        time_elapsed = time.perf_counter() - time_begin
+
+        print(f"{imagepath}")
+        print(f"  Full pipeline : {time_elapsed:.3f} seconds")
+
+        if run >= args.warmup:
+            avg_latency += time_elapsed
+
+avg_latency /= ( args.runs * len(args.images) )
+
+memory_usage = get_max_rss()
+
+print(f"AVERAGE of {args.runs} runs:")
+print(f"  latency --- {avg_latency:.3f} sec")
+print(f"Memory consumption : {memory_usage:.2f} MB")
+
+save_anns(cv2_image, masks)
+
+if args.save:
+    if not os.path.isfile(args.save):  # csv header
+        with open(args.save, 'w') as file:
+            file.write(f"timestamp, hostname, api, checkpoint, latency, memory\n")
+    with open(args.save, 'a') as file:
+        file.write(f"{datetime.datetime.now().strftime('%Y%m%d %H:%M:%S')}, {socket.gethostname()}, ")
+        file.write(f"sam-python, {args.checkpoint}, {avg_latency}, {memory_usage}\n")
\ No newline at end of file
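The script above still benchmarks SAM, as the patch subject notes. For reference, a minimal sketch of what a whisper_streaming latency benchmark could look like instead, assuming the `faster_whisper.WhisperModel` API provided by the `faster-whisper` dependency and the `/data/audio/asr/Micro-Machine.wav` clip used in docs.md; the model size, compute type, and run counts are placeholder choices, not part of this patch:

```python
#!/usr/bin/env python3
# Hypothetical latency benchmark for the faster-whisper backend used by whisper_streaming.
# Model size, audio path, and run counts are assumptions, not part of this PR.
import time
import argparse

from faster_whisper import WhisperModel

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default='tiny.en', help='Whisper model size to load')
parser.add_argument('--audio', type=str, default='/data/audio/asr/Micro-Machine.wav', help='audio file to transcribe')
parser.add_argument('-r', '--runs', type=int, default=2, help='number of timed runs')
parser.add_argument('-w', '--warmup', type=int, default=1, help='number of warmup runs')
args = parser.parse_args()

# load the model once on the GPU; float16 keeps memory use reasonable on Jetson
model = WhisperModel(args.model, device='cuda', compute_type='float16')

avg_latency = 0.0

for run in range(args.runs + args.warmup):
    time_begin = time.perf_counter()
    segments, info = model.transcribe(args.audio, language='en')
    text = ''.join(segment.text for segment in segments)  # consume the generator so the work actually runs
    time_elapsed = time.perf_counter() - time_begin

    print(f'run {run}: {time_elapsed:.3f} seconds ({len(text)} chars transcribed)')

    if run >= args.warmup:
        avg_latency += time_elapsed

avg_latency /= args.runs
print(f'AVERAGE of {args.runs} runs: {avg_latency:.3f} sec')
```

Like the SAM script above, it keeps warmup iterations out of the timed average so model loading and first-run initialization do not skew the result.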