-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_whisper_baselines.py
41 lines (31 loc) · 1.67 KB
/
run_whisper_baselines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import re
import json
from evaluate import load
wer = load("wer")
cer = load("cer")
def find_top_predictions_and_references(file_path):
pattern = r'<hypothesis1>(.*?)</hypothesis1>'
with open(file_path, "r") as f:
data = json.load(f)
predictions = []
references = []
for data_point in data:
matches = re.findall(pattern, data_point["input"], re.DOTALL)
predictions.append(matches[0])
references.append(data_point["output"])
return predictions, references
def find_wer_and_cer(predictions, references):
assert(len(predictions) == len(references))
wer_ = 100 * wer.compute(predictions=predictions, references=references)
cer_ = 100 * cer.compute(predictions=predictions, references=references)
return wer_, cer_
partial_amounts = ["5-percent-removed", "10-percent-removed", "15-percent-removed", "20-percent-removed", "25-percent-removed"]
for partial_amount in partial_amounts:
test_other_predictions, test_other_references = find_top_predictions_and_references(f"llama2-data/{partial_amount}/test-other.json")
test_clean_predictions, test_clean_references = find_top_predictions_and_references(f"llama2-data/{partial_amount}/test-clean.json")
test_other_wer, test_other_cer = find_wer_and_cer(test_other_predictions, test_other_references)
test_clean_wer, test_clean_cer = find_wer_and_cer(test_clean_predictions, test_clean_references)
print(f"LibriSpeech Test Other WER ({partial_amount}): {test_other_wer:.2f}%")
print(f"LibriSpeech Test Clean WER ({partial_amount}): {test_clean_wer:.2f}%")
print(f"LibriSpeech Test Other CER ({partial_amount}): {test_other_cer:.2f}%")
print(f"LibriSpeech Test Clean CER ({partial_amount}): {test_clean_cer:.2f}%")