From 3f6f38a6d45d38edf607042a7ee45e449e1cd1fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Drago=C8=99?= Date: Mon, 13 Nov 2023 15:02:48 +0100 Subject: [PATCH] Add 1 more variation + keep hyphen (-) when stripping punctuation --- ASR_NL_benchmark/normalize.py | 2 +- ASR_NL_benchmark/variations.glm | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ASR_NL_benchmark/normalize.py b/ASR_NL_benchmark/normalize.py index be738f8..e3121af 100644 --- a/ASR_NL_benchmark/normalize.py +++ b/ASR_NL_benchmark/normalize.py @@ -44,7 +44,7 @@ def replace_numbers_and_symbols(text): >>> replace_numbers_and_symbols('12,3%') 'twaalf komma drie procent' """ - removed_punct = string.punctuation.replace("'", '') + removed_punct = string.punctuation.replace("'", '').replace('-', '') text_without_symbols = replace_symbols(text) clean_text = replace_numbers(text_without_symbols) clean_text = clean_text.translate(str.maketrans('', '', removed_punct)) diff --git a/ASR_NL_benchmark/variations.glm b/ASR_NL_benchmark/variations.glm index 1c80848..4c841a4 100644 --- a/ASR_NL_benchmark/variations.glm +++ b/ASR_NL_benchmark/variations.glm @@ -59,6 +59,7 @@ tewerk => te werk / [ ] __ [ ] [concept-] => [{ concept- / concept }] / [ ] __ [ ] [NAVO-] => [{ NAVO- / NAVO }] / [ ] __ [ ] [uh] => [{ uh / %HESITATION }] / [ ] __ [ ] +[bnr-nieuwsradio] => [{ bnr-nieuwsradio / bnr nieuwsradio }] ;; ;; BN-VL [Darfour] => [{ Darfour / Darfur }] / [ ] __ [ ]