From bbf910392bee9a6c36fc5929cabb1417a3cf21b3 Mon Sep 17 00:00:00 2001 From: Branislav Gerazov Date: Mon, 6 May 2024 15:51:00 +0200 Subject: [PATCH 1/3] add max_chars to alignment --- jiwer/alignment.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/jiwer/alignment.py b/jiwer/alignment.py index 3944b7c..97dfd43 100644 --- a/jiwer/alignment.py +++ b/jiwer/alignment.py @@ -33,6 +33,7 @@ def visualize_alignment( output: Union[WordOutput, CharacterOutput], show_measures: bool = True, skip_correct: bool = True, + max_chars: int = None, ) -> str: """ Visualize the output of [jiwer.process_words][process.process_words] and @@ -46,6 +47,7 @@ def visualize_alignment( show_measures: If enabled, the visualization will include measures like the WER or CER skip_correct: If enabled, the visualization will exclude correct reference and hypothesis pairs + max_chars: If set, split the aligned strings into multiple lines when they exceed this length Returns: (str): The visualization as a string @@ -109,7 +111,7 @@ def visualize_alignment( final_str += f"sentence {idx+1}\n" final_str += _construct_comparison_string( - gt, hp, chunks, include_space_seperator=not is_cer + gt, hp, chunks, include_space_seperator=not is_cer, max_chars=max_chars, ) final_str += "\n" @@ -139,10 +141,12 @@ def _construct_comparison_string( hypothesis: List[str], ops: List[AlignmentChunk], include_space_seperator: bool = False, + max_chars: int = None, ) -> str: ref_str = "REF: " hyp_str = "HYP: " op_str = " " + agg_str = "" # aggregate string for max_chars split for op in ops: if op.type == "equal" or op.type == "substitute": @@ -163,6 +167,18 @@ def _construct_comparison_string( op_chars = [op_char for _ in range(len(ref))] for rf, hp, c in zip(ref, hyp, op_chars): str_len = max(len(rf), len(hp), len(c)) + if max_chars is not None: + if len(ref_str) + str_len > max_chars: + # aggregate the strings + if include_space_seperator: + agg_str = 
f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n\n" + else: + agg_str = f"{ref_str}\n{hyp_str}\n{op_str}\n\n" + + # reset the strings + ref_str = "REF: " + hyp_str = "HYP: " + op_str = " " if rf == "*": rf = "".join(["*"] * str_len) @@ -180,6 +196,6 @@ def _construct_comparison_string( if include_space_seperator: # remove last space - return f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n" + return agg_str + f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n" else: - return f"{ref_str}\n{hyp_str}\n{op_str}\n" + return agg_str + f"{ref_str}\n{hyp_str}\n{op_str}\n" From 3f7b0a2d01eddbe3f315369c4b2fcc5d29a9bbcb Mon Sep 17 00:00:00 2001 From: Branislav Gerazov Date: Mon, 6 May 2024 16:00:57 +0200 Subject: [PATCH 2/3] fix aggregation --- jiwer/alignment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jiwer/alignment.py b/jiwer/alignment.py index 97dfd43..c4b43aa 100644 --- a/jiwer/alignment.py +++ b/jiwer/alignment.py @@ -171,9 +171,9 @@ def _construct_comparison_string( if len(ref_str) + str_len > max_chars: # aggregate the strings if include_space_seperator: - agg_str = f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n\n" + agg_str += f"{ref_str[:-1]}\n{hyp_str[:-1]}\n{op_str[:-1]}\n\n" else: - agg_str = f"{ref_str}\n{hyp_str}\n{op_str}\n\n" + agg_str += f"{ref_str}\n{hyp_str}\n{op_str}\n\n" # reset the strings ref_str = "REF: " From 5e3de5e5745b33b5c9855b77a388be403a33f992 Mon Sep 17 00:00:00 2001 From: Branislav Gerazov Date: Mon, 6 May 2024 16:09:17 +0200 Subject: [PATCH 3/3] add example in docstring --- jiwer/alignment.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/jiwer/alignment.py b/jiwer/alignment.py index c4b43aa..3d9120c 100644 --- a/jiwer/alignment.py +++ b/jiwer/alignment.py @@ -98,6 +98,19 @@ def visualize_alignment( HYP: quite * bit of an even longest sentence here D I I S I ``` + + When setting `max_chars=80`, the output will be split into multiple lines: + + ```txt + sentence 1 + REF: This is a very long 
sentence that is *** much longer than the previous one + HYP: This is a very loong sentence that is not much longer than the previous one + S I + + REF: or the one before that + HYP: or *** one before that + D + ``` """ references = output.references hypothesis = output.hypotheses