adds count num sequences and tokens metric #346

Merged
merged 2 commits on Mar 21, 2024
Changes from 1 commit
88 changes: 87 additions & 1 deletion fuse/eval/metrics/sequence_gen/metrics_seq_gen_common.py
@@ -18,13 +18,99 @@
"""
from typing import Optional, Tuple, List
from functools import partial

from copy import copy
import torch
import numpy as np

from fuse.eval.metrics.metrics_common import MetricPerBatchDefault


class MetricCountSeqAndTokens(MetricPerBatchDefault):
Collaborator:
General question: I'm not sure counting sequences and tokens should be defined as a metric. I don't have another suggestion; it just sounds a bit weird :)

What do you think?

Collaborator Author:
It uses the metric mechanism, and it's OK with me that it just counts some stats.

"""
Counts the total number sequences and tokens in encoder_input
"""

    def __init__(
        self,
        encoder_input: str,
        ignore_index: Optional[int] = None,
        state: Optional[dict] = None,
        **kwargs: dict,
    ) -> None:
        """
        :param encoder_input: key to the encoder_input
        :param ignore_index: token_id to ignore (not to count), typically pad token id
Collaborator:
I think it should be able to support a list of token ids to ignore, unless you want to force the user to ignore only the PAD token.

Collaborator Author:
I went with a single id to be more efficient; typically we just want to skip the padding token.

        :param state: the sequence count and token count to continue from. Should be restored when training resumes:
            use get_state() to get the state and save it upon checkpointing.
        :param kwargs: additional super class arguments
        """
        super().__init__(
            seq_num="seq_num",  # collect log_probs - output of _count_seq_and_tokens_update
Collaborator:
Obsolete comments on this line and the following one.

Collaborator Author:
👍

token_num="token_num", # collect token_num - output of _count_seq_and_tokens_update
metric_per_batch_func=None,
metric_per_batch_func_pre_collect=partial(
_count_seq_and_tokens_update,
ignore_index=ignore_index,
encoder_input_key=encoder_input,
),
result_aggregate_func=self._count_seq_and_tokens_compute,
**kwargs,
)
if state is None:
self._state = {"seq_num": 0, "token_num": 0}
else:
assert "seq_num" in state
assert "token_num" in state
self._state = state

    def _count_seq_and_tokens_compute(
        self,
        seq_num: List[np.ndarray],
Collaborator:
seq_num will be a list of numpy arrays where each entry represents a batch? If so, how often is the metric computed? Each epoch?

I forgot these :)

Collaborator Author:
Each sub-epoch, and each entry is a batch.

        token_num: List[np.ndarray],
    ) -> float:
Collaborator:
This returns a dict.

Collaborator Author:
👍


        seq_num_total = sum(seq_num)
        token_num_total = sum(token_num)
        self._state["seq_num"] += seq_num_total
        self._state["token_num"] += token_num_total
        return copy(self._state)

    def get_state(self) -> dict:
        return copy(self._state)


def _count_seq_and_tokens_update(
    batch_dict: dict,
    encoder_input_key: str,
Collaborator:
encoder_input_key: Union[str, None]

or

encoder_input_key: Optional[str]
Collaborator Author:
It's a must. Why optional?

    ignore_index: Optional[int] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
Collaborator:
-> dict[str, Tensor]

Collaborator Author:
👍

"""Count number of sequences and tokens
Args:
encoder_input_key:
key to encoder_input
ignore_index:
Token not to count, typically padding
Returns:
dictionary with number of sequences and tokens
"""
encoder_input = batch_dict[encoder_input_key]

# to save GPU memory
encoder_input = encoder_input.detach()

if ignore_index is not None:
mask = encoder_input.ne(ignore_index)
else:
mask = torch.ones_like(encoder_input, dtype=torch.bool)

seq_num = torch.tensor(
mask.shape[0], dtype=torch.int64, device=encoder_input.device
)
token_num = mask.sum().to(dtype=torch.int64)

return {"seq_num": seq_num.unsqueeze(0), "token_num": token_num.unsqueeze(0)}


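For reference, a minimal sketch of what the new metric counts, driving the module-level helper directly on a toy batch. The key name "encoder_input" and the padding id are assumptions for illustration; in practice the metric is fed batches by fuse's evaluation mechanism rather than called by hand.

# Toy illustration (not part of the diff) of the counting semantics.
import torch

PAD = 0
batch_dict = {
    "encoder_input": torch.tensor(
        [
            [5, 7, 9, PAD, PAD],    # 3 non-pad tokens
            [3, 2, PAD, PAD, PAD],  # 2 non-pad tokens
        ]
    )
}

out = _count_seq_and_tokens_update(
    batch_dict, encoder_input_key="encoder_input", ignore_index=PAD
)
print(out["seq_num"])    # tensor([2]) - two sequences in the batch
print(out["token_num"])  # tensor([5]) - five non-pad tokens

# The metric itself would be constructed roughly as:
#   metric = MetricCountSeqAndTokens(encoder_input="encoder_input", ignore_index=PAD)
# Its running totals can be checkpointed via metric.get_state() and passed back
# through the `state` argument when training resumes.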