diff --git a/.gitignore b/.gitignore
index 1fb749b..3468ae0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,4 +161,5 @@ cython_debug/
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
-.DS_Store
\ No newline at end of file
+.DS_Store
+.vscode/
\ No newline at end of file
diff --git a/codetf/performance/evaluation_metric.py b/codetf/performance/evaluation_metric.py
index afce7c2..917e747 100644
--- a/codetf/performance/evaluation_metric.py
+++ b/codetf/performance/evaluation_metric.py
@@ -4,17 +4,26 @@ from sklearn.metrics import f1_score, precision_score, recall_score
 
 from transformers import EvalPrediction
 
+
 class EvaluationMetric:
     def __init__(self, metric, tokenizer):
         self.metric = metric
         self.tokenizer = tokenizer
 
     def compute_metrics(self, eval_pred: EvalPrediction):
-        predictions = self.tokenizer.batch_decode(eval_pred.predictions, skip_special_tokens=True)
-        references = self.tokenizer.batch_decode(eval_pred.label_ids, skip_special_tokens=True)
+        predictions = self.tokenizer.batch_decode(
+            eval_pred.predictions, skip_special_tokens=True
+        )
+        references = self.tokenizer.batch_decode(
+            eval_pred.label_ids, skip_special_tokens=True
+        )
 
         if self.metric == "bleu":
             return {"bleu": sacrebleu.corpus_bleu(predictions, [references]).score}
+        elif self.metric == "chrf":
+            return {"chrf": sacrebleu.corpus_chrf(predictions, [references]).score}
+        elif self.metric == "ter":
+            return {"ter": sacrebleu.corpus_ter(predictions, [references]).score}
         elif self.metric == "f1":
             return {"f1": self.compute_f1_score(predictions, references)}
         elif self.metric == "precision":
@@ -31,22 +40,24 @@ def compute_metrics(self, eval_pred: EvalPrediction):
 
     def compute_f1_score(self, hypotheses, references):
         # Calculate F1 score for your use case, this is just a sample
-        return f1_score(hypotheses, references, average='weighted')
+        return f1_score(hypotheses, references, average="weighted")
 
     def compute_precision_score(self, hypotheses, references):
         # Calculate precision score for your use case, this is just a sample
-        return precision_score(hypotheses, references, average='weighted')
+        return precision_score(hypotheses, references, average="weighted")
 
     def compute_recall_score(self, hypotheses, references):
         # Calculate recall score for your use case, this is just a sample
-        return recall_score(hypotheses, references, average='weighted')
+        return recall_score(hypotheses, references, average="weighted")
 
     def compute_rouge(self, hypotheses, references):
-        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
+        scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
         scores = [scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)]
-        rouge1 = sum([score['rouge1'].fmeasure for score in scores]) / len(scores)
-        rougeL = sum([score['rougeL'].fmeasure for score in scores]) / len(scores)
+        rouge1 = sum([score["rouge1"].fmeasure for score in scores]) / len(scores)
+        rougeL = sum([score["rougeL"].fmeasure for score in scores]) / len(scores)
         return {"rouge1": rouge1, "rougeL": rougeL}
 
     def compute_meteor(self, hypotheses, references):
-        return sum([meteor_score([ref], hyp) for ref, hyp in zip(references, hypotheses)]) / len(hypotheses)
+        return sum(
+            [meteor_score([ref], hyp) for ref, hyp in zip(references, hypotheses)]
+        ) / len(hypotheses)
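
For context on what the new branches compute: `sacrebleu.corpus_chrf` and `sacrebleu.corpus_ter` are called exactly like the existing `corpus_bleu`, taking decoded hypothesis strings first and a list of reference streams second, and exposing a `.score` attribute. Below is a minimal sketch of those calls run on hand-written strings; the sample sentences are illustrative only and not taken from the repository or its tests.

```python
import sacrebleu

# Decoded model outputs and references, i.e. the kind of strings
# compute_metrics would hold after tokenizer.batch_decode.
# (Sample data for illustration only.)
predictions = ["def add(a, b):\n    return a + b"]
references = ["def add(x, y):\n    return x + y"]

# Same call shape as the bleu/chrf/ter branches in EvaluationMetric:
# hypotheses first, then a list of reference streams.
bleu = sacrebleu.corpus_bleu(predictions, [references]).score
chrf = sacrebleu.corpus_chrf(predictions, [references]).score
ter = sacrebleu.corpus_ter(predictions, [references]).score

print({"bleu": bleu, "chrf": chrf, "ter": ter})
```

Note that TER is an edit-rate metric (lower is better), while BLEU and chrF are similarity scores (higher is better), so callers comparing models across metrics should keep the direction in mind.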