Fix the IndexError in get_response #618

Open · wants to merge 6 commits into base: main
94 changes: 94 additions & 0 deletions DataCollator.py
@@ -0,0 +1,94 @@
from dataclasses import dataclass
from typing import Any, Optional, Union

import numpy as np

from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy

@dataclass
class DataCollatorForSeq2Seq:
"""
Data collator that will dynamically pad the inputs received, as well as the labels.

Args:
tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
The tokenizer used for encoding the data.
model ([`PreTrainedModel`]):
The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
prepare the *decoder_input_ids*

This is useful when using *label_smoothing* to avoid calculating loss twice.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
among:

- `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
pad_to_multiple_of (`int`, *optional*):
If set will pad the sequence to a multiple of the provided value.

This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
7.5 (Volta).
label_pad_token_id (`int`, *optional*, defaults to -100):
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
"""

tokenizer: PreTrainedTokenizerBase
model: Optional[Any] = None
padding: Union[bool, str, PaddingStrategy] = True
max_length: Optional[int] = None
pad_to_multiple_of: Optional[int] = None
label_pad_token_id: int = -100
return_tensors: str = "pt"

def __call__(self, features, return_tensors=None):
if return_tensors is None:
return_tensors = self.return_tensors
labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
# We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
# same length to return tensors.
if labels is not None:
max_label_length = max(len(l) for l in labels)
if self.pad_to_multiple_of is not None:
max_label_length = (
(max_label_length + self.pad_to_multiple_of - 1)
// self.pad_to_multiple_of
* self.pad_to_multiple_of
)
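                # round the longest label length up to the next multiple, e.g. pad_to_multiple_of=8 turns a longest label of 13 into 16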

padding_side = self.tokenizer.padding_side
for feature in features:
remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
if isinstance(feature["labels"], list):
feature["labels"] = (
feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
)
elif padding_side == "right":
feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
else:
feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)

features = self.tokenizer.pad(
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors=return_tensors,
)

# prepare decoder_input_ids
if (
labels is not None
and self.model is not None
and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
):
decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
features["decoder_input_ids"] = decoder_input_ids


return features
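
A minimal usage sketch of this collator, assuming a seq2seq checkpoint such as `t5-small` and toy pre-tokenized features; the model name and token ids below are illustrative, not taken from this repo:

# Sketch only: model/tokenizer names and token ids are assumptions for illustration.
from torch.utils.data import DataLoader
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,                 # enables decoder_input_ids preparation from the padded labels
    padding=True,                # pad to the longest sequence in each batch
    label_pad_token_id=-100,     # ignored by PyTorch loss functions
    return_tensors="pt",
)

# each feature is a dict of python lists, e.g. produced by tokenizer(...) on one example
features = [
    {"input_ids": [37, 423, 5], "attention_mask": [1, 1, 1], "labels": [1820, 1]},
    {"input_ids": [100, 19, 55, 1], "attention_mask": [1, 1, 1, 1], "labels": [4273, 55, 1]},
]
loader = DataLoader(features, batch_size=2, collate_fn=collator)
batch = next(iter(loader))       # padded tensors: input_ids, attention_mask, labels, decoder_input_ids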
12 changes: 12 additions & 0 deletions README.md
@@ -1,3 +1,15 @@

Forked from Alpaca-LoRA, with some parts of the code modified for my customized usage.

- Add batched evaluation.
- Add metric calculation.
- Fix the error from `torch.compile` and the PEFT APIs; see [issue](https://github.com/tloen/alpaca-lora/issues/609).
- ...





# 🦙🌲🤏 Alpaca-LoRA

- 🤗 **Try the pretrained model out [here](https://huggingface.co/spaces/tloen/alpaca-lora), courtesy of a GPU grant from Huggingface!**
133 changes: 133 additions & 0 deletions calculate_metrics.py
@@ -0,0 +1,133 @@
import argparse
import copy
import json
import os

import numpy as np

from calculate_metrics_src import compute_grouped_metrics_v2, compute_metrics


def process_superni(superni_preds, superni_meta):
assert len(superni_preds) == len(superni_meta), "The length of the predictions {} and the metadata {} should be the same".format(len(superni_preds), len(superni_meta))
final_res = []
for pred, meta in zip(superni_preds, superni_meta):
        # ensure the predictions and the metadata are in the same order
assert pred["input"] == meta["input"], "The input of the prediction {} and the metadata {} should be the same".format(pred["input"], meta["input"])
assert pred["instruction"] == meta["instruction"], "The instruction of the prediction {} and the metadata {} should be the same".format(pred["instruction"], meta["instruction"])
item = copy.deepcopy(meta)
item["response"] = pred["response"]
final_res.append(item)

return final_res

def calculate_metrics(all_results, save_path=None, save_prefix=None):
instructions, inputs, outputs, responses = [], [], [], []
    categories = []
for result in all_results:
instruction = result["instruction"]
input = result["input"]
output = result["output"]
response = result["response"]
if "categories" in result:
# superni
            categories.append(result["categories"])
            assert isinstance(output, list), "The output of superni should be a list, but got {}, save_prefix: {}".format(output, save_prefix)
outputs.append(output) # the output of the superni is already a list
else:
# p3, mmlu, bbh
            assert isinstance(output, str), "The output of p3, mmlu, and bbh should be a string (only superni is a list), but got {}, save_prefix: {}".format(output, save_prefix)
            outputs.append([output])  # wrap the single reference in a list because `metric_max_over_ground_truths` expects a list of references per example

instructions.append(instruction)
inputs.append(input)
responses.append(response)

# calculate the metrics
    if len(categories) == 0:
        categories = None

    metrics = compute_ni_metrics(responses, outputs, instructions, inputs, categories=categories, save_path=save_path, save_prefix=save_prefix)

return metrics
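
For reference, a sketch of the record shape `calculate_metrics` expects; the field values below are invented for illustration, only the keys mirror what the code reads:

# hypothetical example records; only the keys match what calculate_metrics uses
p3_style_record = {
    "instruction": "Answer the question.",
    "input": "What is the capital of France?",
    "output": "Paris",                     # a single string for p3/mmlu/bbh
    "response": "Paris",
}
superni_style_record = {
    "instruction": "Classify the sentiment.",
    "input": "I loved this movie.",
    "output": ["positive"],                # superni references are already a list
    "response": "positive",
    "categories": ["Sentiment Analysis"],  # used to compute the grouped metrics
}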


def compute_ni_metrics(preds: list, references: list, instructions: list, inputs: list, categories=None, save_prefix=None, save_path=None):
decoded_preds = preds
result = compute_metrics(predictions=decoded_preds, references=references)
categories = ["_".join(it[0].lower().split()) for it in categories] if categories is not None else None
if categories is not None:
result_per_category = compute_grouped_metrics_v2(predictions=decoded_preds, references=references, groups=categories)
result.update(result_per_category)
    prediction_lens = [len(pred.split()) for pred in decoded_preds]  # unlike the original code, gen_len here is measured in words rather than tokens
result["gen_len"] = np.mean(prediction_lens)
result = {k: round(v, 4) for k, v in result.items()}

assert save_path is not None and save_prefix is not None, "The save_path and save_prefix should not be None"

if save_path is not None and save_prefix is not None:
with open(os.path.join(save_path, f"{save_prefix}_eval_predictions.jsonl"), "w") as fout:
for instruction, input, output, pred in zip(instructions, inputs, references, decoded_preds):
fout.write(json.dumps({
"Definition": instruction,
"Input": input,
"Output": output,
"Prediction": pred
}) + "\n")
        # save the scores
        with open(os.path.join(save_path, f"{save_prefix}_eval_scores.json"), "w") as fout:
            json.dump(result, fout, indent=4)

    return result



def main():
parser = argparse.ArgumentParser()
    parser.add_argument("--results_path", type=str, default="./alpaca_2")

args, unparsed = parser.parse_known_args()
if unparsed:
raise ValueError(unparsed)


    # read and calculate the metrics on all four benchmarks
    # superni
    # note that superni also needs to read the 'superni_test_11810_eval_usage.json' file,
    # because the categories in that file are required to calculate the grouped metrics
with open(os.path.join(args.results_path, "superni.json"), "r") as fin:
superni_preds = json.load(fin)

with open("/data/rml6079/projects/muffin_llama/alpaca-lora/eval_benchmarks/superni_test_11810_eval_usage.json", "r") as fin:
superni_meta = json.load(fin)

    # combine these two files to get the final list used to calculate the metrics.
    # the only difference in superni_results is that it has a response field, and its output is a list instead of a string
superni_results = process_superni(superni_preds, superni_meta)
calculate_metrics(superni_results, save_path=args.results_path, save_prefix="superni")
print("superni done")

# p3
with open(os.path.join(args.results_path, "p3.json"), "r") as fin:
p3_results = json.load(fin)
calculate_metrics(p3_results, save_path=args.results_path, save_prefix="p3")
print("p3 done")

# mmlu
with open(os.path.join(args.results_path, "mmlu.json"), "r") as fin:
mmlu_results = json.load(fin)
calculate_metrics(mmlu_results, save_path=args.results_path, save_prefix="mmlu")
print("mmlu done")

# bbh
with open(os.path.join(args.results_path, "bbh.json"), "r") as fin:
bbh_results = json.load(fin)
calculate_metrics(bbh_results, save_path=args.results_path, save_prefix="bbh")
print("bbh done")







if __name__ == "__main__":
main()
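
The script can then be run against a results directory, e.g. `python calculate_metrics.py --results_path ./alpaca_2`, where the directory is expected to contain superni.json, p3.json, mmlu.json, and bbh.json; note that the superni metadata file above is read from a hardcoded path.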