-
Notifications
You must be signed in to change notification settings - Fork 206
/
preparing_your_dataset.py
34 lines (25 loc) · 1.18 KB
/
preparing_your_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import json
from datasets import Dataset, DatasetDict
# Convert the alpaca JSON dataset to HF format
# Only HuggingFace datasets are currently supported, so the Alpaca JSON dataset
# must first be converted to the HuggingFace format. For instruction fine-tuning, the resulting HF dataset needs three columns: instruction, text, and target.
def preprocess_alpaca_json_data(
    alpaca_dataset_path: str, output_path: str = "./alpaca_data"
) -> None:
    """Convert an Alpaca-style JSON file into a HuggingFace dataset saved on disk.

    You can download the Alpaca dataset here:
    https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json

    Each record's ``instruction``, ``input`` and ``output`` fields are copied
    into the ``instruction``, ``text`` and ``target`` columns of a single
    ``train`` split, which is then written to ``output_path``.

    :param alpaca_dataset_path: path of the Alpaca JSON dataset
    :param output_path: directory the converted dataset is saved to
    :raises KeyError: if a record lacks ``instruction``, ``input`` or ``output``
    """
    # Use a context manager so the file handle is closed even if parsing
    # fails, and read as UTF-8 rather than the platform default encoding.
    with open(alpaca_dataset_path, encoding="utf-8") as json_file:
        alpaca_data = json.load(json_file)

    # Column names are the ones expected for instruction fine-tuning.
    data_dict = {
        "train": {
            "instruction": [record["instruction"] for record in alpaca_data],
            "text": [record["input"] for record in alpaca_data],
            "target": [record["output"] for record in alpaca_data],
        }
    }

    dataset = DatasetDict()
    # Build one HF `Dataset` per split from the plain column dictionaries.
    for split_name, columns in data_dict.items():
        dataset[split_name] = Dataset.from_dict(columns)

    dataset.save_to_disk(output_path)