Commit 45ec5a9

prepare experiments
stolzenp committed May 5, 2024
1 parent c1a53f6
Showing 12 changed files with 327 additions and 11 deletions.
@@ -0,0 +1,90 @@
import numpy as np
import datetime
import os
from transformers import AutoTokenizer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, Trainer
from datasets import load_dataset
import evaluate

generated_dataset = load_dataset("stolzenp/500-cola-sentences-baseline", split="train")
test_split = load_dataset("glue", "cola", split="test")

model_name = "bert-base-uncased"

label2id = {"correct": 1, "wrong": 0, "acceptable": 1, "unacceptable": 0}
id2label = {1: "acceptable", 0: "unacceptable"}

# setup preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_text(batch):
    # GLUE CoLA stores its sentences in the "sentence" column
    preprocessed_tokens = tokenizer(batch["sentence"], truncation=True, padding=True)
    return preprocessed_tokens


def preprocess_text_and_labels(batch):
    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
    preprocessed_tokens["label"] = [label2id[label] for label in batch["label"]]
    return preprocessed_tokens


# setup compute_metrics
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


generated_dataset = generated_dataset.train_test_split(test_size=0.1)
train_split = generated_dataset["train"]
val_split = generated_dataset["test"]

tokenized_train = train_split.map(preprocess_text_and_labels, batched=True)
tokenized_val = val_split.map(preprocess_text_and_labels, batched=True)
tokenized_test = test_split.map(preprocess_text, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

# initialize training
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="baseline_downstream_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
outputs = trainer.predict(tokenized_test)
test_accuracy = outputs[2]["test_accuracy"]

results_file = "results.txt"
experiments_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
results_path = f"{experiments_directory}/{results_file}"

results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
    file.write(f"{results_timestamp} - cola_baseline\n")
    file.write(f"accuracy: {test_accuracy}\n")
@@ -0,0 +1,28 @@
from haystack.nodes import PromptNode
from fabricator import DatasetGenerator
from fabricator.prompts import BasePrompt

label_options = ["correct", "wrong"]

prompt = BasePrompt(
    task_description="Generate a grammatically {} sentence similar to this:",
    label_options=label_options,
    target_formatting_template="",
)

prompt_node = PromptNode(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
    max_length=50,
    model_kwargs={
        "model_kwargs": {"do_sample": True, "temperature": 0.5, "top_p": 0.9}
    },
)

generator = DatasetGenerator(prompt_node)
generated_dataset = generator.generate(
    prompt_template=prompt,
    max_prompt_calls=500,
    num_samples_to_generate=500,
)

generated_dataset.push_to_hub("500-cola-sentences-baseline")
@@ -1,3 +1,81 @@
import numpy as np
import datetime
import os
from transformers import AutoTokenizer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, Trainer
from datasets import load_dataset
import evaluate

gold_dataset = load_dataset("glue", "cola", split="train")
train_split = load_dataset("glue", "cola", split="train")
val_split = load_dataset("glue", "cola", split="validation")
test_split = load_dataset("glue", "cola", split="test")

model_name = "bert-base-uncased"

label2id = {"correct": 1, "wrong": 0, "acceptable": 1, "unacceptable": 0}
id2label = {1: "acceptable", 0: "unacceptable"}

# setup preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_text(batch):
    # GLUE CoLA stores its sentences in the "sentence" column
    preprocessed_tokens = tokenizer(batch["sentence"], truncation=True, padding=True)
    return preprocessed_tokens


# setup compute_metrics
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


tokenized_train = train_split.map(preprocess_text, batched=True)
tokenized_val = val_split.map(preprocess_text, batched=True)
tokenized_test = test_split.map(preprocess_text, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

# initialize training
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="gold_downstream_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
outputs = trainer.predict(tokenized_test)
test_accuracy = outputs[2]["test_accuracy"]

results_file = "results.txt"
experiments_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
results_path = f"{experiments_directory}/{results_file}"

results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
    file.write(f"{results_timestamp} - cola_gold\n")
    file.write(f"accuracy: {test_accuracy}\n")
@@ -0,0 +1,90 @@
import numpy as np
import datetime
import os
from transformers import AutoTokenizer, TrainingArguments
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, Trainer
from datasets import load_dataset
import evaluate

generated_dataset = load_dataset("stolzenp/500-cola-sentences-smt", split="train")
test_split = load_dataset("glue", "cola", split="test")

model_name = "bert-base-uncased"

label2id = {"correct": 1, "wrong": 0, "acceptable": 1, "unacceptable": 0}
id2label = {1: "acceptable", 0: "unacceptable"}

# setup preprocessing
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_text(batch):
    # GLUE CoLA stores its sentences in the "sentence" column
    preprocessed_tokens = tokenizer(batch["sentence"], truncation=True, padding=True)
    return preprocessed_tokens


def preprocess_text_and_labels(batch):
    preprocessed_tokens = tokenizer(batch["text"], truncation=True, padding=True)
    preprocessed_tokens["label"] = [label2id[label] for label in batch["label"]]
    return preprocessed_tokens


# setup compute_metrics
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


generated_dataset = generated_dataset.train_test_split(test_size=0.1)
train_split = generated_dataset["train"]
val_split = generated_dataset["test"]

tokenized_train = train_split.map(preprocess_text_and_labels, batched=True)
tokenized_val = val_split.map(preprocess_text_and_labels, batched=True)
tokenized_test = test_split.map(preprocess_text, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

# initialize training
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="smt_downstream_model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
outputs = trainer.predict(tokenized_test)
test_accuracy = outputs[2]["test_accuracy"]

results_file = "results.txt"
experiments_directory = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
results_path = f"{experiments_directory}/{results_file}"

results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
    file.write(f"{results_timestamp} - cola_smt\n")
    file.write(f"accuracy: {test_accuracy}\n")
@@ -0,0 +1,30 @@
from haystack.nodes import PromptNode
from fabricator import DatasetGenerator
from fabricator.prompts import BasePrompt

label_options = ["correct", "wrong"]

prompt = BasePrompt(
    task_description="Generate a grammatically {} sentence similar to this:",
    label_options=label_options,
    target_formatting_template="",
)

prompt_node = PromptNode(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
    max_length=50,
    model_kwargs={
        "model_kwargs": {"do_sample": True, "temperature": 0.5, "top_p": 0.9}
    },
)

generator = DatasetGenerator(prompt_node)
generated_dataset = generator.generate(
    prompt_template=prompt,
    max_prompt_calls=500,
    num_samples_to_generate=500,
    # train a small text-classification model on the generated data every 50 generation steps
    small_model_training="text_classification",
    train_small_model_every_X_generations=50,
)

generated_dataset.push_to_hub("500-cola-sentences-smt")
@@ -88,5 +88,5 @@ def compute_metrics(eval_pred):
results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - imdb_baseline_downstream\n")
+    file.write(f"{results_timestamp} - imdb_baseline\n")
    file.write(f"accuracy: {test_accuracy}\n")
@@ -79,5 +79,5 @@ def compute_metrics(eval_pred):
results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - imdb_gold_downstream\n")
+    file.write(f"{results_timestamp} - imdb_gold\n")
    file.write(f"accuracy: {test_accuracy}\n")
@@ -86,5 +86,5 @@ def compute_metrics(eval_pred):
results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - imdb_smt_downstream\n")
+    file.write(f"{results_timestamp} - imdb_smt\n")
    file.write(f"accuracy: {test_accuracy}\n")
@@ -2,19 +2,19 @@
from fabricator import DatasetGenerator
from fabricator.prompts import BasePrompt

label_options = ["positive", "negative"]
label_options = ["correct", "wrong"]

prompt = BasePrompt(
-    task_description="Generate an excerpt from a {} movie review similar to these:",
+    task_description="Generate a grammatically {} sentence similar to this:",
    label_options=label_options,
    target_formatting_template="",
)

prompt_node = PromptNode(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.1",
-    max_length=100,
+    max_length=50,
    model_kwargs={
-        "model_kwargs": {"do_sample": True, "temperature": 0.4, "top_p": 0.9}
+        "model_kwargs": {"do_sample": True, "temperature": 0.5, "top_p": 0.9}
    },
)
@@ -88,5 +88,5 @@ def compute_metrics(eval_pred):
results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - sst2_baseline_downstream\n")
+    file.write(f"{results_timestamp} - sst2_baseline\n")
    file.write(f"accuracy: {test_accuracy}\n")
@@ -77,5 +77,5 @@ def compute_metrics(eval_pred):
results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - sst2_gold_downstream\n")
+    file.write(f"{results_timestamp} - sst2_gold\n")
    file.write(f"accuracy: {test_accuracy}\n")
@@ -86,5 +86,5 @@ def compute_metrics(eval_pred):
results_timestamp = datetime.datetime.now()

with open(results_path, "a") as file:
-    file.write(f"{results_timestamp} - sst2_smt_downstream\n")
+    file.write(f"{results_timestamp} - sst2_smt\n")
    file.write(f"accuracy: {test_accuracy}\n")
