Commit

feat: improve script
asawczyn committed Apr 3, 2024
1 parent 73d8b43 commit 2fbffda
Showing 2 changed files with 9 additions and 7 deletions.
8 changes: 4 additions & 4 deletions requirements.txt
@@ -8,10 +8,10 @@ tenacity==8.2.3
 loguru==0.7.2
 typer==0.9.0
 datasets==2.18.0
-transformers==4.38.2
-torch==2.2.1
-peft==0.9.0
-trl==0.7.11
+transformers==4.39.3
+torch==2.2.2
+peft==0.10.0
+trl==0.8.1
 chardet==5.2.0
 bitsandbytes==0.43.0
 tensorboard==2.16.2
8 changes: 5 additions & 3 deletions scripts/check_memory_max_sequence.py
@@ -47,7 +47,7 @@ def main(

 def get_dataset(dataset) -> DatasetDict | Dataset | IterableDatasetDict | IterableDataset:
     if dataset == "dummy":
-        data = {"text": ["text"] * 1_000}
+        data = {"text": ["text " * 100] * 50000}
         dataset = Dataset.from_dict(data)
     else:
         dataset = load_dataset(dataset, split="train")
@@ -75,6 +75,8 @@ def get_model_and_tokenizer(
     )
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
     tokenizer.padding_side = "right"  # to prevent warnings
+    tokenizer.pad_token = tokenizer.eos_token
+    model.resize_token_embeddings(len(tokenizer))

     return model, tokenizer

@@ -120,15 +122,15 @@ def get_trainer(
         report_to="wandb",  # report metrics to wandb
     )

-    # max_seq_length = 1512  # max sequence length for model and packing of the dataset
+    max_seq_length = 1000  # max sequence length for model and packing of the dataset

     trainer = SFTTrainer(
         model=model,
         args=args,
         train_dataset=dataset,
         dataset_text_field=dataset_text_field,
         peft_config=peft_config,
-        # max_seq_length=max_seq_length,
+        max_seq_length=max_seq_length,
         tokenizer=tokenizer,
         packing=True,
         dataset_kwargs={
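For context, a minimal, self-contained sketch of how the changed pieces fit together (the eos token reused as pad token and the explicit max_seq_length passed to SFTTrainer), assuming the trl==0.8.1 and transformers==4.39.3 APIs pinned above; the model name, output directory, and training arguments are placeholders rather than values from the script.

# Sketch only: "gpt2", "out", and the training arguments are placeholders;
# the real script builds these in get_model_and_tokenizer() and get_trainer().
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer

# Dummy dataset shaped like the one in get_dataset(): many repeated long strings.
data = {"text": ["text " * 100] * 50000}
dataset = Dataset.from_dict(data)

model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.padding_side = "right"            # to prevent warnings
tokenizer.pad_token = tokenizer.eos_token   # causal LMs often ship without a pad token
model.resize_token_embeddings(len(tokenizer))

args = TrainingArguments(output_dir="out", per_device_train_batch_size=1, report_to="none")

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=1000,    # set explicitly, matching the change above
    tokenizer=tokenizer,
    packing=True,
)
trainer.train()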
