# recipes/configs/llama2/generation_v2.yaml

# Config for running the InferenceRecipe in generate_v2.py to generate output from an LLM
#
# This config assumes that you've run the following command before launching:
#   tune download meta-llama/Llama-2-7b-chat-hf --output-dir /tmp/Llama-2-7b-chat-hf --ignore-patterns "*.safetensors"
#
# To launch, run the following command:
#    tune run dev/generate_v2 --config llama2/generation_v2

# Top-level output directory; checkpointer.output_dir below picks this value up
# through the ${output_dir} placeholder.
output_dir: ./ # Not needed

# Model arguments
# `_component_` gives the dotted path of the model builder (here llama2_7b).
# NOTE(review): presumably the recipe instantiates it from this path — confirm.
model:
  _component_: torchtune.models.llama2.llama2_7b

# Transform arguments
# The tokenizer file is expected inside the /tmp/Llama-2-7b-chat-hf directory
# that the `tune download` command in the file header writes to.
tokenizer:
  _component_: torchtune.models.llama2.llama2_tokenizer
  path: /tmp/Llama-2-7b-chat-hf/tokenizer.model
  # NOTE(review): presumably the maximum tokenized sequence length — confirm
  # against the llama2_tokenizer builder.
  max_seq_len: 2048

# Checkpointer
# Points at the two sharded .bin weight files inside the directory that the
# `tune download` command in the file header writes to.
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-2-7b-chat-hf
  checkpoint_files:
    - pytorch_model-00001-of-00002.bin
    - pytorch_model-00002-of-00002.bin
  # Reuses the top-level output_dir value through interpolation.
  output_dir: ${output_dir}
  model_type: LLAMA2

# Device and runtime settings
# NOTE(review): the meanings below are inferred from the key names — confirm
# against the recipe that consumes this config.
device: cuda
# Presumably the numeric precision used for the model (bf16 = bfloat16).
dtype: bf16
# Presumably a fixed RNG seed so that sampling is reproducible.
seed: 1234
log_level: INFO

# Generation arguments
# The prompt is a mapping keyed by role: one `system` entry and one `user` entry.
prompt:
  system: You are a helpful and creative AI assistant.
  user: What is the capital of France?
# NOTE(review): presumably an upper bound on the number of generated tokens —
# confirm in the recipe.
max_new_tokens: 200
temperature: 0.6 # 0.8 and 0.6 are popular values to try
# NOTE(review): presumably the top-k sampling cutoff — confirm in the recipe.
top_k: 300