# eval_configs/video_llama_eval_withaudio.yaml

# Model settings: architecture, frozen components, checkpoint locations and
# audio-branch / fusion options.
model:
  arch: video_llama
  model_type: pretrain_vicuna
  freeze_vit: true
  freeze_qformer: true
  max_txt_len: 512
  end_sym: "###"  # must stay quoted: a bare ### would be read as a comment
  low_resource: false

  frozen_llama_proj: false

  # Path to the LLM weights. YAML allows exactly ONE value here; replace it
  # with the directory of the checkpoint you actually use. The original
  # template listed these alternatives:
  #   "ckpt/vicuna-7b/", "ckpt/llama-2-7b-chat-hf", "ckpt/llama-2-13b-chat-hf"
  # If you want to use LLaMA-2-chat, some checkpoints can be downloaded from
  # our Hugging Face repo, e.g.
  #   https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Finetuned
  # NOTE(review): model_type above may need to match the chosen LLM - confirm.
  llama_model: "ckpt/vicuna-13b/"
  imagebind_ckpt_path: "ckpt/imagebind_path/"
  # Visual-branch checkpoint; a pretrained one is available at
  #   https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained/
  ckpt: 'path/pretrained_visual_branch_ckpt'
  # Audio-branch checkpoint.
  ckpt_2: 'path/pretrained_audio_branch_ckpt'

  equip_audio_branch: true  # whether to equip the audio branch
  fusion_head_layers: 2
  max_frame_pos: 32
  fusion_header_type: "seqTransf"


# Dataset-side processor configuration, keyed by dataset name.
datasets:
  webvid:
    # Visual preprocessing applied to the input video.
    vis_processor:
      train:
        name: 'alpro_video_eval'
        n_frms: 8        # presumably frames sampled per video - confirm
        image_size: 224  # presumably the square input resolution - confirm
    # Text preprocessing applied to captions / prompts.
    text_processor:
      train:
        name: 'blip_caption'

# Run-level settings.
run:
  # Name of the task to execute; presumably resolved by the project's task
  # registry - confirm against the consumer.
  task: video_text_pretrain