forked from DAMO-NLP-SG/Video-LLaMA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
video_llama_eval_withaudio.yaml
38 lines (32 loc) · 1.06 KB
/
video_llama_eval_withaudio.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
model:
arch: video_llama
model_type: pretrain_vicuna
freeze_vit: True
freeze_qformer: True
max_txt_len: 512
end_sym: "###"
low_resource: False
frozen_llama_proj: False
# If you want use LLaMA-2-chat,
# some ckpts could be download from our provided huggingface repo
# i.e. https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Finetuned
llama_model: "ckpt/vicuna-13b/" or "ckpt/vicuna-7b/" or "ckpt/llama-2-7b-chat-hf" or "ckpt/llama-2-13b-chat-hf"
imagebind_ckpt_path: "ckpt/imagebind_path/"
ckpt: 'path/pretrained_visual_branch_ckpt' # you can use our pretrained ckpt from https://huggingface.co/DAMO-NLP-SG/Video-LLaMA-2-13B-Pretrained/
ckpt_2: 'path/pretrained_audio_branch_ckpt'
equip_audio_branch: True # whether equips the audio branch
fusion_head_layers: 2
max_frame_pos: 32
fusion_header_type: "seqTransf"
datasets:
webvid:
vis_processor:
train:
name: "alpro_video_eval"
n_frms: 8
image_size: 224
text_processor:
train:
name: "blip_caption"
run:
task: video_text_pretrain