-
Notifications
You must be signed in to change notification settings - Fork 71
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Tested on 910B with MS2.3.1.
- Loading branch information
Showing
30 changed files
with
5,056 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# ShareGPT4V: Improving Large Multi-modal Models with Better Captions | ||
|
||
[Paper](https://arxiv.org/pdf/2311.12793.pdf) | ||
|
||
[Official Repo](https://github.com/ShareGPT4Omni/ShareGPT4V) | ||
|
||
![Image](https://raw.githubusercontent.com/ShareGPT4V/ShareGPT4V-Resources/master/images/teaser.png) | ||
|
||
|
||
## Inference | ||
|
||
|
||
1. Prepare weight files: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .model import Share4VLlamaForCausalLM |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
{ | ||
"_name_or_path": "MS_ShareGPT4V-7B", | ||
"architectures": [ | ||
"Share4VLlamaForCausalLM" | ||
], | ||
"bos_token_id": 1, | ||
"eos_token_id": 2, | ||
"freeze_mm_mlp_adapter": false, | ||
"hidden_act": "silu", | ||
"hidden_size": 4096, | ||
"image_aspect_ratio": "pad", | ||
"image_grid_pinpoints": null, | ||
"initializer_range": 0.02, | ||
"intermediate_size": 11008, | ||
"max_position_embeddings": 4096, | ||
"mm_hidden_size": 1024, | ||
"mm_projector_lr": null, | ||
"mm_projector_type": "mlp2x_gelu", | ||
"mm_use_im_patch_token": false, | ||
"mm_use_im_start_end": false, | ||
"mm_vision_select_feature": "patch", | ||
"mm_vision_select_layer": -2, | ||
"mm_vision_tower": "/root/congw/project/ms_ShareGPT4V/share4v/configs/vit/", | ||
"mm_vision_tower_path":"/root/congw/project/ms_ShareGPT4V/share4v/configs/vit/vit-large336-l12.ckpt", | ||
"model_type": "share4v", | ||
"num_attention_heads": 32, | ||
"num_hidden_layers": 32, | ||
"num_key_value_heads": 32, | ||
"pad_token_id": 0, | ||
"pretraining_tp": 1, | ||
"rms_norm_eps": 1e-05, | ||
"rope_scaling": null, | ||
"tie_word_embeddings": false, | ||
"dtype": "float32", | ||
"transformers_version": "4.31.0", | ||
"tune_entire_model": false, | ||
"tune_mm_mlp_adapter": false, | ||
"tune_vision_tower": false, | ||
"use_cache": true, | ||
"use_mm_proj": true, | ||
"vision_tower_lr": null, | ||
"vocab_size": 32000, | ||
"output_attentions": false, | ||
"output_hidden_states": false, | ||
"use_return_dict": true | ||
} |
36 changes: 36 additions & 0 deletions
36
examples/sharegpt_4v/share4v/configs/tokenizer_config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
{ | ||
"add_bos_token": true, | ||
"add_eos_token": false, | ||
"bos_token": { | ||
"__type": "AddedToken", | ||
"content": "<s>", | ||
"lstrip": false, | ||
"normalized": false, | ||
"rstrip": false, | ||
"single_word": false | ||
}, | ||
"clean_up_tokenization_spaces": false, | ||
"eos_token": { | ||
"__type": "AddedToken", | ||
"content": "</s>", | ||
"lstrip": false, | ||
"normalized": false, | ||
"rstrip": false, | ||
"single_word": false | ||
}, | ||
"legacy": false, | ||
"model_max_length": 2048, | ||
"pad_token": null, | ||
"padding_side": "right", | ||
"sp_model_kwargs": {}, | ||
"tokenizer_class": "LlamaTokenizer", | ||
"unk_token": { | ||
"__type": "AddedToken", | ||
"content": "<unk>", | ||
"lstrip": false, | ||
"normalized": false, | ||
"rstrip": false, | ||
"single_word": false | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
{ | ||
"_name_or_path": "ShareGPT4V-7B_Pretrained_vit-large336-l12", | ||
"architectures": [ | ||
"CLIPVisionModel" | ||
], | ||
"attention_dropout": 0.0, | ||
"dropout": 0.0, | ||
"hidden_act": "quick_gelu", | ||
"hidden_size": 1024, | ||
"image_size": 336, | ||
"initializer_factor": 1.0, | ||
"initializer_range": 0.02, | ||
"intermediate_size": 4096, | ||
"layer_norm_eps": 1e-05, | ||
"model_type": "clip_vision_model", | ||
"num_attention_heads": 16, | ||
"num_channels": 3, | ||
"num_hidden_layers": 24, | ||
"patch_size": 14, | ||
"projection_dim": 768, | ||
"torch_dtype": "bfloat16", | ||
"transformers_version": "4.31.0" | ||
} |
29 changes: 29 additions & 0 deletions
29
examples/sharegpt_4v/share4v/configs/vit/preprocessor_config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
{ | ||
"crop_size": { | ||
"height": 336, | ||
"width": 336 | ||
}, | ||
"do_center_crop": true, | ||
"do_convert_rgb": true, | ||
"do_normalize": true, | ||
"do_rescale": true, | ||
"do_resize": true, | ||
"feature_extractor_type": "CLIPFeatureExtractor", | ||
"image_mean": [ | ||
0.48145466, | ||
0.4578275, | ||
0.40821073 | ||
], | ||
"image_processor_type": "CLIPImageProcessor", | ||
"image_std": [ | ||
0.26862954, | ||
0.26130258, | ||
0.27577711 | ||
], | ||
"resample": 3, | ||
"rescale_factor": 0.00392156862745098, | ||
"size": { | ||
"shortest_edge": 336 | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
CONTROLLER_HEART_BEAT_EXPIRATION = 30 | ||
WORKER_HEART_BEAT_INTERVAL = 15 | ||
|
||
LOGDIR = "." | ||
|
||
# Model Constants | ||
IGNORE_INDEX = -100 | ||
IMAGE_TOKEN_INDEX = -200 | ||
DEFAULT_IMAGE_TOKEN = "<image>" | ||
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" | ||
DEFAULT_IM_START_TOKEN = "<im_start>" | ||
DEFAULT_IM_END_TOKEN = "<im_end>" |
Oops, something went wrong.