-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Implement CountGD multi-modal counting app, with focus on small…
… object detection Implements the CountGD application as described in Amini-Naieni et al. (NeurIPS 2024).https://github.com/niki-amini-naieni/CountGD
- Loading branch information
1 parent
ff8a3cd
commit 3debde1
Showing
105 changed files
with
80,763 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Image for the CountGD Gradio demo.
#
# Base: NVIDIA CUDA 12.1 / cuDNN 8 "devel" image. The Ubuntu 22.04 variant is
# used because the package step below relies on apt-get and Debian package
# names, which the UBI8 (RHEL-family) variant does not provide.
# The base image ships no Python interpreter, so Python and pip are installed
# explicitly in the package step.
# For a reproducible build, pin a digest at build time:
#   docker build --build-arg BASE_IMAGE=<image>@sha256:<digest> .
ARG BASE_IMAGE=docker.io/nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
FROM ${BASE_IMAGE}

# Set the working directory in the container
WORKDIR /usr/src/app

# Install the toolchain, media/X11 libraries and the Python runtime.
# - DEBIAN_FRONTEND is set inline so it does not leak into the runtime env.
# - python-is-python3 provides the `python` command used by later steps.
# - python3-dev supplies the headers needed to compile the extension below.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        ffmpeg \
        gcc-11 \
        git \
        libsm6 \
        libxext6 \
        python-is-python3 \
        python3 \
        python3-dev \
        python3-pip \
        python3-setuptools \
        python3-wheel \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# CC: compile native extensions with gcc-11.
# PYTHONUNBUFFERED: flush Python output immediately so logs appear in
#   `docker logs` as they are written.
# GRADIO_SERVER_NAME: listen on all interfaces so the exposed port is
#   reachable from outside the container (a server_name passed explicitly
#   to launch() in app.py still takes precedence).
ENV CC=/usr/bin/gcc-11 \
    PYTHONUNBUFFERED=1 \
    GRADIO_SERVER_NAME=0.0.0.0

# Copy the current directory contents into the container.
# NOTE(review): the whole tree is copied before installing requirements, as in
# the original, in case requirements.txt references local paths — confirm, then
# consider copying requirements.txt on its own first for better layer caching.
COPY . .

# Install the packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Build, install and test the GroundingDINO ops from their own directory
WORKDIR /usr/src/app/models/GroundingDINO/ops
RUN python setup.py build install
# This should result in 6 lines of * True
# NOTE(review): if test.py needs a GPU it will fail under a plain
# `docker build`, which has no GPU access — confirm.
RUN python test.py

# Install Gradio, pinned to the sdk_version declared in the README front matter
RUN pip install --no-cache-dir gradio==4.44.1

# Change back to the application directory
WORKDIR /usr/src/app

# Port the Gradio app is expected to listen on
EXPOSE 7860

# Default command to run the Gradio app
CMD ["python", "app.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
https://github.com/niki-amini-naieni/CountGD/tree/main | ||
https://huggingface.co/spaces/nikigoli/countgd | ||
@InProceedings{AminiNaieni24, | ||
author = "Amini-Naieni, N. and Han, T. and Zisserman, A.", | ||
title = "CountGD: Multi-Modal Open-World Counting", | ||
booktitle = "Advances in Neural Information Processing Systems (NeurIPS)", | ||
year = "2024", | ||
} | ||
--- | ||
title: CountGD_Multi-Modal_Open-World_Counting | ||
app_file: app.py | ||
sdk: gradio | ||
sdk_version: 4.44.1 | ||
--- | ||
nohup python -u app.py & |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
# Flat configuration module: every setting is a plain module-level constant.
# The values select a 'groundingdino' model with a "swin_B_384_22k" backbone
# and a "bert-base-uncased" text encoder.
# Section headings below are grouping aids inferred from the setting names;
# they do not change any value or the order of assignment.

# --- data augmentation / batching ---
data_aug_scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
data_aug_max_size = 1333
data_aug_scales2_resize = [400, 500, 600]
data_aug_scales2_crop = [384, 600]
data_aug_scale_overlap = None
batch_size = 4

# --- model architecture ---
modelname = 'groundingdino'
backbone = "swin_B_384_22k"
position_embedding = 'sine'
pe_temperatureH = 20
pe_temperatureW = 20
return_interm_indices = [1, 2, 3]
enc_layers = 6
dec_layers = 6
pre_norm = False
dim_feedforward = 2048
hidden_dim = 256
dropout = 0.0
nheads = 8
num_queries = 900
query_dim = 4
num_patterns = 0
num_feature_levels = 4
enc_n_points = 4
dec_n_points = 4
two_stage_type = 'standard'
two_stage_bbox_embed_share = False
two_stage_class_embed_share = False
transformer_activation = 'relu'
dec_pred_bbox_embed_share = True

# --- "dn_" settings ---
dn_box_noise_scale = 1.0
dn_label_noise_ratio = 0.5
dn_label_coef = 1.0
dn_bbox_coef = 1.0
embed_init_tgt = True
dn_labelbook_size = 91

# --- text branch / fusion ---
max_text_len = 256
text_encoder_type = "bert-base-uncased"
use_text_enhancer = True
use_fusion_layer = True
use_checkpoint = False
use_transformer_ckpt = False
use_text_cross_attention = True
text_dropout = 0.0
fusion_dropout = 0.0
fusion_droppath = 0.1
sub_sentence_present = True
max_labels = 90  # pos + neg

# --- optimisation / schedule ---
lr = 0.0001  # base learning rate
backbone_freeze_keywords = None  # only for gdino backbone
freeze_keywords = ['backbone.0', 'bert']  # for whole model, e.g. ['backbone.0', 'bert'] for freeze visual encoder and text encoder
lr_backbone = 1e-05  # specific learning rate
lr_backbone_names = ['backbone.0', 'bert']
lr_linear_proj_mult = 1e-05
lr_linear_proj_names = ['ref_point_head', 'sampling_offsets']
weight_decay = 0.0001
param_dict_type = 'ddetr_in_mmdet'
ddetr_lr_param = False
epochs = 30
lr_drop = 10
save_checkpoint_interval = 10
clip_max_norm = 0.1
onecyclelr = False
multi_step_lr = False
lr_drop_list = [10, 20]
frozen_weights = None

# --- remaining model switches ---
dilation = False
pdetr3_bbox_embed_diff_each_layer = False
pdetr3_refHW = -1
random_refpoints_xy = False
fix_refpoints_hw = -1
dabdetr_yolo_like_anchor_update = False
dabdetr_deformable_encoder = False
dabdetr_deformable_decoder = False
use_deformable_box_attn = False
box_attn_type = 'roi_align'
dec_layer_number = None
decoder_layer_noise = False
dln_xy_noise = 0.2
dln_hw_noise = 0.2
add_channel_attention = False
add_pos_value = False
two_stage_pat_embed = 0
two_stage_add_query_num = 0
two_stage_learn_wh = False
two_stage_default_hw = 0.05
two_stage_keep_all_tokens = False
num_select = 900
batch_norm_type = 'FrozenBatchNorm2d'
masks = False

# --- matching costs and loss coefficients ---
aux_loss = True
set_cost_class = 5.0
set_cost_bbox = 1.0
set_cost_giou = 0.0
cls_loss_coef = 5.0
bbox_loss_coef = 1.0
giou_loss_coef = 0.0
enc_loss_coef = 1.0
interm_loss_coef = 1.0
no_interm_box_loss = False
mask_loss_coef = 1.0
dice_loss_coef = 1.0
focal_alpha = 0.25
focal_gamma = 2.0
decoder_sa_type = 'sa'
matcher_type = 'HungarianMatcher'
decoder_module_seq = ['sa', 'ca', 'ffn']
nms_iou_threshold = -1
dec_pred_class_embed_share = True
match_unstable_error = True
use_detached_boxes_dec_out = False
dn_scalar = 100

# --- thresholds, evaluation flag and label vocabularies ---
box_threshold = 0.23
text_threshold = 0
use_coco_eval = False
label_list = ['alcohol bottle', 'baguette roll', 'ball', 'banana', 'bead', 'bee', 'birthday candle', 'biscuit', 'boat', 'bottle', 'bowl', 'box', 'bread roll', 'brick', 'buffalo', 'bun', 'calamari ring', 'can', 'candle', 'cap', 'car', 'cartridge', 'cassette', 'cement bag', 'cereal', 'chewing gum piece', 'chopstick', 'clam', 'coffee bean', 'coin', 'cotton ball', 'cow', 'crane', 'crayon', 'croissant', 'crow', 'cup', 'cupcake', 'cupcake holder', 'fish', 'gemstone', 'go game piece', 'goat', 'goldfish snack', 'goose', 'ice cream', 'ice cream cone', 'instant noodle', 'jade stone', 'jeans', 'kidney bean', 'kitchen towel', 'lighter', 'lipstick', 'm&m piece', 'macaron', 'match', 'meat skewer', 'mini blind', 'mosaic tile', 'naan bread', 'nail', 'nut', 'onion ring', 'orange', 'pearl', 'pen', 'pencil', 'penguin', 'pepper', 'person', 'pigeon', 'plate', 'polka dot tile', 'potato', 'rice bag', 'roof tile', 'screw', 'shoe', 'spoon', 'spring roll', 'stair', 'stapler pin', 'straw', 'supermarket shelf', 'swan', 'tomato', 'watermelon', 'window', 'zebra']
val_label_list = ["apple", "candy piece", "carrom board piece", "cashew nut", "comic book", "crab cake", "deer", "egg", "elephant", "finger food", "green pea", "hot air balloon", "keyboard key", "lego", "marble", "marker", "nail polish", "potato chip", "red bean", "round dessert", "sauce bottle", "sea shell", "sheep", "ski", "stamp", "sticky note", "strawberry", "sunglasses", "tree log", "watch"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Please download the checkpoint files from https://huggingface.co/spaces/nikigoli/countgd/tree/main/checkpoints and place them in this folder. |
26 changes: 26 additions & 0 deletions
26
annolid/detector/countgd/checkpoints/bert-base-uncased/config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
{ | ||
"_name_or_path": "bert-base-uncased", | ||
"architectures": [ | ||
"BertModel" | ||
], | ||
"attention_probs_dropout_prob": 0.1, | ||
"classifier_dropout": null, | ||
"gradient_checkpointing": false, | ||
"hidden_act": "gelu", | ||
"hidden_dropout_prob": 0.1, | ||
"hidden_size": 768, | ||
"initializer_range": 0.02, | ||
"intermediate_size": 3072, | ||
"layer_norm_eps": 1e-12, | ||
"max_position_embeddings": 512, | ||
"model_type": "bert", | ||
"num_attention_heads": 12, | ||
"num_hidden_layers": 12, | ||
"pad_token_id": 0, | ||
"position_embedding_type": "absolute", | ||
"torch_dtype": "float32", | ||
"transformers_version": "4.39.1", | ||
"type_vocab_size": 2, | ||
"use_cache": true, | ||
"vocab_size": 30522 | ||
} |
7 changes: 7 additions & 0 deletions
7
annolid/detector/countgd/checkpoints/bert-base-uncased/special_tokens_map.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"cls_token": "[CLS]", | ||
"mask_token": "[MASK]", | ||
"pad_token": "[PAD]", | ||
"sep_token": "[SEP]", | ||
"unk_token": "[UNK]" | ||
} |
Oops, something went wrong.