-
Notifications
You must be signed in to change notification settings - Fork 0
/
glip_Swin_L.yaml
120 lines (102 loc) · 2.71 KB
/
glip_Swin_L.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Model definition: detector architecture, initial weights, and the settings
# for each component (vision backbone, language backbone, RPN, DyHead).
#
# NOTE: parenthesized values such as (2, 2, 18, 2) are plain YAML strings.
# Presumably the config loader evaluates them as Python tuples -- keep their
# text unchanged and confirm against the loader before reformatting them.
MODEL:
  META_ARCHITECTURE: "GeneralizedVLRCNN"
  # Initial weights file; the name suggests a Swin-Large checkpoint
  # (patch 4, window 12, 384 px, 22k classes) -- confirm against the source.
  WEIGHT: "swin_large_patch4_window12_384_22k.pth"

  RPN_ONLY: True
  RPN_ARCHITECTURE: "VLDYHEAD"

  # Vision backbone body and its output channel count.
  BACKBONE:
    CONV_BODY: "SWINT-FPN-RETINANET"
    OUT_CHANNELS: 256

  # Swin transformer hyper-parameters. DEPTHS, NUM_HEADS and OUT_CHANNELS
  # each have four entries (presumably one per stage -- confirm).
  # SWINT.OUT_CHANNELS is a different key from BACKBONE.OUT_CHANNELS; the
  # two must stay in separate mappings or one overwrites the other.
  SWINT:
    EMBED_DIM: 192
    DEPTHS: (2, 2, 18, 2)
    NUM_HEADS: (6, 12, 24, 48)
    WINDOW_SIZE: 12
    OUT_CHANNELS: (192, 384, 768, 1536)
    DROP_PATH_RATE: 0.4

  # Text encoder settings.
  LANGUAGE_BACKBONE:
    FREEZE: False
    MODEL_TYPE: "bert-base-uncased"  # other values noted: "roberta-base", "clip"
    MASK_SPECIAL: False

  # Anchor settings; ANCHOR_SIZES and ANCHOR_STRIDE have five entries each.
  RPN:
    USE_FPN: True
    ANCHOR_SIZES: (64, 128, 256, 512, 1024)
    ANCHOR_STRIDE: (8, 16, 32, 64, 128)
    ASPECT_RATIOS: (1.0,)
    SCALES_PER_OCTAVE: 1

  # Detection head settings, including the fusion options in FUSE_CONFIG.
  DYHEAD:
    CHANNELS: 256
    NUM_CONVS: 8
    USE_GN: True
    USE_DYRELU: True
    USE_DFCONV: True
    USE_DYFUSE: True
    TOPK: 9  # topk for selecting candidate positive samples from each level
    SCORE_AGG: "MEAN"
    LOG_SCALE: 0.0

    USE_CHECKPOINT: True
    # Fusion and loss switches, plus the CLAMP_* flags whose names refer to
    # underflow/overflow clamping.
    FUSE_CONFIG:
      USE_FUSED_FEATURES_DOT_PRODUCT: True
      EARLY_FUSE_ON: True
      TYPE: "MHA-B"
      USE_CLASSIFICATION_LOSS: False
      USE_TOKEN_LOSS: False
      USE_CONTRASTIVE_ALIGN_LOSS: False
      CONTRASTIVE_HIDDEN_DIM: 64
      USE_DOT_PRODUCT_TOKEN_LOSS: True
      USE_LAYER_SCALE: True
      CLAMP_MIN_FOR_UNDERFLOW: True
      CLAMP_MAX_FOR_OVERFLOW: True
      CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
      CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
      CLAMP_DOT_PRODUCT: True

# Dataset selection, per-dataset copy counts, and caption/prompt options.
# All keys below belong to the DATASETS mapping.
DATASETS:
  # Placeholder dataset for now. To be updated in the next version.
  TRAIN: ("mixed_train_no_coco",)
  TEST: ("coco_2017_val", )

  ONE_HOT: False
  # Copy counts per dataset; the trailing arithmetic is the author's estimate
  # of the resulting size (dataset size * copies).
  FLICKR_COPY: 8  # 0.15 * 8 = ~1.2M
  MIXED_COPY: 4  # 0.6 * 4 = ~2.4M
  OBJECT365_COPY: 2  # 1.4 * 2 = ~2.8M
  VG_COPY: 3  # 0.4 * 3 = ~1.2M
  IN_COPY: 2  # 0.67 * 2 = ~1.33M
  OI_COPY: 1  # 2M * 1 = 2M

  DISABLE_SHUFFLE: False
  ADD_DET_PROMPT: False
  RANDOM_SAMPLE_NEG: 85
  CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
  FURTHER_SCREEN: True
  CAPTION_CONF: 0.5
  CAPTION_NMS: -1.0
  CAPTION_MIN_BOX: 1

  # Quoted on purpose: the value is a period followed by a space, and the
  # trailing space would be lost in an unquoted scalar.
  SEPARATION_TOKENS: ". "

  PACK_RANDOM_CAPTION_NUMBER: 20
  NO_RANDOM_PACK_PROBABILITY: 0.4
  RANDOM_PACK_PROB: 0.5
  CAPTION_FORMAT_VERSION: "v2"

# Input preprocessing: per-channel normalization constants (three values
# each) and the train/test image size limits.
INPUT:
  PIXEL_MEAN: [103.530, 116.280, 123.675]
  PIXEL_STD: [57.375, 57.120, 58.395]
  MIN_SIZE_TRAIN: 800
  MAX_SIZE_TRAIN: 1333
  MIN_SIZE_TEST: 800
  MAX_SIZE_TEST: 1333

# Augmentation: candidate minimum training sizes. The largest entry (800)
# equals INPUT.MIN_SIZE_TRAIN in this file.
AUGMENT:
  MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)

# Data loader options.
DATALOADER:
  SIZE_DIVISIBILITY: 32

# Optimizer, learning-rate schedule, batch size, warm-up and gradient clipping.
SOLVER:
  OPTIMIZER: ADAMW
  BASE_LR: 0.0001
  LANG_LR: 0.00001
  WEIGHT_DECAY: 0.01
  WEIGHT_DECAY_SCHEDULE: True
  # Values below 1; presumably fractions of MAX_ITER -- confirm with the
  # scheduler implementation.
  STEPS: (0.67, 0.89)
  MAX_ITER: 1000000
  IMS_PER_BATCH: 64
  WARMUP_ITERS: 2000
  WARMUP_FACTOR: 0.001

  FIND_UNUSED_PARAMETERS: False

  # Gradient clipping options; these four keys belong to CLIP_GRADIENTS,
  # not to SOLVER directly.
  CLIP_GRADIENTS:
    ENABLED: True
    CLIP_TYPE: "full_model"
    CLIP_VALUE: 1.0
    NORM_TYPE: 2.0