-
Notifications
You must be signed in to change notification settings - Fork 7
/
detr_r50.py
86 lines (72 loc) · 2.64 KB
/
detr_r50.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""DETR model, that use the parameters of original DETR-R50 architecture."""
import time
import argparse
import torch
from alonet.detr import Detr
import aloscene
import alonet
class DetrR50(Detr):
    """DETR with a ResNet-50 backbone, following https://arxiv.org/abs/2005.12872

    The positional encoding, backbone and transformer are built here with fixed
    hyper-parameters and handed to the parent `Detr <detr>` constructor together
    with ``num_queries=100``.

    Parameters
    ----------
    num_classes : int, optional
        Number of neurons in the embed layer, by default 91
    background_class : int, optional
        Id used for the background class, by default 91
    *args : Namespace
        Positional arguments forwarded to the `Detr <detr>` class
    **kwargs : Dict
        Additional parameters forwarded to the `Detr <detr>` class
    """

    def __init__(self, *args, num_classes=91, background_class=91, **kwargs):
        # "sin" positional encoding with hidden_dim=256
        pos_encoding = self.build_positional_encoding(hidden_dim=256, position_embedding="sin")

        # resnet50 backbone; its channel count is read before wrapping so it
        # can be re-attached to the Joiner afterwards
        resnet = self.build_backbone(
            "resnet50",
            train_backbone=True,
            return_interm_layers=True,
            dilation=False,
        )
        n_channels = resnet.num_channels
        joined_backbone = alonet.detr.backbone.Joiner(resnet, pos_encoding)
        joined_backbone.num_channels = n_channels

        # Transformer settings, gathered in one place before the build call
        transformer_settings = {
            "hidden_dim": 256,
            "dropout": 0.1,
            "nheads": 8,
            "dim_feedforward": 2048,
            "num_encoder_layers": 6,
            "num_decoder_layers": 6,
            "normalize_before": False,
        }
        detr_transformer = self.build_transformer(**transformer_settings)

        super().__init__(
            joined_backbone,
            detr_transformer,
            *args,
            num_classes=num_classes,
            num_queries=100,
            background_class=background_class,
            **kwargs,
        )
def main(image_path):
    """Run DETR-R50 on one image: print the mean forward time, then display the predicted boxes.

    Parameters
    ----------
    image_path : str
        Path to the image, passed to ``aloscene.Frame``.
    """
    # Use the GPU when one is available; otherwise fall back to CPU rather
    # than failing when the model is created.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    n_runs = 20  # number of forward passes averaged for the timing

    # Load model
    model = DetrR50(num_classes=91, weights="detr-r50", device=device).eval()

    # Open and prepare a batch for the model
    frame = aloscene.Frame(image_path).norm_resnet()
    frames = aloscene.Frame.batch_list([frame])
    frames = frames.to(device)

    with torch.no_grad():
        # Measure inference time. CUDA work is queued asynchronously, so the
        # device is synchronized before each clock read; otherwise the timer
        # could stop while kernels are still running.
        if use_cuda:
            torch.cuda.synchronize()
        tic = time.perf_counter()
        for _ in range(n_runs):
            # Plain loop: outputs are dropped each iteration instead of being
            # collected in a throwaway list.
            model(frames)
        if use_cuda:
            torch.cuda.synchronize()
        toc = time.perf_counter()
        print(f"{(toc - tic) / n_runs * 1000} ms")

        # Predict boxes
        m_outputs = model(frames)
        pred_boxes = model.inference(m_outputs)

    # Add and display the predicted boxes
    frame.append_boxes2d(pred_boxes[0], "pred_boxes")
    frame.get_view().render()
if __name__ == "__main__":
    # Script entry point: one positional argument gives the image to run on.
    cli = argparse.ArgumentParser(description="Detr R50 inference on image")
    cli.add_argument("image_path", type=str, help="Path to the image for inference")
    main(cli.parse_args().image_path)