diff --git a/README.md b/README.md
index 12355df..41afd48 100644
--- a/README.md
+++ b/README.md
@@ -7,9 +7,11 @@
 It is a simple demo including face detection and face aligment, and some optimizations were made to make the result better.
-
-
-
+The keypoint model encodes and decodes the x and y coordinates with a heatmap plus per-axis x and y offsets,
+achieving SOTA results on the WFLW dataset.
+As in object detection, the heatmap predicts which locations on the feature map are positive samples,
+shown as highlighted areas, while the x and y offsets regress the precise coordinates of those positive samples.
+It reaches **NME 3.95 on WFLW** with no external data.

 click the gif to see the video:

 [![demo](https://github.com/610265158/simpleface-engine/blob/master/figure/sample.gif)](https://v.youku.com/v_show/id_XNDM3MTY4MTM2MA==.html?spm=a2h3j.8428770.3416059.1)

@@ -21,31 +23,28 @@
 and with face mask:

 + PyTorch
 + onnxruntime
 + opencv
-+ python 3.7
 + easydict

 ## model

-+ 1 face detector
+### 1 face detector

 [yolov5-face](https://github.com/deepcam-cn/yolov5-face)

-+ 2 landmark detector
-
+### 2 landmark detector
+
+###### HOW TO TRAIN
 [simple face landmark detector]( https://github.com/610265158/Peppa_Pig_Face_Landmark/tree/master/TRAIN/face_landmark)
 Refer to [TRAIN/face_landmark/README.md](https://github.com/610265158/Peppa_Pig_Face_Landmark/blob/master/TRAIN/face_landmark/README.md) to train the model.

- the model is trained with WFLW data. For student **mobilenetv3-large** was used as backbone, for teacher is **efficientnetb5**.
-
- | model   | Resolution | NME(test set) | model size (int8 weights) | Pretrained |
- | ------- | ---------- | ------------- | ------------------------- | ------------------------------------------------------------ |
- | Student | 128x128 | 4.95 | 1.9M | [model128](https://drive.google.com/drive/folders/1zivD151CkOSm8KYyeC7v4YPC0aYDomry?usp=share_link) |
- | Teacher | 128x128 | 4.64 | 6.9M | [model128](https://drive.google.com/drive/folders/1zivD151CkOSm8KYyeC7v4YPC0aYDomry?usp=share_link) |
- | Student | 256x256 | 4.65 | 1.9M | [model256](https://drive.google.com/drive/folders/1JFVrbMx07PwL47dFlUSZ1tAMcVxVmJXo?usp=share_link) |
- | Teacher | 256x256 | 4.47 | 6.9M | [model256](https://drive.google.com/drive/folders/1JFVrbMx07PwL47dFlUSZ1tAMcVxVmJXo?usp=share_link) |
+| WFLW    | inputsize | NME      | Flops(G) | Params(M) | Pose | Exp. | Ill. | Mu.  | Occ. | Blur | pretrained |
|---------|-----------|----------|----------|-----------|------|------|------|------|------|------|--------------------------------------------------------------------------------------------------|
| Student | 128x128   | **4.80** | 0.35     | 3.25      | 8.53 | 5.00 | 4.61 | 4.81 | 5.80 | 5.36 | [skps](https://drive.google.com/drive/folders/1JktGIKohpeLO14a6eJqNlZort_46qVC0?usp=share_link)   |
| Teacher | 128x128   | **4.17** | 1.38     | 11.53     | 7.14 | 4.32 | 4.01 | 4.03 | 4.98 | 4.68 | [skps](https://drive.google.com/drive/folders/1JktGIKohpeLO14a6eJqNlZort_46qVC0?usp=share_link)   |
| Student | 256x256   | **4.35** | 1.39     | 3.25      | 7.53 | 4.52 | 4.16 | 4.21 | 5.34 | 4.93 | [skps](https://drive.google.com/drive/folders/1Y8FvJV1X5YTUkwt5MywVFvqzStpxRK_S?usp=sharing)      |
| Teacher | 256x256   | **3.95** | 5.53     | 11.53     | 7.00 | 4.00 | 3.81 | 3.78 | 4.85 | 4.54 | [skps](https://drive.google.com/drive/folders/1Y8FvJV1X5YTUkwt5MywVFvqzStpxRK_S?usp=sharing)      |

- I will release new model when there is better one. 7.5K trainning data is not enough for a very good model. Please label more data if needed.
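To make the heatmap-plus-offset decoding described in the README concrete, below is a minimal NumPy sketch of how such a prediction can be turned back into normalized x/y coordinates. It mirrors the `postp` logic added in `TRAIN/face_landmark/lib/core/base_trainer/model.py` (per-landmark argmax over the heatmap, then the x/y offsets gathered at that peak); the function name, array names, and the 98-point / 64x64 sizes are illustrative assumptions, not the repository's exact API.

```python
import numpy as np

def decode_heatmap_with_offsets(pred):
    """Decode keypoints from a (3*N, H, W) prediction where channels
    [0:N] are per-landmark heatmaps, [N:2N] are x offsets and [2N:3N]
    are y offsets (the assumed layout of the 98*3-channel hm head)."""
    n = pred.shape[0] // 3
    h, w = pred.shape[1], pred.shape[2]

    heatmaps = pred[:n].reshape(n, -1)              # (N, H*W)
    idx = heatmaps.argmax(axis=1)                   # peak index per landmark

    off_x = pred[n:2 * n].reshape(n, -1)[np.arange(n), idx]
    off_y = pred[2 * n:].reshape(n, -1)[np.arange(n), idx]

    x = idx % w + off_x                             # coarse peak + sub-cell offset
    y = idx // w + off_y
    return np.stack([x / w, y / h], axis=1)         # normalized (x, y), shape (N, 2)

# toy usage with a random tensor standing in for the network output
coords = decode_heatmap_with_offsets(np.random.rand(98 * 3, 64, 64).astype(np.float32))
print(coords.shape)  # (98, 2)
```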
diff --git a/TRAIN/face_landmark/README.md b/TRAIN/face_landmark/README.md
index 7b35a62..17ca26f 100644
--- a/TRAIN/face_landmark/README.md
+++ b/TRAIN/face_landmark/README.md
@@ -6,12 +6,11 @@ A simple face aligment method, based on pytorch
 ## introduction
-It is simple and flexible, trained with wingloss , multi task learning, also with data augmentation based on headpose and face attributes(eyes state and mouth state).
-
-[CN blog](https://blog.csdn.net/qq_35606924/article/details/99711208)
-
-The model is trained for **[[pappa_pig_face_engine]](https://github.com/610265158/Peppa_Pig_Face_Engine).**
-
+The keypoint model encodes and decodes the x and y coordinates with a heatmap plus per-axis x and y offsets,
+achieving SOTA results on the WFLW dataset.
+As in object detection, the heatmap predicts which locations on the feature map are positive samples,
+shown as highlighted areas, while the x and y offsets regress the precise coordinates of those positive samples.
+It reaches **NME 3.95 on WFLW** with no external data.

 Contact me if u have problem about it. 2120140200@mail.nankai.edu.cn :)

 demo pictures:

@@ -22,22 +21,18 @@ demo pictures:
 this gif is from github.com/610265158/Peppa_Pig_Face_Engine )

-pretrained model is placed in pretrained, in Peppa_Pig_Face_Landmark folder.
-
 ## metric

 WLFW test set.

-the model is trained with WFLW data. For student **mobilenetv3-large** was used as backbone, for teacher is **efficientnetb5**.
-
-| model   | Resolution | NME(test set) | model size (int8 weights) | Pretrained |
-| ------- | ---------- | ------------- | ------------------------- | ------------------------------------------------------------ |
-| Student | 128x128 | 4.95 | 1.9M | [model128](https://drive.google.com/drive/folders/1zivD151CkOSm8KYyeC7v4YPC0aYDomry?usp=share_link) |
-| Teacher | 128x128 | 4.64 | 6.9M | [model128](https://drive.google.com/drive/folders/1zivD151CkOSm8KYyeC7v4YPC0aYDomry?usp=share_link) |
-| Student | 256x256 | 4.65 | 1.9M | [model256](https://drive.google.com/drive/folders/1JFVrbMx07PwL47dFlUSZ1tAMcVxVmJXo?usp=share_link) |
-| Teacher | 256x256 | 4.47 | 6.9M | [model256](https://drive.google.com/drive/folders/1JFVrbMx07PwL47dFlUSZ1tAMcVxVmJXo?usp=share_link) |
+| WFLW    | inputsize | NME      | Flops(G) | Params(M) | Pose | Exp. | Ill. | Mu.  | Occ. | Blur | pretrained |
|---------|-----------|----------|----------|-----------|------|------|------|------|------|------|--------------------------------------------------------------------------------------------------|
| Student | 128x128   | **4.80** | 0.35     | 3.25      | 8.53 | 5.00 | 4.61 | 4.81 | 5.80 | 5.36 | [skps](https://drive.google.com/drive/folders/1JktGIKohpeLO14a6eJqNlZort_46qVC0?usp=share_link)   |
| Teacher | 128x128   | **4.17** | 1.38     | 11.53     | 7.14 | 4.32 | 4.01 | 4.03 | 4.98 | 4.68 | [skps](https://drive.google.com/drive/folders/1JktGIKohpeLO14a6eJqNlZort_46qVC0?usp=share_link)   |
| Student | 256x256   | **4.35** | 1.39     | 3.25      | 7.53 | 4.52 | 4.16 | 4.21 | 5.34 | 4.93 | [skps](https://drive.google.com/drive/folders/1Y8FvJV1X5YTUkwt5MywVFvqzStpxRK_S?usp=sharing)      |
| Teacher | 256x256   | **3.95** | 5.53     | 11.53     | 7.00 | 4.00 | 3.81 | 3.78 | 4.85 | 4.54 | [skps](https://drive.google.com/drive/folders/1Y8FvJV1X5YTUkwt5MywVFvqzStpxRK_S?usp=sharing)      |

@@ -47,8 +42,6 @@ the model is trained with WFLW data. For student **mobilenetv3-large** was used
 + opencv

-+ python 3.7
-
 + timm

@@ -62,8 +55,18 @@ the model is trained with WFLW data. For student **mobilenetv3-large** was used
 1.
Download [WFLW](https://wywu.github.io/projects/LAB/WFLW.html) data. Set them in train_config.py. 3. then `run.sh` -4. by default it is trained with mobilenetv3-large as backbone. +4. by default it is trained with mobilenetv3-large as student, efficientnetb5 as teacher. + +### Evaluation + +``` +python tools/eval_WFLW.py --weight xxx.pth --data_dir ./ --img_size 256 +``` + +``` +python vis.py --model ./keypoints.pth +``` ### visualization ``` diff --git a/TRAIN/face_landmark/lib/core/base_trainer/model.py b/TRAIN/face_landmark/lib/core/base_trainer/model.py index ce61327..6b85edd 100644 --- a/TRAIN/face_landmark/lib/core/base_trainer/model.py +++ b/TRAIN/face_landmark/lib/core/base_trainer/model.py @@ -2,44 +2,32 @@ from functools import partial import sys - import torch import torch.nn as nn import torch.nn.functional as F import timm -from torchvision.models.mobilenetv3 import InvertedResidual,InvertedResidualConfig - - -bn_momentum=0.1 - - - +from torchvision.models.mobilenetv3 import InvertedResidual, InvertedResidualConfig -def upsample_x_like_y(x,y): - size = y.shape[-2:] - x=F.interpolate(x, size=size, mode='bilinear') - - return x # from lib.core.base_trainer.mobileone import MobileOneBlock class SeparableConv2d(nn.Module): """ Separable Conv """ + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding=0, bias=False, channel_multiplier=1., pw_kernel_size=1): super(SeparableConv2d, self).__init__() - self.conv_dw = nn.Sequential(nn.Conv2d( - int(in_channels*channel_multiplier), int(in_channels*channel_multiplier), kernel_size, - stride=stride, dilation=dilation, padding=padding, groups=int(in_channels*channel_multiplier)), - nn.BatchNorm2d(in_channels,momentum=bn_momentum), - nn.ReLU(inplace=True) + int(in_channels * channel_multiplier), int(in_channels * channel_multiplier), kernel_size, + stride=stride, dilation=dilation, padding=padding, groups=int(in_channels * channel_multiplier)), + nn.BatchNorm2d(in_channels) + ) self.conv_pw = nn.Conv2d( - int(in_channels*channel_multiplier), out_channels, pw_kernel_size, padding=0, bias=bias) + int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=0, bias=bias) @property def in_channels(self): @@ -50,7 +38,6 @@ def out_channels(self): return self.conv_pw.out_channels def forward(self, x): - x = self.conv_dw(x) x = self.conv_pw(x) return x @@ -59,93 +46,74 @@ def forward(self, x): class ASPPPooling(nn.Module): def __init__(self, in_channels, out_channels): super(ASPPPooling, self).__init__() - self.pool=nn.Sequential( + self.pool = nn.Sequential( nn.AdaptiveAvgPool2d(1), nn.Conv2d(in_channels, out_channels, 1, bias=False), - nn.BatchNorm2d(out_channels,momentum=bn_momentum), + nn.BatchNorm2d(out_channels), nn.ReLU()) - def forward(self, x): - - y=x + def forward(self, x): + size = x.shape[-2:] x = self.pool(x) - x= upsample_x_like_y(x,y) + x = F.interpolate(x, size=size) return x class ASPP(nn.Module): - def __init__(self, in_channels, atrous_rates,out_channels=512): + def __init__(self, in_channels, atrous_rates, out_channels=512): super(ASPP, self).__init__() - rate1, rate2, rate3 = tuple(atrous_rates) - self.fm_conx1=nn.Sequential( - nn.Conv2d(in_channels, out_channels//4, 1, bias=False), - nn.BatchNorm2d(out_channels//4,momentum=bn_momentum), - nn.ReLU()) + self.conv1 = nn.Conv2d(in_channels, out_channels // 4, 1, bias=False) + self.conv2 = nn.Conv2d(in_channels, out_channels // 4, kernel_size=3, padding=2, bias=False, dilation=rate1) - self.fm_convx3_rate2=nn.Sequential( - 
nn.Conv2d(in_channels, out_channels//4, kernel_size=3, padding=2, bias=False,dilation=rate1), - nn.BatchNorm2d(out_channels//4,momentum=bn_momentum), - nn.ReLU(inplace=True) - ) + self.conv3 = nn.Conv2d(in_channels, out_channels // 4, kernel_size=3, padding=4, bias=False, dilation=rate2) - self.fm_convx3_rate4=nn.Sequential( - nn.Conv2d(in_channels, out_channels//4, kernel_size=3, padding=4, bias=False,dilation=rate2), - nn.BatchNorm2d(out_channels//4,momentum=bn_momentum), - nn.ReLU(inplace=True) - ) - - # self.fm_convx3_rate8=nn.Sequential( - # nn.Conv2d(in_channels, out_channels//4, kernel_size=3, padding=8, bias=False,dilation=rate3), - # nn.BatchNorm2d(out_channels//4,momentum=bn_momentum), - # nn.ReLU(inplace=True) - # ) + self.bn_act = nn.Sequential(nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True)) - self.fm_pool=ASPPPooling(in_channels=in_channels,out_channels=out_channels//4) + self.fm_pool = ASPPPooling(in_channels=in_channels, out_channels=out_channels // 4) self.project = nn.Sequential( - nn.Conv2d(out_channels//4*4, out_channels, 1, bias=False), - nn.BatchNorm2d(out_channels,momentum=bn_momentum), + nn.Conv2d(out_channels // 4 * 4, out_channels, 1, bias=False), + nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True)) def forward(self, x): + fm1 = self.conv1(x) + fm2 = self.conv2(x) + fm4 = self.conv3(x) - fm1=self.fm_conx1(x) - fm2=self.fm_convx3_rate2(x) - fm4=self.fm_convx3_rate4(x) - # fm8=self.fm_convx3_rate8(x) - fm_pool=self.fm_pool(x) + fm_pool = self.fm_pool(x) - res = torch.cat([fm1,fm2,fm4,fm_pool], dim=1) + res = torch.cat([fm1, fm2, fm4, fm_pool], dim=1) - return self.project(res) + res = self.bn_act(res) + return self.project(res) class FeatureFuc(nn.Module): def __init__(self, inchannels=128): super(FeatureFuc, self).__init__() + self.block1 = InvertedResidual(cnf=InvertedResidualConfig(inchannels, 5, 256, inchannels, False, "RE", 1, 1, 1), + norm_layer=partial(nn.BatchNorm2d, )) - self.block1=InvertedResidual(cnf=InvertedResidualConfig( inchannels, 5, 256, inchannels, False, "RE", 1, 1, 1), - norm_layer = partial(nn.BatchNorm2d, momentum=bn_momentum)) - - self.block2=InvertedResidual(cnf=InvertedResidualConfig( inchannels, 5, 256, inchannels, False, "RE", 1, 1, 1), - norm_layer = partial(nn.BatchNorm2d, momentum=bn_momentum)) - + self.block2 = InvertedResidual(cnf=InvertedResidualConfig(inchannels, 5, 256, inchannels, False, "RE", 1, 1, 1), + norm_layer=partial(nn.BatchNorm2d, )) def forward(self, x): + y1 = self.block1(x) - y1=self.block1(x) - - y2=self.block2(y1) + y2 = self.block2(y1) return y2 + class SCSEModule(nn.Module): def __init__(self, in_channels, reduction=4): super().__init__() @@ -159,7 +127,7 @@ def __init__(self, in_channels, reduction=4): self.sSE = nn.Sequential(nn.Conv2d(in_channels, 1, 1), nn.Sigmoid()) def forward(self, x): - return x *self.cSE(x) + x*self.sSE(x) + return x * self.cSE(x) + x * self.sSE(x) class DecoderBlock(nn.Module): @@ -176,21 +144,21 @@ def __init__( super().__init__() if use_separable_conv: self.conv1 = nn.Sequential(SeparableConv2d( - in_channels+skip_channels, + in_channels + skip_channels, out_channels, kernel_size=kernel_size, - padding=kernel_size//2, - ), - nn.BatchNorm2d(out_channels,momentum=bn_momentum), + padding=kernel_size // 2, + ), + nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True)) else: self.conv1 = nn.Sequential(nn.Conv2d( - in_channels+skip_channels, + in_channels + skip_channels, out_channels, kernel_size=kernel_size, - padding=kernel_size//2, - ), - 
nn.BatchNorm2d(out_channels,momentum=bn_momentum), + padding=kernel_size // 2, + ), + nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True)) # self.attention1 = md.Attention(attention_type, in_channels=in_channels + skip_channels) @@ -200,7 +168,7 @@ def __init__( out_channels, kernel_size=3, padding=1), - nn.BatchNorm2d(out_channels,momentum=bn_momentum), + nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True)) else: self.conv2 = nn.Identity() @@ -208,16 +176,15 @@ def __init__( if use_attention: self.attention2 = SCSEModule(in_channels=out_channels) else: - self.attention2 =nn.Identity() + self.attention2 = nn.Identity() def forward(self, x, skip=None): # x= upsample_x_like_y(x,skip) - x = F.interpolate(x, scale_factor=2, mode='bilinear') + x = F.interpolate(x, scale_factor=2,mode='bilinear') if skip is None: return x - if skip is not None: x = torch.cat([x, skip], dim=1) # x = self.attention1(x) @@ -242,208 +209,212 @@ def weight_init(m): nn.init.constant_(m.bias, 0) -class Matting(nn.Module): +class Decoder(nn.Module): + + def __init__(self, encoder_channels): + super(Decoder, self).__init__() - def __init__(self,encoder_channels): - super(Matting, self).__init__() - self.extra_feature = FeatureFuc(encoder_channels[-1]) - self.aspp = ASPP(encoder_channels[-1], [2, 4, 8]) + self.aspp = ASPP(encoder_channels[-1], [2, 4, 8],out_channels=256) - self.upsampler1=DecoderBlock(512, encoder_channels[-2], 64, \ + self.upsampler1 = DecoderBlock(256, encoder_channels[-2], 256, \ use_separable_conv=True, \ use_attention=True, - kernel_size=5) + kernel_size=3) - self.upsampler2 = DecoderBlock(64, encoder_channels[-3], 32, \ + self.upsampler2 = DecoderBlock(256, encoder_channels[-3], 128, \ use_separable_conv=True, \ use_attention=False, use_second_conv=True, - kernel_size=5) - - self.upsampler3 = DecoderBlock(32, encoder_channels[-4], 32, \ - use_separable_conv=False, \ - use_attention=False, kernel_size=3) - self.apply(weight_init) - def forward(self,features): - - - img,encx2,encx4,encx8,encx16=features + def forward(self, features): + img, encx2, encx4, encx8, encx16 = features ## add extra feature - encx16=self.extra_feature(encx16) - encx16=self.aspp(encx16) - decx8=self.upsampler1(encx16,encx8) + encx16 = self.aspp(encx16) + + decx8 = self.upsampler1(encx16, encx8) decx4 = self.upsampler2(decx8, encx4) - decx2=self.upsampler3(decx4, encx2) #### semantic predict + return [decx4, decx8, encx16] - return[decx2,decx4,decx8,encx16] class Net(nn.Module): - def __init__(self,inp_size=(128,128)): + def __init__(self, inp_size=(128, 128)): super(Net, self).__init__() - - self.input_size=inp_size + self.input_size = inp_size self.encoder = timm.create_model(model_name='mobilenetv3_large_100.ra_in1k', pretrained=True, features_only=True, - out_indices=[0,1,2,4], - bn_momentum=bn_momentum, + out_indices=[0, 1, 2, 4], in_chans=3, output_stride=16, ) # self.encoder.blocks[4][1]=nn.Identity() - self.encoder.blocks[5]=nn.Identity() - self.encoder.blocks[6]=nn.Identity() - - - self.encoder_out_channels = [3, 16, 24, 40, 112] #mobilenetv3 + self.encoder.blocks[6] = nn.Identity() + self.encoder_out_channels = [3, 16, 24, 40, 160] # mobilenetv3 - self.matting=Matting(self.encoder_out_channels) + self.decoder = Decoder(self.encoder_out_channels) self._avg_pooling = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Linear(640, 98 * 2 + 3 + 4, bias=True) - - - self.hm=nn.Conv2d(in_channels=32,out_channels=98,kernel_size=3,stride=1,padding=1,bias=True) + self.fc = nn.Linear(640, 3 + 4, bias=True) + self.hm = 
nn.Conv2d(in_channels=128, out_channels=98*3, kernel_size=1, stride=1, padding=0, bias=True) weight_init(self.fc) weight_init(self.hm) + def forward(self, x): """Sequentially pass `x` trough model`s encoder, decoder and heads""" - bs=x.size(0) + bs = x.size(0) - features=self.encoder(x) + features = self.encoder(x) - features=[x]+features + features = [x] + features - [encx2,encx4,encx8,encx16]=self.matting(features) + [encx4, encx8, encx16] = self.decoder(features) fmx16 = self._avg_pooling(encx16) fmx8 = self._avg_pooling(encx8) fmx4 = self._avg_pooling(encx4) - fmx2 = self._avg_pooling(encx2) - - - fm=torch.cat([fmx2,fmx4,fmx8,fmx16],dim=1) + fm = torch.cat([fmx4, fmx8, fmx16], dim=1) fm = fm.view(bs, -1) x = self.fc(fm) - ## hm size as 64x64 - if self.input_size==(128,128): - hm = self.hm(encx2) - elif self.input_size == (256, 256): - hm=self.hm(encx4) - else: - print('please change the model ,by default ,we use 64x64 for hm') - raise NotImplementedError + hm = self.hm(encx4) - return x,hm,[encx2,encx4,encx8,encx16,x] + return x, hm, [hm] class TeacherNet(nn.Module): - def __init__(self,inp_size=(128,128)): + def __init__(self, inp_size=(128, 128)): super(TeacherNet, self).__init__() self.input_size = inp_size - self.encoder = timm.create_model(model_name='efficientnet_b5.in12k_ft_in1k', + self.encoder = timm.create_model(model_name='hrnet_w18', pretrained=True, features_only=True, - out_indices=[0,1,2,3], - bn_momentum=bn_momentum, + out_indices=[0, 1, 2, 3], in_chans=3, ) - self.encoder.out_channels=[3, 24 , 40, 64,176] + self.encoder.out_channels = [3, 64, 128, 256, 512] - self.matting = Matting(self.encoder.out_channels) + self.decoder = Decoder(self.encoder.out_channels) self._avg_pooling = nn.AdaptiveAvgPool2d(1) - self.fc = nn.Linear(640, 98 * 2 + 3 + 4, bias=True) + self.fc = nn.Linear(640, 3 + 4, bias=True) - self.hm = nn.Conv2d(in_channels=32, out_channels=98, kernel_size=3, stride=1, padding=1, bias=True) + self.hm = nn.Conv2d(in_channels=128, out_channels=98*3, kernel_size=1, stride=1, padding=0, bias=True) weight_init(self.fc) weight_init(self.hm) + def forward(self, x): """Sequentially pass `x` trough model`s encoder, decoder and heads""" - bs=x.size(0) - features=self.encoder(x) - - features=[x]+features - [encx2, encx4, encx8, encx16] = self.matting(features) + bs = x.size(0) + features = self.encoder(x) + features = [x] + features + [encx4, encx8, encx16] = self.decoder(features) fmx16 = self._avg_pooling(encx16) fmx8 = self._avg_pooling(encx8) fmx4 = self._avg_pooling(encx4) - fmx2 = self._avg_pooling(encx2) - fm = torch.cat([fmx2,fmx4, fmx8, fmx16], dim=1) + + fm = torch.cat([fmx4, fmx8, fmx16], dim=1) fm = fm.view(bs, -1) x = self.fc(fm) - if self.input_size == (128, 128): - hm = self.hm(encx2) - elif self.input_size == (256, 256): - hm = self.hm(encx4) - else: - print('please change the model ,by default ,we use 64x64 for hm') - raise NotImplementedError + hm = self.hm(encx4) - return x,hm, [encx2, encx4, encx8, encx16, x] + return x, hm, [hm] -class COTRAIN(nn.Module): - def __init__(self,inference=False,inp_size=(128,128)): - super(COTRAIN, self).__init__() +class AWingLoss(nn.Module): + def __init__(self, omega=14, theta=0.5, epsilon=1, alpha=2.1, use_weight_map=True): + super(AWingLoss, self).__init__() + self.omega = omega + self.theta = theta + self.epsilon = epsilon + self.alpha = alpha + self.use_weight_map = use_weight_map - self.inference=inference - self.student=Net(inp_size) - self.teacher=TeacherNet(inp_size) + def __repr__(self): + return 
"AWingLoss()" - self.MSELoss=nn.MSELoss() + def generate_weight_map(self, heatmap, k_size=3, w=10): + dilate = F.max_pool2d(heatmap, kernel_size=k_size, stride=1, padding=1) + weight_map = torch.where(dilate < 0.2, torch.zeros_like(heatmap), torch.ones_like(heatmap)) + return w * weight_map + 1 + def forward(self, output, groundtruth): + """ + input: b x n x h x w + output: b x n x h x w => 1 + """ + delta = (output - groundtruth).abs() + A = self.omega * (1 / (1 + torch.pow(self.theta / self.epsilon, self.alpha - groundtruth))) * ( + self.alpha - groundtruth) * \ + (torch.pow(self.theta / self.epsilon, self.alpha - groundtruth - 1)) * (1 / self.epsilon) + C = self.theta * A - self.omega * \ + torch.log(1 + torch.pow(self.theta / self.epsilon, self.alpha - groundtruth)) + loss = torch.where(delta < self.theta, + self.omega * torch.log(1 + torch.pow(delta / self.epsilon, self.alpha - groundtruth)), + (A * delta - C)) + if self.use_weight_map: + weight = self.generate_weight_map(groundtruth) + loss = loss * weight + return loss - self.BCELoss = nn.BCEWithLogitsLoss(reduction='none') +class COTRAIN(nn.Module): + def __init__(self, inference=None, inp_size=(128, 128)): + super(COTRAIN, self).__init__() - self.act=nn.Sigmoid() + self.inference = inference + self.student = Net(inp_size) + self.teacher = TeacherNet(inp_size) + self.MSELoss = nn.MSELoss() + self.MSELoss_no_reduction = nn.MSELoss(reduction='none') + self.BCELoss = nn.BCEWithLogitsLoss(reduction='none') - def distill_loss(self,student_pres,teacher_pres): - num_level=len(student_pres) - loss=0 - for i in range(num_level): - loss+=self.MSELoss(student_pres[i],teacher_pres[i]) + self.Awing = AWingLoss() + if inference=='teacher': + self.run_with_teacher=True - return loss/num_level + def distill_loss(self, student_pres, teacher_pres): + num_level = len(student_pres) + loss = 0 + for i in range(num_level): + loss += self.MSELoss(student_pres[i], teacher_pres[i].detach()) - def criterion(self,y_pred, y_true): + return loss / num_level - return 0.5*self.BCELoss(y_pred, y_true) + 0.5*self.DiceLoss(y_pred, y_true) + def criterion(self, y_pred, y_true): - def _wing_loss(self,landmarks, labels, w=10.0, epsilon=2.0, weights=1.): + return 0.5 * self.BCELoss(y_pred, y_true) + 0.5 * self.DiceLoss(y_pred, y_true) + + def _wing_loss(self, landmarks, labels, w=10.0, epsilon=2.0, weights=1.): """ Arguments: landmarks, labels: float tensors with shape [batch_size, landmarks]. 
landmarks means x1,x2,x3,x4...y1,y2,y3,y4 1-D @@ -461,47 +432,37 @@ def _wing_loss(self,landmarks, labels, w=10.0, epsilon=2.0, weights=1.): ) + - losses=losses*weights - loss = torch.sum(torch.mean(losses , dim=[0])) - - return loss - + return losses - def loss(self,predict_keypoints, label_keypoints): + def loss(self, predict_keypoints, label_keypoints): - landmark_label = label_keypoints[:, :98*2] - pose_label = label_keypoints[:, 196:199] + pose_label = label_keypoints[:, 98*2:98*2+3] - cls_label=label_keypoints[:,199:199+4] + cls_label = label_keypoints[:, 98*2+3:98*2+3 + 4] # leye_cls_label = label_keypoints[:, 199] # reye_cls_label = label_keypoints[:, 200] # mouth_cls_label = label_keypoints[:, 201] # big_mouth_cls_label = label_keypoints[:, 202] - landmark_weights=label_keypoints[:,199+4:199+4+196] - cls_weights = label_keypoints[:, -4:] - - - landmark_predict = predict_keypoints[:, :98*2] - pose_predict = predict_keypoints[:, 196:199] + pose_predict = predict_keypoints[:, :3] # leye_cls_predict = predict_keypoints[:, 199] # reye_cls_predict = predict_keypoints[:, 200] # mouth_cls_predict = predict_keypoints[:, 201] # big_mouth_cls_predict = predict_keypoints[:, 202] - cls_label_predict= predict_keypoints[:, 199:199+4] - + cls_label_predict = predict_keypoints[:, 3:3 + 4] - loss = self._wing_loss(landmark_predict, landmark_label,weights=landmark_weights) + #loss = self._wing_loss(landmark_predict, landmark_label, weights=landmark_weights) loss_pose = self.MSELoss(pose_predict, pose_label) - cls_loss=self.BCELoss ( cls_label_predict,cls_label) - cls_loss=cls_loss*cls_weights + cls_loss = self.BCELoss(cls_label_predict, cls_label) + cls_loss = cls_loss * cls_weights - cls_loss=torch.sum(cls_loss)/torch.sum(cls_weights) + cls_loss = torch.sum(cls_loss) / torch.sum(cls_weights) # leye_loss = self.BCELoss (leye_cls_predict, leye_cls_label) # reye_loss = self.BCELoss (reye_cls_predict, reye_cls_label) @@ -510,85 +471,132 @@ def loss(self,predict_keypoints, label_keypoints): # mouth_loss_big = self.BCELoss (big_mouth_cls_predict, big_mouth_cls_label) # mouth_loss = mouth_loss + mouth_loss_big + return loss_pose + cls_loss - return loss + loss_pose + cls_loss - def hm_loss(self,predict_hm, label_hm): + def offside_loss(self,pre,gt,weight): - bs=label_hm.size(0) - - hm_loss = self.BCELoss(predict_hm, label_hm) + loss=self._wing_loss(pre,gt) - hm_loss=torch.sum(hm_loss)/bs/64./64. - return hm_loss + loss=loss*weight + loss=torch.sum(loss)/torch.sum(weight) + return loss + def hm_loss(self, predict_hm, label_hm): + bs = label_hm.size(0) - def postp(self,hm): - bs=hm.size(0) - print(hm.size()) - hm=hm.reshape([bs,98,-1]) + hm=label_hm[:,:98,...] + hm_prd = predict_hm[:, :98, ...] + hm_loss = self.Awing(hm_prd, hm) - hm=torch.argmax(hm,dim=2) + hm_loss = torch.mean(hm_loss) + offside_pre_x=predict_hm[:,98:2*98,...] + offside_gt_x = label_hm[:, 98:2*98, ...] - X=hm%64 - Y=hm//64 + offside_loss_x=self.offside_loss(offside_pre_x,offside_gt_x,label_hm[:,:98,...]) - loc=torch.stack([X,Y],dim=2).float()/64 + offside_pre_y = predict_hm[:, 2 * 98:, ...] + offside_gt_y = label_hm[:, 2 * 98: , ...] + offside_loss_y = self.offside_loss(offside_pre_y, offside_gt_y, label_hm[:, :98, ...]) - return loc - def forward(self, x,gt=None,gt_hm=None): + ##offside_loss - student_pre,student_hm,student_fms=self.student(x) + return hm_loss+offside_loss_x+offside_loss_y - teacher_pre,teacher_hm, teacher_fms = self.teacher(x) + def postp(self, hm): + # + hm_score=hm[:,:98,...] 
+ hm_H = hm.size(2) + hm_W = hm.size(3) + bs = hm.size(0) - if self.inference: - # teacher_pre[:,-4:]=torch.nn.Sigmoid()(teacher_pre[:,-4:]) - # teacher_hm = torch.nn.Sigmoid()(teacher_hm) - # - # loc=self.postp(teacher_hm) + hm_score = hm_score.reshape([bs, 98, -1]) - return teacher_pre#,teacher_hm + hm_indx = torch.argmax(hm_score, dim=2) - distill_loss=self.distill_loss(student_fms,teacher_fms) + #### add offside - student_loss=self.loss(student_pre,gt) + offside_x=hm[:,98:2*98,...].reshape([bs, 98, -1]) + offside_y = hm[:, 2*98:, ...].reshape([bs, 98, -1]) - student_hm_loss=self.hm_loss(student_hm,gt_hm) - student_loss=student_loss+student_hm_loss - teacher_loss=self.loss(teacher_pre,gt) + gether_indx=hm_indx.unsqueeze(-1) + offside_x = torch.gather(offside_x,dim=-1,index=gether_indx).squeeze(-1) + offside_y = torch.gather(offside_y,dim=-1,index=gether_indx).squeeze(-1) + + + X = hm_indx % hm_W + Y = hm_indx // hm_W + + + X_fix = X + offside_x + Y_fix = Y + offside_y + + loc = torch.stack([X, Y], dim=2).float() + loc[..., 0] /= hm_W + loc[..., 1] /= hm_H + loc = loc.view(bs, -1) - teacher_hm_loss = self.hm_loss(teacher_hm, gt_hm) - teacher_loss=teacher_loss+teacher_hm_loss - return student_loss,teacher_loss,distill_loss,student_pre,teacher_pre + loc_fix = torch.stack([X_fix, Y_fix], dim=2).float() + loc_fix[..., 0] /= hm_W + loc_fix[..., 1] /= hm_H + loc_fix = loc_fix.view(bs, -1) + return loc,loc_fix + def forward(self, x, gt=None, gt_hm=None): + student_pre, student_hm, student_fms = self.student(x) + teacher_pre, teacher_hm, teacher_fms = self.teacher(x) + if self.inference : + if self.inference=='teacher': + hm_used=teacher_hm + else: + hm_used=student_hm + teacher_pre, teacher_pre_full = self.postp(hm_used) + return teacher_pre_full # ,teacher_hm + distill_loss = self.distill_loss(student_fms, teacher_fms) -if __name__=='__main__': + student_loss = self.loss(student_pre, gt) + + student_hm_loss = self.hm_loss(student_hm, gt_hm) + + student_loss = student_loss + student_hm_loss + teacher_loss = self.loss(teacher_pre, gt) + + teacher_hm_loss = self.hm_loss(teacher_hm, gt_hm) + + teacher_loss = teacher_loss + teacher_hm_loss + + ### decode hm + student_pre,student_pre_full = self.postp(student_hm) + teacher_pre,teacher_pre_full = self.postp(teacher_hm) + + return student_loss, teacher_loss, distill_loss, student_pre, student_pre_full, teacher_pre, teacher_pre_full + + +if __name__ == '__main__': import torch import torchvision from thop import profile - # dummy_x = torch.randn(1, 3, 288, 160, device='cpu') - - model = COTRAIN(inference=True) + model = COTRAIN(inference='teacher') input = torch.randn(1, 3, 128, 128) flops, params = profile(model, inputs=(input,)) - print(flops/1024/1024) + print(flops / 1024 / 1024 / 1024) + print(params / 1024 / 1024) diff --git a/TRAIN/face_landmark/lib/core/base_trainer/net_work.py b/TRAIN/face_landmark/lib/core/base_trainer/net_work.py index ee71cd9..2f6d69b 100755 --- a/TRAIN/face_landmark/lib/core/base_trainer/net_work.py +++ b/TRAIN/face_landmark/lib/core/base_trainer/net_work.py @@ -234,7 +234,7 @@ def distributed_train_epoch(epoch_num): with torch.cuda.amp.autocast(enabled=self.fp16): - student_loss, teacher_loss, distill_loss,mate,_ = self.model(data,kps,hms) + student_loss, teacher_loss, distill_loss,_,_,_,_ = self.model(data,kps,hms) # calculate the final loss, backward the loss, and update the model current_loss = student_loss+ teacher_loss+ distill_loss @@ -266,7 +266,6 @@ def distributed_train_epoch(epoch_num): if 
self.iter_num%cfg.TRAIN.log_interval==0: - log_message = '[fold %d], ' \ 'Train Step %d, ' \ '[%d/%d ] ' \ @@ -299,7 +298,9 @@ def distributed_test_epoch(epoch_num): summary_student_mad= AverageMeter() summary_student_mse= AverageMeter() summary_student_nme = AverageMeter() + summary_student_fix_nme = AverageMeter() summary_teacher_nme = AverageMeter() + summary_teacher_fix_nme = AverageMeter() self.model.eval() @@ -313,7 +314,7 @@ def distributed_test_epoch(epoch_num): hms= hms.to(self.device).float() batch_size = data.shape[0] - loss,_,_,student_output,teacher_output = self.model(data,kps,hms) + loss,_,_,student_pre,student_pre_fix,teacher_pre,teacher_pre_fix = self.model(data,kps,hms) if self.ddp: @@ -323,20 +324,32 @@ def distributed_test_epoch(epoch_num): - student_val_mad,student_val_mse,student_val_nme =self.metric(kps[:,:98*2],student_output[:,:98*2]) - teacher_val_mad, teacher_val_mse, teacher_val_nme = self.metric(kps[:, :98 * 2], teacher_output[:, :98 * 2]) + student_val_mad,student_val_mse,student_val_nme =self.metric(kps[:,:98*2], + student_pre[:,:98*2]) + student_fix_val_mad, student_fix_val_mse, student_fix_val_nme = self.metric(kps[:, :98 * 2], + student_pre_fix[:, :98 * 2]) + + techer_val_mad, techer_val_mse, techer_val_nme = self.metric(kps[:, :98 * 2], + teacher_pre[:, :98 * 2]) + techer_fix_val_mad, techer_fix_val_mse, techer_fix_val_nme = self.metric(kps[:, :98 * 2], + teacher_pre_fix[:, :98 * 2]) if self.ddp: torch.distributed.all_reduce(student_val_mad.div_(torch.distributed.get_world_size())) torch.distributed.all_reduce(student_val_mse.div_(torch.distributed.get_world_size())) torch.distributed.all_reduce(student_val_nme.div_(torch.distributed.get_world_size())) - torch.distributed.all_reduce(teacher_val_nme.div_(torch.distributed.get_world_size())) + torch.distributed.all_reduce(student_fix_val_nme.div_(torch.distributed.get_world_size())) + + torch.distributed.all_reduce(techer_val_nme.div_(torch.distributed.get_world_size())) + torch.distributed.all_reduce(techer_fix_val_nme.div_(torch.distributed.get_world_size())) summary_student_mad.update(student_val_mad.detach().item(), batch_size) summary_student_mse.update(student_val_mse.detach().item(), batch_size) summary_student_nme.update(student_val_nme.detach().item(), batch_size) - summary_teacher_nme.update(teacher_val_nme.detach().item(), batch_size) + summary_student_fix_nme.update(student_fix_val_nme.detach().item(), batch_size) + summary_teacher_nme.update(techer_val_nme.detach().item(), batch_size) + summary_teacher_fix_nme.update(techer_fix_val_nme.detach().item(), batch_size) if step % cfg.TRAIN.log_interval == 0: @@ -344,19 +357,22 @@ def distributed_test_epoch(epoch_num): 'Val Step %d, ' \ 'summary_loss: %.6f, ' \ 'student_summary_nme: %.6f, ' \ + 'student_summary_nme_fix: %.6f, ' \ 'teacher_summary_nme: %.6f, ' \ + 'teacher_summary_nme_fix: %.6f, ' \ 'time: %.6f' % ( self.fold,step, summary_loss.avg, summary_student_nme.avg, + summary_student_fix_nme.avg, summary_teacher_nme.avg, + summary_teacher_fix_nme.avg, time.time() - t) logger.info(log_message) - - - return summary_loss,summary_student_mad,summary_student_mse,summary_student_nme,summary_teacher_nme + return summary_loss,summary_student_mad,summary_student_mse,summary_student_nme,summary_student_fix_nme,\ + summary_teacher_nme,summary_teacher_fix_nme @@ -388,15 +404,19 @@ def distributed_test_epoch(epoch_num): if epoch%cfg.TRAIN.test_interval==0 and epoch>0 or epoch%10==0: - summary_loss 
,summary_student_mad,summary_student_mse,summary_student_nme,summary_teacher_nme= distributed_test_epoch(epoch) + summary_loss ,summary_student_mad,summary_student_mse,summary_student_nme,summary_student_fix_nme,\ + summary_teacher_nme,summary_teacher_fix_nme\ + = distributed_test_epoch(epoch) val_epoch_log_message = '[fold %d], ' \ '[RESULT]: VAL. Epoch: %d,' \ ' summary_loss: %.5f,' \ ' student_mad_score: %.5f,' \ ' student_mse_score: %.5f,' \ - ' student_nme_score: %.5f,' \ - ' teacher_nme_score: %.5f,' \ + 'student_summary_nme: %.5f, ' \ + 'student_summary_nme_fix: %.5f, ' \ + 'teacher_summary_nme: %.5f, ' \ + 'teacher_summary_nme_fix: %.5f, ' \ ' time:%.5f' % ( self.fold, epoch, @@ -404,7 +424,9 @@ def distributed_test_epoch(epoch_num): summary_student_mad.avg, summary_student_mse.avg, summary_student_nme.avg, + summary_student_fix_nme.avg, summary_teacher_nme.avg, + summary_teacher_fix_nme.avg, (time.time() - t)) logger.info(val_epoch_log_message) @@ -419,10 +441,11 @@ def distributed_test_epoch(epoch_num): #### save the model every end of epoch #### save the model every end of epoch - current_model_saved_name='./models/fold%d_epoch_%d_val_loss_%.6f_val_nme_%.6f.pth'%(self.fold, + current_model_saved_name='./models/fold%d_epoch_%d_val_loss_%.6f_student_nme_%.5f_teacher_nme_%.5f.pth'%(self.fold, epoch, summary_loss.avg, - summary_student_nme.avg,) + summary_student_fix_nme.avg, + summary_teacher_fix_nme.avg) logger.info('A model saved to %s' % current_model_saved_name) #### save the model every end of epoch if self.ddp and torch.distributed.get_rank() == 0 : diff --git a/TRAIN/face_landmark/lib/dataset/dataietr.py b/TRAIN/face_landmark/lib/dataset/dataietr.py index 0755193..2b208e0 100644 --- a/TRAIN/face_landmark/lib/dataset/dataietr.py +++ b/TRAIN/face_landmark/lib/dataset/dataietr.py @@ -1,5 +1,3 @@ - - import random import cv2 import json @@ -9,68 +7,59 @@ from lib.utils.logger import logger - import traceback from train_config import config as cfg import albumentations as A import os import copy -from lib.dataset.augmentor.augmentation import Rotate_aug,\ - Affine_aug,\ - Mirror,\ - Padding_aug +from lib.dataset.augmentor.augmentation import Rotate_aug, \ + Affine_aug, \ + Mirror, \ + Padding_aug from lib.dataset.headpose import get_head_pose - - cv2.setNumThreads(0) cv2.ocl.setUseOpenCL(False) + class AlaskaDataIter(): - def __init__(self, df,img_root, - training_flag=True,shuffle=True): + def __init__(self, df, img_root, + training_flag=True, shuffle=True): self.eye_close_thres = 0.03 self.mouth_close_thres = 0.02 self.big_mouth_open_thres = 0.08 - - self.training_flag = training_flag self.shuffle = shuffle - self.img_root_path=img_root - + self.img_root_path = img_root - self.df=df + self.df = df - if self.training_flag: - self.balance() + # if self.training_flag: + # self.balance() self.train_trans = A.Compose([ - A.RandomBrightnessContrast(p=0.5), - A.HueSaturationValue(p=0.5), - A.GaussianBlur(p=0.3), - A.ToGray(p=0.1), - A.GaussNoise(p=0.2), - A.CoarseDropout(max_holes=8,max_width=16, - max_height=16) - - + A.RandomBrightnessContrast(p=0.5), + A.HueSaturationValue(p=0.5), + A.GaussianBlur(p=0.3), + A.ToGray(p=0.1), + A.GaussNoise(p=0.2), + A.CoarseDropout(max_holes=8, max_width=16, + max_height=16, + p=0.2) ]) - - self.val_trans=A.Compose([ + self.val_trans = A.Compose([ A.Resize(height=cfg.MODEL.hin, width=cfg.MODEL.win) ]) - - def __getitem__(self, item): return self.single_map_func(self.df[item], self.training_flag) @@ -82,16 +71,15 @@ def __len__(self): def 
balance(self, ): df = copy.deepcopy(self.df) - expanded=[] + expanded = [] lar_count = 0 for i in tqdm(range(len(df))): - cur_df=df[i] + cur_df = df[i] ### 300w balance, according to keypoints - ann=cur_df.split() - label =np.array(ann[:98*2],dtype=np.float32).reshape([-1,2]) - + ann = cur_df.split() + label = np.array(ann[:98 * 2], dtype=np.float32).reshape([-1, 2]) bbox = [float(np.min(label[:, 0])), float(np.min(label[:, 1])), float(np.max(label[:, 0])), float(np.max(label[:, 1]))] @@ -101,7 +89,7 @@ def balance(self, ): # if bbox_width < 50 or bbox_height < 50: # res_anns.remove(ann) - cnt=0 + cnt = 0 left_eye_close = np.sqrt( np.square(label[62, 0] - label[66, 0]) + np.square(label[62, 1] - label[66, 1])) / bbox_height < self.eye_close_thres @@ -115,15 +103,13 @@ def balance(self, ): expanded.append(cur_df) # lar_count += 1 - ##half face if np.sqrt(np.square(label[60, 0] - label[72, 0]) + np.square(label[60, 1] - label[72, 1])) / bbox_width < 0.5: for i in range(5): expanded.append(cur_df) - - #open mouth + # open mouth if np.sqrt(np.square(label[90, 0] - label[94, 0]) + np.square(label[90, 1] - label[94, 1])) / bbox_height > 0.15: for i in range(2): @@ -136,17 +122,16 @@ def balance(self, ): ##########eyes diff aug if left_eye_close and not right_eye_close: - for i in range(20): + for i in range(15): expanded.append(cur_df) lar_count += 1 if not left_eye_close and right_eye_close: - for i in range(20): + for i in range(15): expanded.append(cur_df) lar_count += 1 - print(lar_count) - self.df+=expanded + self.df += expanded logger.info('befor balance the dataset contains %d images' % (len(df))) logger.info('after balanced the datasets contains %d samples' % (len(self.df))) @@ -154,10 +139,10 @@ def augmentationCropImage(self, img, bbox, joints=None, is_training=True): bbox = np.array(bbox).reshape(4, ).astype(np.float32) - bbox_width=bbox[2]-bbox[0] - bbox_height=bbox[3]-bbox[1] + bbox_width = bbox[2] - bbox[0] + bbox_height = bbox[3] - bbox[1] - add = int(max(bbox_width,bbox_height)) + add = int(max(bbox_width, bbox_height)) bimg = cv2.copyMakeBorder(img, add, add, add, add, borderType=cv2.BORDER_CONSTANT) @@ -207,14 +192,14 @@ def augmentationCropImage(self, img, bbox, joints=None, is_training=True): joints[:, 1] = joints[:, 1] * cfg.MODEL.hin return img, joints - def gaussian_k(self,x0, y0, sigma, width, height): + def gaussian_k(self, x0, y0, sigma, width, height): """ Make a square gaussian kernel centered at (x0, y0) with sigma as SD. 
""" x = np.arange(0, width, 1, float) ## (width,) y = np.arange(0, height, 1, float)[:, np.newaxis] ## (height,1) return np.exp(-((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2)) - def generate_hm(self,height, width, landmarks, s=3): + def generate_hm(self, height, width, landmarks, s=3): """ Generate a full Heap Map for every landmarks in an array Args: height : The height of Heat Map (the height of target output) @@ -228,58 +213,63 @@ def generate_hm(self,height, width, landmarks, s=3): # if not np.array_equal(landmarks[i], [-1, -1]): hm[:, :, i] = self.gaussian_k(landmarks[i][0], - landmarks[i][1], - s, height, width) + landmarks[i][1], + s, height, width) # else: # hm[:, :, i] = np.zeros((height, width)) - return hm - + ### make offside - def doeys(self,img,kps): - if random.uniform(0,1)<0.5: - eye_region=kps[60:67,:] - weights_labelel=[0,1] + offside_y, offside_x = np.meshgrid(np.arange(0, height), np.arange(0, width), indexing='ij') + offside_y=np.expand_dims(offside_y,axis=-1) + offside_y=np.repeat(offside_y,Nlandmarks,axis=-1) - else: - - eye_region = kps[68:75, :] - weights_labelel = [1,0] + offside_x = np.expand_dims(offside_x, axis=-1) + offside_x = np.repeat(offside_x, Nlandmarks, axis=-1) + offside_x = landmarks[:, 0] - offside_x + offside_y = landmarks[:, 1] - offside_y + hm=np.concatenate([hm,offside_x,offside_y],axis=-1) + return hm - xmin = int(np.clip(np.min(eye_region[:,0])-10,0,128)) - ymin = int(np.clip(np.min(eye_region[:, 1])-10,0,128)) - xmax = int(np.clip(np.max(eye_region[:, 0])+10,0,128)) - ymax = int( np.clip(np.max(eye_region[:, 1])+10,0,128)) - + def doeys(self, img, kps): - img[ymin:ymax,xmin:xmax,:]=0 + if random.uniform(0, 1) < 0.5: + eye_region = kps[60:67, :] + weights_labelel = [0, 1] - return img,weights_labelel + else: + eye_region = kps[68:75, :] + weights_labelel = [1, 0] + xmin = int(np.clip(np.min(eye_region[:, 0]) - 10, 0, cfg.MODEL.win)) + ymin = int(np.clip(np.min(eye_region[:, 1]) - 10, 0, cfg.MODEL.hin)) + xmax = int(np.clip(np.max(eye_region[:, 0]) + 10, 0, cfg.MODEL.win)) + ymax = int(np.clip(np.max(eye_region[:, 1]) + 10, 0, cfg.MODEL.hin)) + img[ymin:ymax, xmin:xmax, :] = 0 + return img, weights_labelel def single_map_func(self, dp, is_training): """Data augmentation function.""" ####customed here - if 'wink' in dp: dp = dp.split() kps = dp[:98 * 2] fn = dp[-1] - image = cv2.imread( fn) + image = cv2.imread(fn) else: - dp=dp.split() - kps=dp[:98*2] - fn=dp[-1] - image=cv2.imread(os.path.join(self.img_root_path,fn)) + dp = dp.split() + kps = dp[:98 * 2] + fn = dp[-1] + image = cv2.imread(os.path.join(self.img_root_path, fn)) - kps=np.array(kps,dtype=np.float32).reshape([-1,2]) + kps = np.array(kps, dtype=np.float32).reshape([-1, 2]) bbox = [float(np.min(kps[:, 0])), float(np.min(kps[:, 1])), float(np.max(kps[:, 0])), float(np.max(kps[:, 1]))] @@ -317,11 +307,9 @@ def single_map_func(self, dp, is_training): cls_label[0] = 1 if np.sqrt(np.square(label[70, 0] - label[74, 0]) + - np.square(label[70, 1] - label[74, 1])) / cfg.MODEL.hin < self.eye_close_thres : + np.square(label[70, 1] - label[74, 1])) / cfg.MODEL.hin < self.eye_close_thres: cls_label[1] = 1 - - if np.sqrt(np.square(label[89, 0] - label[95, 0]) + np.square(label[89, 1] - label[95, 1])) / cfg.MODEL.hin < self.mouth_close_thres \ or np.sqrt(np.square(label[90, 0] - label[94, 0]) + @@ -335,19 +323,16 @@ def single_map_func(self, dp, is_training): np.square(label[90, 1] - label[94, 1])) / cfg.MODEL.hin > self.big_mouth_open_thres: cls_label[3] = 1 - - - if is_training: kps_weight = 
np.ones_like(label) cls_weight = np.ones_like(cls_label) - if random.uniform(0,1)>0.5: + if random.uniform(0, 1) > 0.5: - crop_image,weights=self.doeys(crop_image,label) + crop_image, weights = self.doeys(crop_image, label) - if weights==[0,1]: - kps_weight[60:67,:]=0 - cls_weight[0]=0 + if weights == [0, 1]: + kps_weight[60:67, :] = 0 + cls_weight[0] = 0 else: kps_weight[68:75, :] = 0 cls_weight[1] = 0 @@ -356,11 +341,6 @@ def single_map_func(self, dp, is_training): kps_weight = np.ones_like(label) cls_weight = np.ones_like(cls_label) - - - - - crop_image_height, crop_image_width, _ = crop_image.shape label = label.astype(np.float32) @@ -372,20 +352,21 @@ def single_map_func(self, dp, is_training): crop_image = np.transpose(crop_image, axes=[2, 0, 1]) crop_image /= 255. - label = label.reshape([-1]).astype(np.float32) - kps_weight= kps_weight.reshape([-1]).astype(np.float32) + kps_weight = kps_weight.reshape([-1]).astype(np.float32) cls_label = cls_label.astype(np.float32) - cls_weight= cls_weight.astype(np.float32) + cls_weight = cls_weight.astype(np.float32) - total_label = np.concatenate([label, PRY, cls_label,kps_weight,cls_weight], axis=0) + total_label = np.concatenate([label, PRY, cls_label, kps_weight, cls_weight], axis=0) + ### hm size inputsize//4 + kps = label.reshape([-1, 2]) + kps[:, 0] *= cfg.MODEL.win // 4 + kps[:, 1] *= cfg.MODEL.hin // 4 - ### hm size 64x64 - kps=label.reshape([-1,2])*64 - - hm=self.generate_hm(64,64,kps,3) + hm = self.generate_hm(cfg.MODEL.hin // 4, cfg.MODEL.win // 4, kps, 3) hm = np.transpose(hm, axes=[2, 0, 1]) - return fn,crop_image,total_label,hm + + return fn, crop_image, total_label, hm diff --git a/TRAIN/face_landmark/make_json.py b/TRAIN/face_landmark/make_json.py deleted file mode 100644 index 26082a0..0000000 --- a/TRAIN/face_landmark/make_json.py +++ /dev/null @@ -1,211 +0,0 @@ -import os -import random -import numpy as np -import json -import traceback -import cv2 -import pandas as pd - -from tqdm import tqdm -''' -i decide to merge more data from CelebA, the data anns will be complex, so json maybe a better way. 
-''' - - - - - -data_dir='/media/lz/ssd_2/coco_data/facelandmark/PUB' ########points to your director,300w -#celeba_data_dir='CELEBA' ########points to your director,CELEBA - - -train_json='./train.csv' -val_json='./val.csv' -save_dir='../tmp_crop_data_face_landmark_pytorch' - -if not os.access(save_dir,os.F_OK): - os.mkdir(save_dir) - -def GetFileList(dir, fileList): - newDir = dir - if os.path.isfile(dir): - fileList.append(dir) - elif os.path.isdir(dir): - for s in os.listdir(dir): - - # if s == "pts": - # continue - newDir=os.path.join(dir,s) - GetFileList(newDir, fileList) - return fileList - - - - -pic_list=[] -GetFileList(data_dir,pic_list) - -pic_list=[x for x in pic_list if '.jpg' in x or 'png' in x or 'jpeg' in x ] - - -ratio=0.95 -train_list=[x for x in pic_list if 'AFW' not in x] -val_list=[x for x in pic_list if 'AFW' in x] - -# train_list=[x for x in pic_list if '300W/' not in x] -# val_list=[x for x in pic_list if '300W/' in x] - - -def process_data(data_list,csv_nm): - - - global cnt - image_list=[] - keypoint_list=[] - - for pic in tqdm(data_list): - one_image_ann={} - - ### image_path - one_image_ann['image_path_raw']=pic - - #### keypoints - pts=pic.rsplit('.',1)[0]+'.pts' - if os.access(pic,os.F_OK) and os.access(pts,os.F_OK): - try: - tmp=[] - with open(pts) as p_f: - labels=p_f.readlines()[3:-1] - for _one_p in labels: - xy = _one_p.rstrip().split(' ') - tmp.append([float(xy[0]),float(xy[1])]) - - one_image_ann['keypoints'] = tmp - - label = np.array(tmp).reshape((-1, 2)) - bbox = [float(np.min(label[:, 0])), float(np.min(label[:, 1])), float(np.max(label[:, 0])), float(np.max(label[:, 1]))] - one_image_ann['bbox'] = bbox - - ### placeholder - one_image_ann['attr'] = None - - ###### crop it - - image=cv2.imread(one_image_ann['image_path_raw'],cv2.IMREAD_COLOR) - - h,w,c=image.shape - - ##expanded for - bbox_int = [int(x) for x in bbox] - bbox_width = bbox_int[2] - bbox_int[0] - bbox_height = bbox_int[3] - bbox_int[1] - - center_x=(bbox_int[2] + bbox_int[0])//2 - center_y=(bbox_int[3] + bbox_int[1])//2 - - x1=int(center_x-bbox_width*2) - x1=x1 if x1>=0 else 0 - - y1 = int(center_y - bbox_height*2) - y1 = y1 if y1 >= 0 else 0 - - x2 = int(center_x + bbox_width*2) - x2 = x2 if x2 512: - scale=512/max(hh,ww) - else: - scale=1 - crop_face=cv2.resize(crop_face,None,fx=scale,fy=scale) - - one_image_ann['bbox'][0] *= scale - one_image_ann['bbox'][1] *= scale - one_image_ann['bbox'][2] *= scale - one_image_ann['bbox'][3] *= scale - - - x1*=scale - y1 *= scale - x2 *= scale - y2 *= scale - for i in range(len(one_image_ann['keypoints'])): - one_image_ann['keypoints'][i][0]*= scale - one_image_ann['keypoints'][i][1]*= scale - - fname= one_image_ann['image_path_raw'].split('PUB/')[-1] - - fname=fname.replace('/','_').replace('/','_') - - - # cv2.imwrite(one_image_ann['image_name'],crop_face) - - - one_image_ann['bbox'][0] -= x1 - one_image_ann['bbox'][1] -= y1 - one_image_ann['bbox'][2] -= x1 - one_image_ann['bbox'][3] -= y1 - - for i in range(len(one_image_ann['keypoints'])): - one_image_ann['keypoints'][i][0]-=x1 - one_image_ann['keypoints'][i][1]-=y1 - - - keypoint=list(np.array(one_image_ann['keypoints']).reshape(-1).astype(np.float32)) - # [x1,y1,x2,y2]=[int(x) for x in one_image_ann['bbox']] - # - # cv2.rectangle(crop_face,(x1,y1),(x2,y2),thickness=2,color=(255,0,0)) - # - # landmark=np.array(one_image_ann['keypoints']) - # - # for _index in range(landmark.shape[0]): - # x_y = landmark[_index] - # # print(x_y) - # cv2.circle(crop_face, center=(int(x_y[0] ), - # int(x_y[1] 
)), - # color=(255, 0, 0), radius=2, thickness=4) - # - # - # cv2.imshow('ss', crop_face) - # cv2.waitKey(0) - - image_list.append(fname) - keypoint_list.append(keypoint) - # json_list.append(one_image_ann) - except: - print(pic) - - print(traceback.print_exc()) - - - # with open(json_nm, 'w') as f: - # json.dump(json_list, f, indent=2) - - data_dict={'image':image_list, - 'keypoint':keypoint_list} - df=pd.DataFrame(data_dict) - - df.to_csv(csv_nm,index=False) - - -process_data(train_list,train_json) - - -process_data(val_list,val_json) - - - - - - - - - - diff --git a/TRAIN/face_landmark/tools/convert_to_onnx.py b/TRAIN/face_landmark/tools/convert_to_onnx.py index 53645a3..aff32e2 100644 --- a/TRAIN/face_landmark/tools/convert_to_onnx.py +++ b/TRAIN/face_landmark/tools/convert_to_onnx.py @@ -13,19 +13,23 @@ parser = argparse.ArgumentParser() -parser.add_argument('--model', type=str,default=None, help='the thres for detect') +parser.add_argument('--weight', type=str,default=None, help='the thres for detect') +parser.add_argument('--input_size', type=int,default=256, help='the thres for detect') +parser.add_argument('--model', type=str,default='teacher', help='teacher or student to inference') args = parser.parse_args() -model_path=args.model +weight=args.weight +input_size=args.input_size +model=args.model device=torch.device('cpu') -dummy_input = torch.randn(1, 3,128, 128 , device='cpu') +dummy_input = torch.randn(1, 3,input_size, input_size , device='cpu') -style_model = COTRAIN(inference=True).to(device) +style_model = COTRAIN(inference=model).to(device) style_model.eval() -if model_path is not None: +if weight is not None: - state_dict = torch.load(model_path,map_location=device) + state_dict = torch.load(weight,map_location=device) # remove saved deprecated running_* keys in InstanceNorm from the checkpoint style_model.load_state_dict(state_dict,strict=False) diff --git a/TRAIN/face_landmark/tools/eval_WFLW.py b/TRAIN/face_landmark/tools/eval_WFLW.py index c97103c..197cc8a 100644 --- a/TRAIN/face_landmark/tools/eval_WFLW.py +++ b/TRAIN/face_landmark/tools/eval_WFLW.py @@ -1,5 +1,6 @@ ##import pandas as pd - +import sys +sys.path.append('.') from lib.core.base_trainer.model import COTRAIN @@ -77,17 +78,7 @@ def augmentationCropImage( img, bbox, joints=None, is_training=True): img = bimg[min_y:max_y, min_x:max_x, :] crop_image_height, crop_image_width, _ = img.shape - joints[:, 0] = joints[:, 0] / crop_image_width - joints[:, 1] = joints[:, 1] / crop_image_height - - # interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, - # cv2.INTER_LANCZOS4] - # interp_method = random.choice(interp_methods) - img = cv2.resize(img, (cfg.MODEL.win, cfg.MODEL.hin)) - - joints[:, 0] = joints[:, 0] * cfg.MODEL.win - joints[:, 1] = joints[:, 1] * cfg.MODEL.hin return img, joints def nme(target, preds): @@ -102,7 +93,10 @@ def nme(target, preds): return nme -def do_eval(data_dir,model): +def do_eval(data_dir,model,input_size): + + + WFLW_df=load_test_f(os.path.join(data_dir,'WFLW_annotations/list_98pt_test')) model.to(device) @@ -126,7 +120,12 @@ def do_eval(data_dir,model): ### random crop and resize crop_image, label = augmentationCropImage(image, bbox, kps, False) + h,w,c=crop_image.shape + + label[:,0]/=w + label[:, 1] /= h + crop_image=cv2.resize(crop_image,(input_size,input_size)) crop_image = np.transpose(crop_image, axes=[2, 0, 1]) crop_image=crop_image.astype(np.float32) @@ -135,7 +134,7 @@ def do_eval(data_dir,model): 
crop_image=torch.from_numpy(crop_image).to(device) preds=model(crop_image)[0] preds=preds.cpu().detach().numpy() - preds =preds[:98*2]*256 + preds =preds[:98*2] nme_score=nme(kps,preds) @@ -143,35 +142,41 @@ def do_eval(data_dir,model): print('for cls:',k, ' nme:',np.mean(nme_list)) -def get_model(weight): +def get_model(weight,model='teacher'): - model=COTRAIN(inference=True) + model=COTRAIN(inference=model) model.eval() state_dict = torch.load(weight, map_location=device) model.load_state_dict(state_dict, strict=False) return model -def main(data_dir,weight): +def main(data_dir,weight,input_size,model): - model=get_model(weight) + model=get_model(weight,model) - do_eval(data_dir,model) + do_eval(data_dir,model,input_size) if __name__=='__main__': parser = argparse.ArgumentParser(description='Start train.') - parser.add_argument('--model', dest='model', type=str, default=None, \ - help='the model to use') + parser.add_argument('--weight', dest='weight', type=str, default=None, \ + help='the weight to use') parser.add_argument('--data_dir', dest='data_dir', type=str, default=None, \ help='the data_dir to use') + parser.add_argument('--img_size', dest='img_size', type=int, default=256, \ + help='the inputsize to use') + parser.add_argument('--model', dest='model', type=str, default='teacher', \ + help='teache or student') args = parser.parse_args() data_dir=args.data_dir + weight=args.weight + img_size=args.img_size model=args.model - main(data_dir,model) + main(data_dir,weight,img_size,model) diff --git a/TRAIN/face_landmark/train_config.py b/TRAIN/face_landmark/train_config.py index f62564b..f88b68a 100755 --- a/TRAIN/face_landmark/train_config.py +++ b/TRAIN/face_landmark/train_config.py @@ -18,22 +18,22 @@ config.TRAIN.validatiojn_batch_size = 64 config.TRAIN.accumulation_batch_size=64 config.TRAIN.log_interval = 10 ##10 iters for a log msg -config.TRAIN.epoch = 200 +config.TRAIN.epoch = 100 config.TRAIN.early_stop=20 config.TRAIN.test_interval=1 config.TRAIN.init_lr = 1.e-3 config.TRAIN.warmup_step=1500 -config.TRAIN.weight_decay_factor = 1.e-5 ####l2 +config.TRAIN.weight_decay_factor = 5.e-4 ####l2 config.TRAIN.vis=False #### if to check the training data config.TRAIN.mix_precision=True ##use mix precision to speedup, tf1.14 at least config.TRAIN.opt='Adamw' ##Adam or SGD -config.TRAIN.gradient_clip=5 +config.TRAIN.gradient_clip=-5 config.MODEL = edict() config.MODEL.model_path = './models/' ## save directory -config.MODEL.hin = 128 # input size during training , 128,160, depends on -config.MODEL.win = 128 +config.MODEL.hin = 256 # input size during training , 128,160, depends on +config.MODEL.win = 256 config.MODEL.out_channel=98*2+3+4 # output vector 68 points , 3 headpose ,4 cls params,(left eye, right eye, mouth, big mouth open) diff --git a/config.py b/config.py index 063f542..141a54e 100755 --- a/config.py +++ b/config.py @@ -17,11 +17,11 @@ config.KEYPOINTS = edict() -config.KEYPOINTS.model_path='./pretrained/kps_teacher.onnx' ### saved_model or tflite +config.KEYPOINTS.model_path='./pretrained/kps_student.onnx' ### saved_model or tflite config.KEYPOINTS.dense_dim=136+3+4 #### output dimension config.KEYPOINTS.p_num=68 #### 68 points config.KEYPOINTS.base_extend_range=[0.2,0.3] #### -config.KEYPOINTS.input_shape = (128,128,3) # input size during training , 160 +config.KEYPOINTS.input_shape = (256,256,3) # input size during training , 160 config.TRACE= edict() diff --git a/demo.py b/demo.py index de07f97..14a4aa3 100755 --- a/demo.py +++ b/demo.py @@ -24,7 +24,8 @@ def 
video(video_path_or_cam): star=time.time() boxes, landmarks, states = facer.run(image) - print(states) + + duration=time.time()-star #print('one iamge cost %f s'%(duration)) @@ -45,9 +46,14 @@ def video(video_path_or_cam): for landmarks_index in range(landmarks[face_index].shape[0]): x_y = landmarks[face_index][landmarks_index] - + score=states[face_index][landmarks_index] + color = (255, 255, 255) + # if score>0.5: + # color=(255,255,255) + # else: + # color = (0, 0, 255) cv2.circle(img_show, (int(x_y[0]), int(x_y[1])), - color=(0, 0, 255), radius=1, thickness=2) + color=color, radius=1, thickness=2) cv2.namedWindow("capture", 0) diff --git a/lib/core/api/face_detector.py b/lib/core/api/face_detector.py index bcd8bbe..2248a17 100755 --- a/lib/core/api/face_detector.py +++ b/lib/core/api/face_detector.py @@ -25,7 +25,7 @@ def __call__(self, image, # Inference t0=time.time() output = self.model(img_for_net) - print(time.time()-t0) + output = np.reshape(output, ( 15120, 16)) output[:,:4] = self.xywh2xyxy(output[:, :4]) diff --git a/lib/core/api/face_landmark.py b/lib/core/api/face_landmark.py index d8548cc..9207be4 100755 --- a/lib/core/api/face_landmark.py +++ b/lib/core/api/face_landmark.py @@ -35,9 +35,10 @@ def __call__(self, img, bboxes): image_croped=image_croped/255. image_croped=np.expand_dims(image_croped,axis=0) - landmark=self.model(image_croped)[0][0] + landmark,score=self.model(image_croped) - state=landmark[-68:] + + state=score.reshape(-1) landmark=np.array(landmark)[:98*2].reshape(-1,2) diff --git a/lib/core/api/onnx_model_base.py b/lib/core/api/onnx_model_base.py index 58e32e0..2d69e24 100644 --- a/lib/core/api/onnx_model_base.py +++ b/lib/core/api/onnx_model_base.py @@ -20,7 +20,7 @@ def __call__(self, data): # Inference - y_onnx = self.session.run([self.session.get_outputs()[0].name], + y_onnx = self.session.run([], {self.session.get_inputs()[0].name: data}) diff --git a/pretrained/.DS_Store b/pretrained/.DS_Store index f1ffaf7..afef959 100644 Binary files a/pretrained/.DS_Store and b/pretrained/.DS_Store differ diff --git a/pretrained/kps_student.onnx b/pretrained/kps_student.onnx new file mode 100644 index 0000000..f99abf4 Binary files /dev/null and b/pretrained/kps_student.onnx differ diff --git a/pretrained/kps_teacher.onnx b/pretrained/kps_teacher.onnx deleted file mode 100644 index 8d3b81b..0000000 Binary files a/pretrained/kps_teacher.onnx and /dev/null differ
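For reference, here is a minimal sketch of how the exported ONNX keypoint model could be driven directly with onnxruntime, following the 256x256 `kps_student.onnx` settings in `config.py` and the `(landmark, score)` pair unpacked in `lib/core/api/face_landmark.py`. The crop file name, the NCHW float layout, and the two-output assumption are illustrative only; in practice the repository's detector and landmark wrapper handle cropping and normalization themselves.

```python
import cv2
import numpy as np
import onnxruntime as ort

# Minimal sketch: run the exported student keypoint model on one face crop.
# Model path and 256x256 input size follow config.py; NCHW layout and the
# (landmark, score) output pair are assumptions based on how
# lib/core/api/face_landmark.py unpacks the onnxruntime outputs.
session = ort.InferenceSession('./pretrained/kps_student.onnx',
                               providers=['CPUExecutionProvider'])

face = cv2.imread('face_crop.jpg')                         # hypothetical pre-cropped face
face = cv2.resize(face, (256, 256)).astype(np.float32) / 255.
blob = np.transpose(face, (2, 0, 1))[None]                 # (1, 3, 256, 256)

landmark, score = session.run(None, {session.get_inputs()[0].name: blob})
points = np.array(landmark).reshape(-1)[:98 * 2].reshape(-1, 2)  # normalized (x, y) per point
print(points.shape, np.array(score).reshape(-1).shape)
```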