diff --git a/.gitignore b/.gitignore index 74c01cd..cedef39 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,6 @@ ENV/ # Rope project settings .ropeproject + +# Pycharm project settings +.idea/ diff --git a/README.md b/README.md index 8527b64..6977793 100644 --- a/README.md +++ b/README.md @@ -5,4 +5,5 @@ For forward pass for 300x300 model, please, follow `SSD.ipynb` for examples. For Weights are ported from the original models and are available [here](https://mega.nz/#F!7RowVLCL!q3cEVRK9jyOSB9el3SssIA). You need `weights_SSD300.hdf5`, `weights_300x300_old.hdf5` is for the old version of architecture with 3x3 convolution for `pool6`. -This code was tested with `Keras` v1.2.2, `Tensorflow` v1.0.0, `OpenCV` v3.1.0-dev +This code was tested with `Keras` v1.2.2, `Tensorflow` v1.0.0, `OpenCV` v3.1.0-dev\ +Also support newest `Keras` v2.0.1 (using ssd_v2.py) diff --git a/feature_flow.py b/feature_flow.py new file mode 100644 index 0000000..b4feea5 --- /dev/null +++ b/feature_flow.py @@ -0,0 +1,208 @@ +# coding: utf-8 + +from keras.applications.imagenet_utils import preprocess_input +from keras.preprocessing import image +from scipy.misc import imread +import tensorflow as tf +from keras import backend as K +import time + +from plot_util import * +from flow_util import * +from ssd_v2 import SSD300v2 +from ssd_conv4_3 import SSD300_conv4_3 +from ssd_utils import BBoxUtility + +voc_classes = ['Aeroplane', 'Bicycle', 'Bird', 'Boat', 'Bottle', + 'Bus', 'Car', 'Cat', 'Chair', 'Cow', 'Diningtable', + 'Dog', 'Horse', 'Motorbike', 'Person', 'Pottedplant', + 'Sheep', 'Sofa', 'Train', 'Tvmonitor'] + +NUM_CLASSES = len(voc_classes) + 1 +network_size = 1024 +batch_size = 2 +input_shape = (network_size, network_size, 3) +colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist() + +use_feature_flow = True +use_dump_file = False +plot_activation_enable = False +#image_files = ['/home/cory/cedl/vid/videos/vid04/0270.jpg', '/home/cory/cedl/vid/videos/vid04/0275.jpg'] +#image_files = 
['/home/cory/KITTI_Dataset/data_tracking_image_2/training/image_02/0000/000015.png', +# '/home/cory/KITTI_Dataset/data_tracking_image_2/training/image_02/0000/000018.png'] + + +# magic case: vid04 270 - 299 + +# image_files = ['/home/cory/ssd_keras/GTAV/GD1015.png', '/home/cory/ssd_keras/GTAV/GD1020.png'] +image_files = ['/home/cory/ssd_keras/GTAV/GD1293.png', '/home/cory/ssd_keras/GTAV/GD1295.png'] +# '/home/cory/ssd_keras/GTAV/GD21.png' +# '/home/cory/cedl/vid/videos/vid04/1000.jpg' + + +def get_detections(result): + detections = map(lambda r: {'label': r[0], + 'conf': r[1], + 'xmin': r[2], + 'ymin': r[3], + 'xmax': r[4], + 'ymax': r[5]}, + result) + return detections + + +def get_layer_output(model, inputs, output_layer_name): + immediate_layer = K.function([model.input, K.learning_phase()], + [model.get_layer(name=output_layer_name).output]) + output = immediate_layer([inputs, 1])[0] + return output + + +def get_layer_predict(model, input_layer_name, input_layer_feature): + immediate_layer = K.function([model.get_layer(name=input_layer_name), K.learning_phase()], + [model.output]) + model_predict = immediate_layer([input_layer_feature, 1])[0] + return model_predict + + +def load_inputs(file_list): + inputs = [] + images = [] + for file in file_list: + img = image.load_img(file, target_size=(network_size, network_size)) + inputs.append(image.img_to_array(img)) + images.append(imread(file)) + return inputs, images + + +def run_network(model, inputs): + time_begin = time.time() + predictions = model.predict(inputs, batch_size=batch_size, verbose=1) + time_elapsed = time.time() - time_begin + print('elapsed time {:0.4f} sec {:.4f} fps'.format(time_elapsed, batch_size / time_elapsed)) + return predictions + + +def compare_model_layer(model1, input1, layer1, model2, input2, layer2, plot_activation_enable=False): + layer_output1 = get_layer_output(model=model1, inputs=input1, output_layer_name=layer1) + layer_output2 = get_layer_output(model=model2, inputs=input2, 
output_layer_name=layer2) + diff = (layer_output1 - layer_output2) + + print('layer_output1 sum =', sum(layer_output1[0].ravel())) + print('layer_output2 sum =', sum(layer_output2[0].ravel())) + print('diff min={:f} max={:f} sum={:f}'.format( + min(np.absolute(diff).ravel()), + max(np.absolute(diff).ravel()), + sum(np.absolute(diff).ravel()))) + eq = np.array_equal(layer_output1, layer_output2) + if eq: + print('equal') + else: + print('not equal') + + if plot_activation_enable: + plot_feature_map(layer_output1[0], 'feature_map_1') + plot_feature_map(layer_output2[0], 'feature_map_2') + + +def plot_detections(image_list, detection_result): + # for each image + for i, img in enumerate(image_list): + detections = get_detections(detection_result[i]) + detections = list(filter(lambda x: x['conf'] > 0.8, detections)) + fig = imshow_fig(img, title='frame_{:d}'.format(i+1)) + + current_axis = fig.gca() + for det in detections: + xmin = int(round(det['xmin'] * img.shape[1])) + ymin = int(round(det['ymin'] * img.shape[0])) + xmax = int(round(det['xmax'] * img.shape[1])) + ymax = int(round(det['ymax'] * img.shape[0])) + conf = det['conf'] + label = int(det['label']) + label_name = voc_classes[label - 1] + display_txt = '{:0.2f}, {}'.format(conf, label_name) + # print(display_txt) + coords = (xmin, ymin), xmax - xmin + 1, ymax - ymin + 1 + color = colors[label] + current_axis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2)) + current_axis.text(xmin, ymin, display_txt, bbox={'facecolor': color, 'alpha': 0.5}) + fig.show() + + +def feature_flow(): + bbox_util = BBoxUtility(NUM_CLASSES) + raw_inputs, images = load_inputs(image_files) + inputs = preprocess_input(np.array(raw_inputs)) + + dump_activation_layer = 'conv4_2' + compare_layer_name = 'conv6_2' + print('dump_activation_layer', dump_activation_layer) + print('target_layer_name', compare_layer_name) + + # normal SSD network + model1 = SSD300v2(input_shape, num_classes=NUM_CLASSES) + 
model1.load_weights('weights_SSD300.hdf5', by_name=True) + predictions = run_network(model1, inputs) + results = bbox_util.detection_out(predictions) + plot_detections(images, results) + + # get dump layer's output (as input for flow network) + input_img2 = inputs[1:2, :, :, :] + layer_dump = get_layer_output(model=model1, inputs=input_img2, output_layer_name=dump_activation_layer) + print('layer_dump.shape = ', layer_dump.shape) + + # flow (raw rgb) + flow_rgb = compute_flow(image_files[1], image_files[0]) + + print('flow.shape', flow_rgb.shape) + imshow_fig(cv2.cvtColor(draw_hsv(flow_rgb), cv2.COLOR_BGR2RGB), title='flow_rgb') + + # flow (re-sized for feature map) + flow_feature = get_flow_for_filter(flow_rgb) + # imshow_fig(flow_feature[:, :, 0], title='flow_feature_y', cmap='gray') + # imshow_fig(flow_feature[:, :, 1], title='flow_feature_x', cmap='gray') + + # warp image by flow_rgb + iimg1 = cv2.imread(image_files[0]) + img_warp = warp_flow(iimg1, flow_rgb) + imshow_fig(cv2.cvtColor(img_warp, cv2.COLOR_BGR2RGB), title='frame_2_warp') + + # shift feature + shifted_feature = shift_filter(layer_dump, flow_feature) + + # flow net + model2 = SSD300_conv4_3((128, 128, 512), num_classes=NUM_CLASSES) + model2.load_weights('weights_SSD300.hdf5', by_name=True) + predictions = run_network(model2, shifted_feature) + results = bbox_util.detection_out(predictions) + plot_detections(images[1:2], results) + + # get specific layer's output and compare them (for debugging) + compare_model_layer(model1, input_img2, compare_layer_name, + model2, shifted_feature, compare_layer_name, + True) + + sess.close() + plt.show() + + +def get_flow_for_filter(flow): + filter_map_width = 128 + flow_ratio_y = flow.shape[0] / filter_map_width + flow_ratio_x = flow.shape[1] / filter_map_width + flow_small = np.asarray([cv2.resize(flow[:, :, 0] / flow_ratio_y, (filter_map_width, filter_map_width)), + cv2.resize(flow[:, :, 1] / flow_ratio_x, (filter_map_width, filter_map_width))]) + flow_small = 
flow_small.transpose([1, 2, 0]) + print('flow_small.shape', flow_small.shape) + return flow_small + + +if __name__ == '__main__': + config = tf.ConfigProto( + device_count={'GPU': 1} + ) + config.gpu_options.per_process_gpu_memory_fraction = 0.5 + sess = tf.Session(config=config) + K.set_session(sess) + feature_flow() diff --git a/flow_util.py b/flow_util.py new file mode 100644 index 0000000..0040954 --- /dev/null +++ b/flow_util.py @@ -0,0 +1,50 @@ +import matplotlib.pyplot as plt +import cv2 +import numpy as np +from subprocess import check_output + + +def shift_filter(feature, flow): + # feature shape = (None, 128, 128, 512) + shifted_feature = list() + for feat in feature: + print(feat.shape) + for i in range(feat.shape[-1]): + act2d = feat[..., i] + act2d = act2d[:, :, np.newaxis] + res = warp_flow(act2d, flow) + shifted_feature.append(res) + + if False: + print('act2d', act2d.shape, sum(act2d.ravel())) + print('flow', flow.shape, sum(flow.ravel())) + plt.figure(11) + plt.imshow(act2d[:, :, 0], cmap='gray') + plt.figure(12) + plt.imshow(flow[..., 0], cmap='gray') + plt.figure(13) + plt.imshow(flow[..., 1], cmap='gray') + plt.figure(14) + plt.imshow(res, cmap='gray') + plt.show() + pass + + return np.asarray([shifted_feature]).swapaxes(1, 2).swapaxes(2, 3) + + +def compute_flow(image_path1, image_path2): + flow_cmd = './run_flow.sh ' + image_path1 + ' ' + image_path2 + check_output([flow_cmd], shell=True) + flow = np.load('./flow.npy') + flow = flow.transpose(1, 2, 0) + # flow.shape should be (height, width, 2) + return flow + + +def warp_flow(img, flow): + h, w = flow.shape[:2] + flow_map = flow.copy() + flow_map[:, :, 0] += np.arange(w) + flow_map[:, :, 1] += np.arange(h)[:, np.newaxis] + res = cv2.remap(img, flow_map, None, cv2.INTER_LINEAR) + return res \ No newline at end of file diff --git a/plot_util.py b/plot_util.py new file mode 100644 index 0000000..51c6747 --- /dev/null +++ b/plot_util.py @@ -0,0 +1,56 @@ +import numpy as np +import 
matplotlib.pyplot as plt +import cv2 +import math + + +def imshow_fig(img, title='', **kwargs): + h = img.shape[0] + w = img.shape[1] + dpi = 96 + fig = plt.figure(figsize=(w/dpi, h/dpi)) + fig.add_axes([0., 0., 1., 1.]) + fig.canvas.set_window_title(title) + plt.imshow(img, **kwargs) + plt.axis('off') + return fig + + +def plot_feature_map(activations, title=''): + + num_channel = activations.shape[2] + act_border = activations.shape[0] + map_border_num = int(math.ceil(math.sqrt(num_channel))) + map_border = act_border * map_border_num + print('create act map {:d} x {:d}'.format(map_border, map_border)) + act_map = np.zeros((map_border, map_border)) + + print(activations.shape) + all_sum = 0 + for i_x in range(map_border_num): + for i_y in range(map_border_num): + idx = i_x * map_border_num + i_y + if idx >= num_channel: + break + act = activations[:, :, idx] + act_map[i_x*act_border:(i_x+1)*act_border, i_y*act_border:(i_y+1)*act_border] = act + act_sum = sum(sum(act)) + all_sum += act_sum + # print('filter-{:d} act_sum={:f}'.format(idx, act_sum)) + + print('all_sum = {:f}'.format(all_sum)) + fig = imshow_fig(act_map, title, cmap='gray') + fig.show() + + +def draw_hsv(flow): + h, w = flow.shape[:2] + fx, fy = flow[:,:,0], flow[:,:,1] + ang = np.arctan2(fy, fx) + np.pi + v = np.sqrt(fx*fx+fy*fy) + hsv = np.zeros((h, w, 3), np.uint8) + hsv[...,0] = ang*(180/np.pi/2) + hsv[...,1] = 255 + hsv[...,2] = np.minimum(v*4, 255) + bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) + return bgr \ No newline at end of file diff --git a/run_ssd.py b/run_ssd.py new file mode 100644 index 0000000..527f110 --- /dev/null +++ b/run_ssd.py @@ -0,0 +1,120 @@ +# converted from SSD.ipynb + +# In[1]: + +from keras.applications.imagenet_utils import preprocess_input +from keras.backend.tensorflow_backend import set_session +from keras.preprocessing import image +import matplotlib.pyplot as plt +import numpy as np +from scipy.misc import imread +import tensorflow as tf + +from ssd_v2 import 
SSD300v2 +from ssd_utils import BBoxUtility + +np.set_printoptions(suppress=True) + +config = tf.ConfigProto() +config.gpu_options.per_process_gpu_memory_fraction = 0.45 +set_session(tf.Session(config=config)) + + +# In[2]: + +voc_classes = ['Aeroplane', 'Bicycle', 'Bird', 'Boat', 'Bottle', + 'Bus', 'Car', 'Cat', 'Chair', 'Cow', 'Diningtable', + 'Dog', 'Horse','Motorbike', 'Person', 'Pottedplant', + 'Sheep', 'Sofa', 'Train', 'Tvmonitor'] +NUM_CLASSES = len(voc_classes) + 1 + + +# In[3]: + +input_shape = (300, 300, 3) +model = SSD300v2(input_shape, num_classes=NUM_CLASSES) +model.load_weights('weights_SSD300.hdf5', by_name=True) +bbox_util = BBoxUtility(NUM_CLASSES) + + +# In[4]: + +inputs = [] +images = [] +img_path = './pics/fish-bike.jpg' +img = image.load_img(img_path, target_size=(300, 300)) +img = image.img_to_array(img) +images.append(imread(img_path)) +inputs.append(img.copy()) +img_path = './pics/cat.jpg' +img = image.load_img(img_path, target_size=(300, 300)) +img = image.img_to_array(img) +images.append(imread(img_path)) +inputs.append(img.copy()) +img_path = './pics/boys.jpg' +img = image.load_img(img_path, target_size=(300, 300)) +img = image.img_to_array(img) +images.append(imread(img_path)) +inputs.append(img.copy()) +img_path = './pics/car_cat.jpg' +img = image.load_img(img_path, target_size=(300, 300)) +img = image.img_to_array(img) +images.append(imread(img_path)) +inputs.append(img.copy()) +img_path = './pics/car_cat2.jpg' +img = image.load_img(img_path, target_size=(300, 300)) +img = image.img_to_array(img) +images.append(imread(img_path)) +inputs.append(img.copy()) +inputs = preprocess_input(np.array(inputs)) + + +# In[5]: + +preds = model.predict(inputs, batch_size=1, verbose=1) + + +# In[6]: + +results = bbox_util.detection_out(preds) + + +# In[8]: + +for i, img in enumerate(images): + # Parse the outputs. 
+ det_label = results[i][:, 0] + det_conf = results[i][:, 1] + det_xmin = results[i][:, 2] + det_ymin = results[i][:, 3] + det_xmax = results[i][:, 4] + det_ymax = results[i][:, 5] + + # Get detections with confidence higher than 0.6. + top_indices = [i for i, conf in enumerate(det_conf) if conf >= 0.6] + + top_conf = det_conf[top_indices] + top_label_indices = det_label[top_indices].tolist() + top_xmin = det_xmin[top_indices] + top_ymin = det_ymin[top_indices] + top_xmax = det_xmax[top_indices] + top_ymax = det_ymax[top_indices] + + colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist() + plt.imshow(img / 255.) + currentAxis = plt.gca() + + for i in range(top_conf.shape[0]): + xmin = int(round(top_xmin[i] * img.shape[1])) + ymin = int(round(top_ymin[i] * img.shape[0])) + xmax = int(round(top_xmax[i] * img.shape[1])) + ymax = int(round(top_ymax[i] * img.shape[0])) + score = top_conf[i] + label = int(top_label_indices[i]) + label_name = voc_classes[label - 1] + display_txt = '{:0.2f}, {}'.format(score, label_name) + coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1 + color = colors[label] + currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2)) + currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5}) + plt.show() diff --git a/ssd_conv4_3.py b/ssd_conv4_3.py new file mode 100644 index 0000000..5ac4fae --- /dev/null +++ b/ssd_conv4_3.py @@ -0,0 +1,274 @@ +"""Keras implementation of SSD.""" + +import keras.backend as K +from keras.layers import Activation +from keras.layers import Conv2D +from keras.layers import Dense +from keras.layers import Flatten +from keras.layers import GlobalAveragePooling2D +from keras.layers import Input +from keras.layers import MaxPooling2D +from keras.layers import Reshape +from keras.layers import ZeroPadding2D +from keras.layers import concatenate +from keras.models import Model + +from ssd_layers import Normalize +from ssd_layers import PriorBox + + +def SSD300_conv4_3(input_shape, 
num_classes=21): + """SSD300 architecture. + + # Arguments + input_shape: Shape of the input image, + expected to be either (300, 300, 3) or (3, 300, 300)(not tested). + num_classes: Number of classes including background. + + # References + https://arxiv.org/abs/1512.02325 + """ + + # network original input size = (1024, 1024, 3) + # input: conv4_2 with shape = (None, 128, 128, 512) + input_layer = Input(shape=input_shape) + img_size = (1024, 1024) + + conv4_2 = input_layer + + # Block 4 + conv4_3 = Conv2D(512, (3, 3), + name='conv4_3', + padding='same', + activation='relu')(conv4_2) + pool4 = MaxPooling2D(name='pool4', + pool_size=(2, 2), + strides=(2, 2), + padding='same')(conv4_3) + + # Block 5 + conv5_1 = Conv2D(512, (3, 3), + name='conv5_1', + padding='same', + activation='relu')(pool4) + conv5_2 = Conv2D(512, (3, 3), + name='conv5_2', + padding='same', + activation='relu')(conv5_1) + conv5_3 = Conv2D(512, (3, 3), + name='conv5_3', + padding='same', + activation='relu')(conv5_2) + pool5 = MaxPooling2D(name='pool5', + pool_size=(3, 3), + strides=(1, 1), + padding='same')(conv5_3) + + # FC6 + fc6 = Conv2D(1024, (3, 3), + name='fc6', + dilation_rate=(6, 6), + padding='same', + activation='relu' + )(pool5) + + # x = Dropout(0.5, name='drop6')(x) + # FC7 + fc7 = Conv2D(1024, (1, 1), + name='fc7', + padding='same', + activation='relu' + )(fc6) + # x = Dropout(0.5, name='drop7')(x) + + # Block 6 + conv6_1 = Conv2D(256, (1, 1), + name='conv6_1', + padding='same', + activation='relu')(fc7) + conv6_2 = Conv2D(512, (3, 3), + name='conv6_2', + strides=(2, 2), + padding='same', + activation='relu')(conv6_1) + + # Block 7 + conv7_1 = Conv2D(128, (1, 1), + name='conv7_1', + padding='same', + activation='relu')(conv6_2) + conv7_1z = ZeroPadding2D(name='conv7_1z')(conv7_1) + conv7_2 = Conv2D(256, (3, 3), + name='conv7_2', + padding='valid', + strides=(2, 2), + activation='relu')(conv7_1z) + + # Block 8 + conv8_1 = Conv2D(128, (1, 1), + name='conv8_1', + padding='same', + 
activation='relu')(conv7_2) + conv8_2 = Conv2D(256, (3, 3), + name='conv8_2', + padding='same', + strides=(2, 2), + activation='relu')(conv8_1) + + # Last Pool + pool6 = GlobalAveragePooling2D(name='pool6')(conv8_2) + + # Prediction from conv4_3 + num_priors = 3 + name = 'conv4_3_norm_mbox_conf' + if num_classes != 21: + name += '_{}'.format(num_classes) + + conv4_3_norm = Normalize(20, name='conv4_3_norm')(conv4_3) + conv4_3_norm_mbox_loc = Conv2D(num_priors * 4, (3, 3), + name='conv4_3_norm_mbox_loc', + padding='same')(conv4_3_norm) + conv4_3_norm_mbox_loc_flat = Flatten(name='conv4_3_norm_mbox_loc_flat')(conv4_3_norm_mbox_loc) + conv4_3_norm_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + name=name, + padding='same')(conv4_3_norm) + conv4_3_norm_mbox_conf_flat = Flatten(name='conv4_3_norm_mbox_conf_flat')(conv4_3_norm_mbox_conf) + conv4_3_norm_mbox_priorbox = PriorBox(img_size, 30.0, + name='conv4_3_norm_mbox_priorbox', + aspect_ratios=[2], + variances=[0.1, 0.1, 0.2, 0.2])(conv4_3_norm) + + # Prediction from fc7 + num_priors = 6 + name = 'fc7_mbox_conf' + if num_classes != 21: + name += '_{}'.format(num_classes) + fc7_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + padding='same', + name=name)(fc7) + fc7_mbox_conf_flat = Flatten(name='fc7_mbox_conf_flat')(fc7_mbox_conf) + + fc7_mbox_loc = Conv2D(num_priors * 4, (3, 3), + name='fc7_mbox_loc', + padding='same')(fc7) + fc7_mbox_loc_flat = Flatten(name='fc7_mbox_loc_flat')(fc7_mbox_loc) + fc7_mbox_priorbox = PriorBox(img_size, 60.0, + name='fc7_mbox_priorbox', + max_size=114.0, + aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2] + )(fc7) + + # Prediction from conv6_2 + num_priors = 6 + name = 'conv6_2_mbox_conf' + if num_classes != 21: + name += '_{}'.format(num_classes) + conv6_2_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + padding='same', + name=name)(conv6_2) + conv6_2_mbox_conf_flat = Flatten(name='conv6_2_mbox_conf_flat')(conv6_2_mbox_conf) + conv6_2_mbox_loc = Conv2D(num_priors * 
4, (3, 3,), + name='conv6_2_mbox_loc', + padding='same')(conv6_2) + conv6_2_mbox_loc_flat = Flatten(name='conv6_2_mbox_loc_flat')(conv6_2_mbox_loc) + conv6_2_mbox_priorbox = PriorBox(img_size, 114.0, + max_size=168.0, + aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2], + name='conv6_2_mbox_priorbox')(conv6_2) + # Prediction from conv7_2 + num_priors = 6 + name = 'conv7_2_mbox_conf' + if num_classes != 21: + name += '_{}'.format(num_classes) + conv7_2_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + padding='same', + name=name)(conv7_2) + conv7_2_mbox_conf_flat = Flatten(name='conv7_2_mbox_conf_flat')(conv7_2_mbox_conf) + conv7_2_mbox_loc = Conv2D(num_priors * 4, (3, 3), + padding='same', + name='conv7_2_mbox_loc')(conv7_2) + conv7_2_mbox_loc_flat = Flatten(name='conv7_2_mbox_loc_flat')(conv7_2_mbox_loc) + conv7_2_mbox_priorbox = PriorBox(img_size, 168.0, + max_size=222.0, + aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2], + name='conv7_2_mbox_priorbox')(conv7_2) + # Prediction from conv8_2 + num_priors = 6 + name = 'conv8_2_mbox_conf' + if num_classes != 21: + name += '_{}'.format(num_classes) + conv8_2_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + padding='same', + name=name)(conv8_2) + conv8_2_mbox_conf_flat = Flatten(name='conv8_2_mbox_conf_flat')(conv8_2_mbox_conf) + conv8_2_mbox_loc = Conv2D(num_priors * 4, (3, 3), + padding='same', + name='conv8_2_mbox_loc')(conv8_2) + conv8_2_mbox_loc_flat = Flatten(name='conv8_2_mbox_loc_flat')(conv8_2_mbox_loc) + conv8_2_mbox_priorbox = PriorBox(img_size, 222.0, + max_size=276.0, + aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2], + name='conv8_2_mbox_priorbox')(conv8_2) + + # Prediction from pool6 + num_priors = 6 + name = 'pool6_mbox_conf_flat' + if num_classes != 21: + name += '_{}'.format(num_classes) + if K.image_dim_ordering() == 'tf': + target_shape = (1, 1, 256) + else: + target_shape = (256, 1, 1) + pool6_mbox_loc_flat = Dense(num_priors * 4, name='pool6_mbox_loc_flat')(pool6) + 
pool6_mbox_conf_flat = Dense(num_priors * num_classes, name=name)(pool6) + pool6_reshaped = Reshape(target_shape, + name='pool6_reshaped')(pool6) + pool6_mbox_priorbox = PriorBox(img_size, 276.0, max_size=330.0, aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2], + name='pool6_mbox_priorbox')(pool6_reshaped) + # Gather all predictions + mbox_loc = concatenate([conv4_3_norm_mbox_loc_flat, + fc7_mbox_loc_flat, + conv6_2_mbox_loc_flat, + conv7_2_mbox_loc_flat, + conv8_2_mbox_loc_flat, + pool6_mbox_loc_flat], + axis=1, + name='mbox_loc') + mbox_conf = concatenate([conv4_3_norm_mbox_conf_flat, + fc7_mbox_conf_flat, + conv6_2_mbox_conf_flat, + conv7_2_mbox_conf_flat, + conv8_2_mbox_conf_flat, + pool6_mbox_conf_flat], + axis=1, + name='mbox_conf') + mbox_priorbox = concatenate([conv4_3_norm_mbox_priorbox, + fc7_mbox_priorbox, + conv6_2_mbox_priorbox, + conv7_2_mbox_priorbox, + conv8_2_mbox_priorbox, + pool6_mbox_priorbox], + axis=1, + name='mbox_priorbox') + if hasattr(mbox_loc, '_keras_shape'): + num_boxes = mbox_loc._keras_shape[-1] // 4 + elif hasattr(mbox_loc, 'int_shape'): + num_boxes = K.int_shape(mbox_loc)[-1] // 4 + mbox_loc = Reshape((num_boxes, 4), + name='mbox_loc_final')(mbox_loc) + mbox_conf = Reshape((num_boxes, num_classes), + name='mbox_conf_logits')(mbox_conf) + mbox_conf = Activation('softmax', + name='mbox_conf_final')(mbox_conf) + predictions = concatenate([mbox_loc, + mbox_conf, + mbox_priorbox], + axis=2, + name='predictions') + model = Model(inputs=input_layer, outputs=predictions) + return model diff --git a/ssd_layers.py b/ssd_layers.py index 5e10478..40adc21 100644 --- a/ssd_layers.py +++ b/ssd_layers.py @@ -113,7 +113,11 @@ def get_output_shape_for(self, input_shape): layer_width = input_shape[self.waxis] layer_height = input_shape[self.haxis] num_boxes = num_priors_ * layer_width * layer_height - return (input_shape[0], num_boxes, 8) + return input_shape[0], num_boxes, 8 + + # support for Keras 2.0 + def compute_output_shape(self, 
input_shape): + return self.get_output_shape_for(input_shape) def call(self, x, mask=None): if hasattr(x, '_keras_shape'): diff --git a/ssd_v2.py b/ssd_v2.py new file mode 100644 index 0000000..68fcaca --- /dev/null +++ b/ssd_v2.py @@ -0,0 +1,324 @@ +"""Keras implementation of SSD.""" + +import keras.backend as K +from keras.layers import Activation +from keras.layers import Conv2D +from keras.layers import Dense +from keras.layers import Flatten +from keras.layers import GlobalAveragePooling2D +from keras.layers import Input +from keras.layers import MaxPooling2D +from keras.layers import Reshape +from keras.layers import ZeroPadding2D +from keras.layers import concatenate +from keras.models import Model + +from ssd_layers import Normalize +from ssd_layers import PriorBox + + +def SSD300v2(input_shape, num_classes=21): + """SSD300 architecture. + + # Arguments + input_shape: Shape of the input image, + expected to be either (300, 300, 3) or (3, 300, 300)(not tested). + num_classes: Number of classes including background. 
+ + # References + https://arxiv.org/abs/1512.02325 + """ + input_layer = Input(shape=input_shape) + + # Block 1 + conv1_1 = Conv2D(64, (3, 3), + name='conv1_1', + padding='same', + activation='relu')(input_layer) + + conv1_2 = Conv2D(64, (3, 3), + name='conv1_2', + padding='same', + activation='relu')(conv1_1) + pool1 = MaxPooling2D(name='pool1', + pool_size=(2, 2), + strides=(2, 2), + padding='same', )(conv1_2) + + # Block 2 + conv2_1 = Conv2D(128, (3, 3), + name='conv2_1', + padding='same', + activation='relu')(pool1) + conv2_2 = Conv2D(128, (3, 3), + name='conv2_2', + padding='same', + activation='relu')(conv2_1) + pool2 = MaxPooling2D(name='pool2', + pool_size=(2, 2), + strides=(2, 2), + padding='same')(conv2_2) + + # Block 3 + conv3_1 = Conv2D(256, (3, 3), + name='conv3_1', + padding='same', + activation='relu')(pool2) + conv3_2 = Conv2D(256, (3, 3), + name='conv3_2', + padding='same', + activation='relu')(conv3_1) + conv3_3 = Conv2D(256, (3, 3), + name='conv3_3', + padding='same', + activation='relu')(conv3_2) + pool3 = MaxPooling2D(name='pool3', + pool_size=(2, 2), + strides=(2, 2), + padding='same')(conv3_3) + + # Block 4 + conv4_1 = Conv2D(512, (3, 3), + name='conv4_1', + padding='same', + activation='relu')(pool3) + conv4_2 = Conv2D(512, (3, 3), + name='conv4_2', + padding='same', + activation='relu')(conv4_1) + conv4_3 = Conv2D(512, (3, 3), + name='conv4_3', + padding='same', + activation='relu')(conv4_2) + pool4 = MaxPooling2D(name='pool4', + pool_size=(2, 2), + strides=(2, 2), + padding='same')(conv4_3) + + # Block 5 + conv5_1 = Conv2D(512, (3, 3), + name='conv5_1', + padding='same', + activation='relu')(pool4) + conv5_2 = Conv2D(512, (3, 3), + name='conv5_2', + padding='same', + activation='relu')(conv5_1) + conv5_3 = Conv2D(512, (3, 3), + name='conv5_3', + padding='same', + activation='relu')(conv5_2) + pool5 = MaxPooling2D(name='pool5', + pool_size=(3, 3), + strides=(1, 1), + padding='same')(conv5_3) + + # FC6 + fc6 = Conv2D(1024, (3, 3), + 
name='fc6', + dilation_rate=(6, 6), + padding='same', + activation='relu' + )(pool5) + + # x = Dropout(0.5, name='drop6')(x) + # FC7 + fc7 = Conv2D(1024, (1, 1), + name='fc7', + padding='same', + activation='relu' + )(fc6) + # x = Dropout(0.5, name='drop7')(x) + + # Block 6 + conv6_1 = Conv2D(256, (1, 1), + name='conv6_1', + padding='same', + activation='relu')(fc7) + conv6_2 = Conv2D(512, (3, 3), + name='conv6_2', + strides=(2, 2), + padding='same', + activation='relu')(conv6_1) + + # Block 7 + conv7_1 = Conv2D(128, (1, 1), + name='conv7_1', + padding='same', + activation='relu')(conv6_2) + conv7_1z = ZeroPadding2D(name='conv7_1z')(conv7_1) + conv7_2 = Conv2D(256, (3, 3), + name='conv7_2', + padding='valid', + strides=(2, 2), + activation='relu')(conv7_1z) + + # Block 8 + conv8_1 = Conv2D(128, (1, 1), + name='conv8_1', + padding='same', + activation='relu')(conv7_2) + conv8_2 = Conv2D(256, (3, 3), + name='conv8_2', + padding='same', + strides=(2, 2), + activation='relu')(conv8_1) + + # Last Pool + pool6 = GlobalAveragePooling2D(name='pool6')(conv8_2) + + # Prediction from conv4_3 + num_priors = 3 + img_size = (input_shape[1], input_shape[0]) + name = 'conv4_3_norm_mbox_conf' + if num_classes != 21: + name += '_{}'.format(num_classes) + + conv4_3_norm = Normalize(20, name='conv4_3_norm')(conv4_3) + conv4_3_norm_mbox_loc = Conv2D(num_priors * 4, (3, 3), + name='conv4_3_norm_mbox_loc', + padding='same')(conv4_3_norm) + conv4_3_norm_mbox_loc_flat = Flatten(name='conv4_3_norm_mbox_loc_flat')(conv4_3_norm_mbox_loc) + conv4_3_norm_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + name=name, + padding='same')(conv4_3_norm) + conv4_3_norm_mbox_conf_flat = Flatten(name='conv4_3_norm_mbox_conf_flat')(conv4_3_norm_mbox_conf) + conv4_3_norm_mbox_priorbox = PriorBox(img_size, 30.0, + name='conv4_3_norm_mbox_priorbox', + aspect_ratios=[2], + variances=[0.1, 0.1, 0.2, 0.2])(conv4_3_norm) + + # Prediction from fc7 + num_priors = 6 + name = 'fc7_mbox_conf' + if num_classes != 
21: + name += '_{}'.format(num_classes) + fc7_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + padding='same', + name=name)(fc7) + fc7_mbox_conf_flat = Flatten(name='fc7_mbox_conf_flat')(fc7_mbox_conf) + + fc7_mbox_loc = Conv2D(num_priors * 4, (3, 3), + name='fc7_mbox_loc', + padding='same')(fc7) + fc7_mbox_loc_flat = Flatten(name='fc7_mbox_loc_flat')(fc7_mbox_loc) + fc7_mbox_priorbox = PriorBox(img_size, 60.0, + name='fc7_mbox_priorbox', + max_size=114.0, + aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2] + )(fc7) + + # Prediction from conv6_2 + num_priors = 6 + name = 'conv6_2_mbox_conf' + if num_classes != 21: + name += '_{}'.format(num_classes) + conv6_2_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + padding='same', + name=name)(conv6_2) + conv6_2_mbox_conf_flat = Flatten(name='conv6_2_mbox_conf_flat')(conv6_2_mbox_conf) + conv6_2_mbox_loc = Conv2D(num_priors * 4, (3, 3,), + name='conv6_2_mbox_loc', + padding='same')(conv6_2) + conv6_2_mbox_loc_flat = Flatten(name='conv6_2_mbox_loc_flat')(conv6_2_mbox_loc) + conv6_2_mbox_priorbox = PriorBox(img_size, 114.0, + max_size=168.0, + aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2], + name='conv6_2_mbox_priorbox')(conv6_2) + # Prediction from conv7_2 + num_priors = 6 + name = 'conv7_2_mbox_conf' + if num_classes != 21: + name += '_{}'.format(num_classes) + conv7_2_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + padding='same', + name=name)(conv7_2) + conv7_2_mbox_conf_flat = Flatten(name='conv7_2_mbox_conf_flat')(conv7_2_mbox_conf) + conv7_2_mbox_loc = Conv2D(num_priors * 4, (3, 3), + padding='same', + name='conv7_2_mbox_loc')(conv7_2) + conv7_2_mbox_loc_flat = Flatten(name='conv7_2_mbox_loc_flat')(conv7_2_mbox_loc) + conv7_2_mbox_priorbox = PriorBox(img_size, 168.0, + max_size=222.0, + aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2], + name='conv7_2_mbox_priorbox')(conv7_2) + # Prediction from conv8_2 + num_priors = 6 + name = 'conv8_2_mbox_conf' + if num_classes != 21: + name 
+= '_{}'.format(num_classes) + conv8_2_mbox_conf = Conv2D(num_priors * num_classes, (3, 3), + padding='same', + name=name)(conv8_2) + conv8_2_mbox_conf_flat = Flatten(name='conv8_2_mbox_conf_flat')(conv8_2_mbox_conf) + conv8_2_mbox_loc = Conv2D(num_priors * 4, (3, 3), + padding='same', + name='conv8_2_mbox_loc')(conv8_2) + conv8_2_mbox_loc_flat = Flatten(name='conv8_2_mbox_loc_flat')(conv8_2_mbox_loc) + conv8_2_mbox_priorbox = PriorBox(img_size, 222.0, + max_size=276.0, + aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2], + name='conv8_2_mbox_priorbox')(conv8_2) + + # Prediction from pool6 + num_priors = 6 + name = 'pool6_mbox_conf_flat' + if num_classes != 21: + name += '_{}'.format(num_classes) + if K.image_dim_ordering() == 'tf': + target_shape = (1, 1, 256) + else: + target_shape = (256, 1, 1) + pool6_mbox_loc_flat = Dense(num_priors * 4, name='pool6_mbox_loc_flat')(pool6) + pool6_mbox_conf_flat = Dense(num_priors * num_classes, name=name)(pool6) + pool6_reshaped = Reshape(target_shape, + name='pool6_reshaped')(pool6) + pool6_mbox_priorbox = PriorBox(img_size, 276.0, max_size=330.0, aspect_ratios=[2, 3], + variances=[0.1, 0.1, 0.2, 0.2], + name='pool6_mbox_priorbox')(pool6_reshaped) + # Gather all predictions + mbox_loc = concatenate([conv4_3_norm_mbox_loc_flat, + fc7_mbox_loc_flat, + conv6_2_mbox_loc_flat, + conv7_2_mbox_loc_flat, + conv8_2_mbox_loc_flat, + pool6_mbox_loc_flat], + axis=1, + name='mbox_loc') + mbox_conf = concatenate([conv4_3_norm_mbox_conf_flat, + fc7_mbox_conf_flat, + conv6_2_mbox_conf_flat, + conv7_2_mbox_conf_flat, + conv8_2_mbox_conf_flat, + pool6_mbox_conf_flat], + axis=1, + name='mbox_conf') + mbox_priorbox = concatenate([conv4_3_norm_mbox_priorbox, + fc7_mbox_priorbox, + conv6_2_mbox_priorbox, + conv7_2_mbox_priorbox, + conv8_2_mbox_priorbox, + pool6_mbox_priorbox], + axis=1, + name='mbox_priorbox') + if hasattr(mbox_loc, '_keras_shape'): + num_boxes = mbox_loc._keras_shape[-1] // 4 + elif hasattr(mbox_loc, 'int_shape'): + 
num_boxes = K.int_shape(mbox_loc)[-1] // 4 + mbox_loc = Reshape((num_boxes, 4), + name='mbox_loc_final')(mbox_loc) + mbox_conf = Reshape((num_boxes, num_classes), + name='mbox_conf_logits')(mbox_conf) + mbox_conf = Activation('softmax', + name='mbox_conf_final')(mbox_conf) + predictions = concatenate([mbox_loc, + mbox_conf, + mbox_priorbox], + axis=2, + name='predictions') + model = Model(inputs=input_layer, outputs=predictions) + return model diff --git a/test_keras.py b/test_keras.py new file mode 100644 index 0000000..5fb0852 --- /dev/null +++ b/test_keras.py @@ -0,0 +1,16 @@ +import keras +from keras.layers import Conv2D, MaxPooling2D, Input + +input_img = Input(shape=(256, 256, 3)) + +tower_1 = Conv2D(64, (1, 1), padding='same', activation='relu')(input_img) +tower_11 = Conv2D(64, (3, 3), padding='same', activation='relu')(tower_1) + +tower_2 = Conv2D(64, (1, 1), padding='same', activation='relu')(input_img) +tower_22 = Conv2D(64, (5, 5), padding='same', activation='relu')(tower_2) + +tower_3 = MaxPooling2D((3, 3), strides=(1, 1), padding='same')(input_img) +tower_33 = Conv2D(64, (1, 1), padding='same', activation='relu')(tower_3) + +output = keras.layers.concatenate([tower_11, tower_22, tower_33], axis=3) +pass \ No newline at end of file diff --git a/test_ssd.py b/test_ssd.py new file mode 100644 index 0000000..0122142 --- /dev/null +++ b/test_ssd.py @@ -0,0 +1,141 @@ +from keras.applications.imagenet_utils import preprocess_input +from keras.backend.tensorflow_backend import set_session +from keras.preprocessing import image +import matplotlib.pyplot as plt +import matplotlib as mpl +import numpy as np +from scipy.misc import imread +import tensorflow as tf +from keras import backend as K +import math +import time + +from ssd_v2 import SSD300v2 +from ssd_utils import BBoxUtility + +config = tf.ConfigProto( + device_count={'GPU': 0} +) +sess = tf.Session(config=config) +K.set_session(sess) + +np.set_printoptions(suppress=True) + +voc_classes = ['Aeroplane', 
'Bicycle', 'Bird', 'Boat', 'Bottle', + 'Bus', 'Car', 'Cat', 'Chair', 'Cow', 'Diningtable', + 'Dog', 'Horse','Motorbike', 'Person', 'Pottedplant', + 'Sheep', 'Sofa', 'Train', 'Tvmonitor'] +NUM_CLASSES = len(voc_classes) + 1 + +network_size = 300 +input_shape=(network_size, network_size, 3) +model = SSD300v2(input_shape, num_classes=NUM_CLASSES) +model.load_weights('weights_SSD300.hdf5', by_name=True) +bbox_util = BBoxUtility(NUM_CLASSES) + +inputs = [] +images = [] + + +def get_image_from_path(img_path): + img = image.load_img(img_path, target_size=(network_size, network_size)) + img = image.img_to_array(img) + images.append(imread(img_path)) + inputs.append(img.copy()) + +for idx in range(1292, 1293): + get_image_from_path('./GTAV/GD' + str(idx) + '.png') + +inputs = preprocess_input(np.array(inputs)) +t1 = time.time() +preds = model.predict(inputs, batch_size=1, verbose=1) +t2 = time.time() +print('elapse time {:f} fsp {:f}'.format(t2-t1, 1/(t2-t1))) +results = bbox_util.detection_out(preds) + +a = model.predict(inputs, batch_size=1) +b = bbox_util.detection_out(preds) + +norm = mpl.colors.Normalize(vmin=0., vmax=5.) 
def plot_activations(activations, plot_enable=True):
    """Tile every channel of a feature map into one grayscale mosaic.

    Channels are placed row-major on the smallest square grid that can hold
    all of them; grid cells without a channel stay zero.  The mosaic size,
    the input shape and the sum over all channels are printed.

    Args:
        activations: array indexed as ``[row, col, channel]``.  The two
            spatial extents may differ.
        plot_enable: when true (the default) the mosaic is drawn in a new
            matplotlib figure; when false only the statistics are printed
            and matplotlib is not touched.

    Returns:
        None.
    """
    height = activations.shape[0]
    width = activations.shape[1]
    num_channel = activations.shape[2]

    # Side length, in tiles, of the square grid holding all channels.
    grid = int(math.ceil(math.sqrt(num_channel)))
    map_height = height * grid
    map_width = width * grid
    print('create act map {:d} x {:d}'.format(map_height, map_width))
    act_map = np.zeros((map_height, map_width))

    print(activations.shape)
    all_sum = 0
    for idx in range(num_channel):
        # Row-major placement: channel idx lands in tile (row, col).
        row, col = divmod(idx, grid)
        act = activations[:, :, idx]
        act_map[row * height:(row + 1) * height,
                col * width:(col + 1) * width] = act
        all_sum += float(act.sum())

    print('all_sum = {:f}'.format(all_sum))
    if not plot_enable:
        return

    fig_act = plt.figure()
    plt.imshow(act_map, cmap='gray')
    fig_act.show()


def main(min_conf=0.6):
    """Show pool5 activations and confident detections for each image.

    Reads the module-level ``model``, ``inputs``, ``images``, ``results``
    and ``voc_classes`` prepared earlier in this script.

    Args:
        min_conf: detections whose confidence (column 1) is below this
            value are not drawn.
    """
    pool5_fn = K.function([model.input, K.learning_phase()],
                          [model.get_layer(name='pool5').output])
    # One forward pass for the whole batch instead of one per image.
    # NOTE(review): the learning-phase flag 1 is kept from the original
    # call; the Keras backend docs describe 1 as the training phase --
    # confirm that this is intended for visualisation.
    pool5_maps = pool5_fn([inputs, 1])[0]

    # One colour per class index; 21 matches the palette size used before.
    palette = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()

    for img_idx, img in enumerate(images):
        plot_activations(pool5_maps[img_idx])

        fig_img = plt.figure()

        # Columns read below: label, confidence, xmin, ymin, xmax, ymax.
        detections = np.asarray(results[img_idx])
        if detections.ndim == 2:
            confident = detections[detections[:, 1] >= min_conf]
        else:
            # Presumably an image without detections yields an empty,
            # non-2-D entry -- verify against detection_out; draw no boxes.
            confident = ()

        plt.imshow(img, aspect='auto')
        axis = plt.gca()
        img_height = img.shape[0]
        img_width = img.shape[1]

        for det in confident:
            # Box corners are scaled by the image width/height to get pixels.
            xmin = int(round(det[2] * img_width))
            ymin = int(round(det[3] * img_height))
            xmax = int(round(det[4] * img_width))
            ymax = int(round(det[5] * img_height))
            score = det[1]
            label = int(det[0])
            # voc_classes is indexed with label - 1.
            label_name = voc_classes[label - 1]
            display_txt = '{:0.2f}, {}'.format(score, label_name)
            color = palette[label]
            axis.add_patch(plt.Rectangle((xmin, ymin),
                                         xmax - xmin + 1,
                                         ymax - ymin + 1,
                                         fill=False,
                                         edgecolor=color,
                                         linewidth=2))
            axis.text(xmin, ymin, display_txt,
                      bbox={'facecolor': color, 'alpha': 0.5})

        fig_img.show()

    plt.show()


if __name__ == '__main__':
    main()