Starting with this post, actual development begins.
As the first stage of development, we focus on building the overall algorithm and test whether the device works correctly with an ordinary PC as the terminal rather than a Jetson. Speed-ups, miniaturization, and performance improvements are deferred to the second stage.
Model and Program Creation
First, we need to build an AI for person detection. The current mainstream detectors are SSD, YOLO, RetinaNet, and R-CNN, and these models form the basis of the model used here.
For example, building a model based on the MobileNet-SSD implementations at
https://github.com/chuanqi305/MobileNet-SSD
https://github.com/xiaochus/MobileNetV2/blob/master/mobilenet_v2.py
https://github.com/tanakataiki/ssd_kerasV2
gives the following:
import keras.backend as K
from keras.models import Model
from keras.layers import Input
from keras.layers import Conv2D, GlobalAveragePooling2D, SeparableConv2D
from keras.layers import Activation, Dropout, BatchNormalization, add, Reshape
from keras.layers import AlphaDropout, GaussianDropout
from keras.layers import Flatten
from keras.layers import DepthwiseConv2D
from keras.layers.merge import concatenate
from keras.activations import relu
from .ssd_layers import PriorBox


def relu6(x):
    return relu(x, alpha=0.0, max_value=6, threshold=0.0)


def MobileNetV2(input_shape):
    # Reference: https://github.com/xiaochus/MobileNetV2/blob/master/mobilenet_v2.py
    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1

    def _conv_block(inputs, filters, kernel, strides):
        """Convolution Block

        This function defines a 2D convolution operation with BN and relu6.

        # Arguments
            inputs: Tensor, input tensor of conv layer.
            filters: Integer, the dimensionality of the output space.
            kernel: An integer or tuple/list of 2 integers, specifying the
                width and height of the 2D convolution window.
            strides: An integer or tuple/list of 2 integers, specifying the
                strides of the convolution along the width and height.
                Can be a single integer to specify the same value for all
                spatial dimensions.

        # Returns
            Output tensor.
        """
        x = Conv2D(filters, kernel, padding='same', strides=strides)(inputs)
        x = BatchNormalization(axis=channel_axis)(x)
        return Activation(relu6)(x)

    def _bottleneck(inputs, filters, kernel, t, s, r=False):
        """Bottleneck

        This function defines a basic bottleneck structure.

        # Arguments
            inputs: Tensor, input tensor of conv layer.
            filters: Integer, the dimensionality of the output space.
            kernel: An integer or tuple/list of 2 integers, specifying the
                width and height of the 2D convolution window.
            t: Integer, expansion factor. t is always applied to the input size.
            s: An integer or tuple/list of 2 integers, specifying the strides
                of the convolution along the width and height. Can be a single
                integer to specify the same value for all spatial dimensions.
            r: Boolean, whether to use the residual connection.

        # Returns
            Output tensor.
        """
        tchannel = K.int_shape(inputs)[channel_axis] * t
        x = _conv_block(inputs, tchannel, (1, 1), (1, 1))
        x = DepthwiseConv2D(kernel, strides=(s, s), depth_multiplier=1, padding='same')(x)
        x = BatchNormalization(axis=channel_axis)(x)
        x = Activation(relu6)(x)
        x = Conv2D(filters, (1, 1), strides=(1, 1), padding='same')(x)
        x = BatchNormalization(axis=channel_axis)(x)
        if r:
            x = add([x, inputs])
        return x

    def _inverted_residual_block(inputs, filters, kernel, t, strides, n):
        """Inverted Residual Block

        This function defines a sequence of 1 or more identical layers.

        # Arguments
            inputs: Tensor, input tensor of conv layer.
            filters: Integer, the dimensionality of the output space.
            kernel: An integer or tuple/list of 2 integers, specifying the
                width and height of the 2D convolution window.
            t: Integer, expansion factor. t is always applied to the input size.
            strides: An integer or tuple/list of 2 integers, specifying the
                strides of the convolution along the width and height.
            n: Integer, layer repeat times.

        # Returns
            Output tensor.
        """
        x = _bottleneck(inputs, filters, kernel, t, strides)
        for i in range(1, n):
            x = _bottleneck(x, filters, kernel, t, 1, True)
        return x

    inputs = Input(shape=input_shape)
    x = _conv_block(inputs, 32, (3, 3), strides=(2, 2))
    x = _inverted_residual_block(x, 16, (3, 3), t=1, strides=1, n=1)
    x = _inverted_residual_block(x, 24, (3, 3), t=6, strides=2, n=2)
    x = _inverted_residual_block(x, 32, (3, 3), t=6, strides=2, n=3)
    x = _inverted_residual_block(x, 64, (3, 3), t=6, strides=2, n=4)
    x = _inverted_residual_block(x, 96, (3, 3), t=6, strides=1, n=3)
    x = _inverted_residual_block(x, 160, (3, 3), t=6, strides=2, n=3)
    # x = _inverted_residual_block(x, 320, (3, 3), t=6, strides=1, n=1)
    last_tchannel = K.int_shape(x)[channel_axis] * 6
    x = _conv_block(x, last_tchannel, (1, 1), (1, 1))
    x = DepthwiseConv2D((3, 3), strides=(1, 1), depth_multiplier=1, padding='same')(x)
    x = BatchNormalization(axis=channel_axis)(x)
    x = Activation(relu6)(x)
    # MobileNet classification top (unused for SSD):
    '''
    x = _conv_block(x, 1280, (1, 1), strides=(1, 1))
    x = GlobalAveragePooling2D()(x)
    x = Reshape((1, 1, 1280))(x)
    x = Dropout(0.3, name='Dropout')(x)
    x = Conv2D(k, (1, 1), padding='same')(x)
    x = Activation('softmax', name='softmax')(x)
    output = Reshape((k,))(x)
    '''
    return Model(inputs, x)


def SSD(input_shape, num_classes):
    """SSD300 MobileNet architecture.

    # Arguments
        input_shape: Shape of the input image, expected to be either
            (300, 300, 3) or (3, 300, 300) (not tested).
        num_classes: Number of classes including background.

    # References
        https://github.com/chuanqi305/MobileNet-SSD
        https://github.com/tanakataiki/ssd_kerasV2
    """
    img_size = (input_shape[1], input_shape[0])
    input_shape = (input_shape[1], input_shape[0], 3)
    mobilenet_input_shape = (224, 224, 3)
    net = {}
    net['input'] = Input(input_shape)
    FeatureExtractor = MobileNetV2(input_shape)
    net['mobilenet_conv_dw_11_relu'] = FeatureExtractor(net['input'])
    net['conv11'] = Conv2D(512, (1, 1), padding='same', name='conv11')(net['mobilenet_conv_dw_11_relu'])
    net['conv11'] = BatchNormalization(momentum=0.99, name='bn11')(net['conv11'])
    net['conv11'] = Activation('relu')(net['conv11'])
    # Block (19, 19)
    net['conv12dw'] = SeparableConv2D(512, (3, 3), strides=(2, 2), padding='same', name='conv12dw')(net['conv11'])
    net['conv12dw'] = BatchNormalization(momentum=0.99, name='bn12dw')(net['conv12dw'])
    net['conv12dw'] = Activation('relu')(net['conv12dw'])
    net['conv12'] = Conv2D(1024, (1, 1), padding='same', name='conv12')(net['conv12dw'])
    net['conv12'] = BatchNormalization(momentum=0.99, name='bn12')(net['conv12'])
    net['conv12'] = Activation('relu')(net['conv12'])
    net['conv13dw'] = SeparableConv2D(1024, (3, 3), padding='same', name='conv13dw')(net['conv12'])
    net['conv13dw'] = BatchNormalization(momentum=0.99, name='bn13dw')(net['conv13dw'])
    net['conv13dw'] = Activation('relu')(net['conv13dw'])
    net['conv13'] = Conv2D(1024, (1, 1), padding='same', name='conv13')(net['conv13dw'])
    net['conv13'] = BatchNormalization(momentum=0.99, name='bn13')(net['conv13'])
    net['conv13'] = Activation('relu')(net['conv13'])
    net['conv14_1'] = Conv2D(256, (1, 1), padding='same', name='conv14_1')(net['conv13'])
    net['conv14_1'] = BatchNormalization(momentum=0.99, name='bn14_1')(net['conv14_1'])
    net['conv14_1'] = Activation('relu')(net['conv14_1'])
    net['conv14_2'] = Conv2D(512, (3, 3), strides=(2, 2), padding='same', name='conv14_2')(net['conv14_1'])
    net['conv14_2'] = BatchNormalization(momentum=0.99, name='bn14_2')(net['conv14_2'])
    net['conv14_2'] = Activation('relu')(net['conv14_2'])
    net['conv15_1'] = Conv2D(128, (1, 1), padding='same', name='conv15_1')(net['conv14_2'])
    net['conv15_1'] = BatchNormalization(momentum=0.99, name='bn15_1')(net['conv15_1'])
    net['conv15_1'] = Activation('relu')(net['conv15_1'])
    net['conv15_2'] = Conv2D(256, (3, 3), strides=(2, 2), padding='same', name='conv15_2')(net['conv15_1'])
    net['conv15_2'] = BatchNormalization(momentum=0.99, name='bn15_2')(net['conv15_2'])
    net['conv15_2'] = Activation('relu')(net['conv15_2'])
    net['conv16_1'] = Conv2D(128, (1, 1), padding='same', name='conv16_1')(net['conv15_2'])
    net['conv16_1'] = BatchNormalization(momentum=0.99, name='bn16_1')(net['conv16_1'])
    net['conv16_1'] = Activation('relu')(net['conv16_1'])
    net['conv16_2'] = Conv2D(256, (3, 3), strides=(2, 2), padding='same', name='conv16_2')(net['conv16_1'])
    net['conv16_2'] = BatchNormalization(momentum=0.99, name='bn16_2')(net['conv16_2'])
    net['conv16_2'] = Activation('relu')(net['conv16_2'])
    net['conv17_1'] = Conv2D(64, (1, 1), padding='same', name='conv17_1')(net['conv16_2'])
    net['conv17_1'] = BatchNormalization(momentum=0.99, name='bn17_1')(net['conv17_1'])
    net['conv17_1'] = Activation('relu')(net['conv17_1'])
    net['conv17_2'] = Conv2D(128, (3, 3), strides=(2, 2), padding='same', name='conv17_2')(net['conv17_1'])
    net['conv17_2'] = BatchNormalization(momentum=0.99, name='bn17_2')(net['conv17_2'])
    net['conv17_2'] = Activation('relu')(net['conv17_2'])
    # Prediction from conv11
    num_priors = 3
    x = Conv2D(num_priors * 4, (1, 1), padding='same', name='conv11_mbox_loc')(net['conv11'])
    net['conv11_mbox_loc'] = x
    flatten = Flatten(name='conv11_mbox_loc_flat')
    net['conv11_mbox_loc_flat'] = flatten(net['conv11_mbox_loc'])
    name = 'conv11_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes, (1, 1), padding='same', name=name)(net['conv11'])
    net['conv11_mbox_conf'] = x
    flatten = Flatten(name='conv11_mbox_conf_flat')
    net['conv11_mbox_conf_flat'] = flatten(net['conv11_mbox_conf'])
    priorbox = PriorBox(img_size, 60, max_size=None, aspect_ratios=[2],
                        variances=[0.1, 0.1, 0.2, 0.2], name='conv11_mbox_priorbox')
    net['conv11_mbox_priorbox'] = priorbox(net['conv11'])
    # Prediction from conv13
    num_priors = 6
    net['conv13_mbox_loc'] = Conv2D(num_priors * 4, (1, 1), padding='same', name='conv13_mbox_loc')(net['conv13'])
    flatten = Flatten(name='conv13_mbox_loc_flat')
    net['conv13_mbox_loc_flat'] = flatten(net['conv13_mbox_loc'])
    name = 'conv13_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    net['conv13_mbox_conf'] = Conv2D(num_priors * num_classes, (1, 1), padding='same', name=name)(net['conv13'])
    flatten = Flatten(name='conv13_mbox_conf_flat')
    net['conv13_mbox_conf_flat'] = flatten(net['conv13_mbox_conf'])
    priorbox = PriorBox(img_size, 105.0, max_size=150.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2], name='conv13_mbox_priorbox')
    net['conv13_mbox_priorbox'] = priorbox(net['conv13'])
    # Prediction from conv14_2
    num_priors = 6
    x = Conv2D(num_priors * 4, (1, 1), padding='same', name='conv14_2_mbox_loc')(net['conv14_2'])
    net['conv14_2_mbox_loc'] = x
    flatten = Flatten(name='conv14_2_mbox_loc_flat')
    net['conv14_2_mbox_loc_flat'] = flatten(net['conv14_2_mbox_loc'])
    name = 'conv14_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes, (1, 1), padding='same', name=name)(net['conv14_2'])
    net['conv14_2_mbox_conf'] = x
    flatten = Flatten(name='conv14_2_mbox_conf_flat')
    net['conv14_2_mbox_conf_flat'] = flatten(net['conv14_2_mbox_conf'])
    priorbox = PriorBox(img_size, 150, max_size=195.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2], name='conv14_2_mbox_priorbox')
    net['conv14_2_mbox_priorbox'] = priorbox(net['conv14_2'])
    # Prediction from conv15_2
    num_priors = 6
    x = Conv2D(num_priors * 4, (1, 1), padding='same', name='conv15_2_mbox_loc')(net['conv15_2'])
    net['conv15_2_mbox_loc'] = x
    flatten = Flatten(name='conv15_2_mbox_loc_flat')
    net['conv15_2_mbox_loc_flat'] = flatten(net['conv15_2_mbox_loc'])
    name = 'conv15_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes, (1, 1), padding='same', name=name)(net['conv15_2'])
    net['conv15_2_mbox_conf'] = x
    flatten = Flatten(name='conv15_2_mbox_conf_flat')
    net['conv15_2_mbox_conf_flat'] = flatten(net['conv15_2_mbox_conf'])
    priorbox = PriorBox(img_size, 195.0, max_size=240.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2], name='conv15_2_mbox_priorbox')
    net['conv15_2_mbox_priorbox'] = priorbox(net['conv15_2'])
    # Prediction from conv16_2
    num_priors = 6
    x = Conv2D(num_priors * 4, (1, 1), padding='same', name='conv16_2_mbox_loc')(net['conv16_2'])
    net['conv16_2_mbox_loc'] = x
    flatten = Flatten(name='conv16_2_mbox_loc_flat')
    net['conv16_2_mbox_loc_flat'] = flatten(net['conv16_2_mbox_loc'])
    name = 'conv16_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes, (1, 1), padding='same', name=name)(net['conv16_2'])
    net['conv16_2_mbox_conf'] = x
    flatten = Flatten(name='conv16_2_mbox_conf_flat')
    net['conv16_2_mbox_conf_flat'] = flatten(net['conv16_2_mbox_conf'])
    priorbox = PriorBox(img_size, 240.0, max_size=285.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2], name='conv16_2_mbox_priorbox')
    net['conv16_2_mbox_priorbox'] = priorbox(net['conv16_2'])
    # Prediction from conv17_2
    num_priors = 6
    x = Conv2D(num_priors * 4, (1, 1), padding='same', name='conv17_2_mbox_loc')(net['conv17_2'])
    net['conv17_2_mbox_loc'] = x
    flatten = Flatten(name='conv17_2_mbox_loc_flat')
    net['conv17_2_mbox_loc_flat'] = flatten(net['conv17_2_mbox_loc'])
    name = 'conv17_2_mbox_conf'
    if num_classes != 21:
        name += '_{}'.format(num_classes)
    x = Conv2D(num_priors * num_classes, (1, 1), padding='same', name=name)(net['conv17_2'])
    net['conv17_2_mbox_conf'] = x
    flatten = Flatten(name='conv17_2_mbox_conf_flat')
    net['conv17_2_mbox_conf_flat'] = flatten(net['conv17_2_mbox_conf'])
    priorbox = PriorBox(img_size, 285.0, max_size=300.0, aspect_ratios=[2, 3],
                        variances=[0.1, 0.1, 0.2, 0.2], name='conv17_2_mbox_priorbox')
    net['conv17_2_mbox_priorbox'] = priorbox(net['conv17_2'])
    # Gather all predictions
    net['mbox_loc'] = concatenate([net['conv11_mbox_loc_flat'],
                                   net['conv13_mbox_loc_flat'],
                                   net['conv14_2_mbox_loc_flat'],
                                   net['conv15_2_mbox_loc_flat'],
                                   net['conv16_2_mbox_loc_flat'],
                                   net['conv17_2_mbox_loc_flat']],
                                  axis=1, name='mbox_loc')
    net['mbox_conf'] = concatenate([net['conv11_mbox_conf_flat'],
                                    net['conv13_mbox_conf_flat'],
                                    net['conv14_2_mbox_conf_flat'],
                                    net['conv15_2_mbox_conf_flat'],
                                    net['conv16_2_mbox_conf_flat'],
                                    net['conv17_2_mbox_conf_flat']],
                                   axis=1, name='mbox_conf')
    net['mbox_priorbox'] = concatenate([net['conv11_mbox_priorbox'],
                                        net['conv13_mbox_priorbox'],
                                        net['conv14_2_mbox_priorbox'],
                                        net['conv15_2_mbox_priorbox'],
                                        net['conv16_2_mbox_priorbox'],
                                        net['conv17_2_mbox_priorbox']],
                                       axis=1, name='mbox_priorbox')
    if hasattr(net['mbox_loc'], '_keras_shape'):
        num_boxes = net['mbox_loc']._keras_shape[-1] // 4
    elif hasattr(net['mbox_loc'], 'int_shape'):
        num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4
    net['mbox_loc'] = Reshape((num_boxes, 4), name='mbox_loc_final')(net['mbox_loc'])
    net['mbox_conf'] = Reshape((num_boxes, num_classes), name='mbox_conf_logits')(net['mbox_conf'])
    net['mbox_conf'] = Activation('softmax', name='mbox_conf_final')(net['mbox_conf'])
    net['predictions'] = concatenate([net['mbox_loc'],
                                      net['mbox_conf'],
                                      net['mbox_priorbox']],
                                     axis=2, name='predictions')
    model = Model(inputs=net['input'], outputs=net['predictions'])
    return model


if __name__ == "__main__":
    model = SSD((224, 224, 3), 2)
    model.summary()
However, the output size of the model above turned out to be wrong, so for the time being we substituted another model and trained it instead.
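The size mismatch can be spotted directly from the model's output tensor: in this family of Keras SSD implementations, each box in the prediction tensor carries 4 location offsets, num_classes confidence scores, and 8 prior-box values (4 coordinates + 4 variances). A minimal check, as a sketch assuming the SSD() function above builds without errors:

# Sanity check of the detector output (sketch; SSD() is the function above).
model = SSD((224, 224, 3), 2)   # 2 classes: person + background
model.summary()
# Expected last dimension per box: 4 (loc) + 2 (conf) + 8 (prior + variances) = 14,
# i.e. output shape (None, num_boxes, 14); anything else signals the size bug.
print(model.output_shape)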
Testing the trained model on color images, it detected people well even in dark, rainy scenes and even when the person appeared small in the frame, confirming that the model is practical.
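For reference, a single-image test with a trained model looks roughly like the following sketch. The file names 'weights.h5' and 'test.jpg' are placeholders, and decoding the raw predictions into final boxes (offset decoding plus NMS) is handled by the BBoxUtility class in the referenced ssd_kerasV2 repository, so it is omitted here.

import cv2
import numpy as np

# Sketch of a single-image test; file names are placeholders.
model = SSD((224, 224, 3), 2)                 # model-building function from above
model.load_weights('weights.h5')              # hypothetical weights file

img = cv2.imread('test.jpg')                  # hypothetical test image (BGR)
x = cv2.resize(img, (224, 224)).astype(np.float32)
x = np.expand_dims(x, axis=0)                 # add the batch dimension
preds = model.predict(x)                      # shape (1, num_boxes, 4 + 2 + 8)
# preds is then decoded into boxes with ssd_kerasV2's BBoxUtility (omitted).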
We also wrote the overall program, including data acquisition from the camera, created three kinds of warning sounds, and put together the algorithm for the safety device; a sketch of the main loop follows below.
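The loop below is an illustrative sketch of that algorithm, not the actual implementation: detect() stands in for the SSD inference and box decoding, the area thresholds and the three .wav file names are assumptions, and playback uses aplay, which is available on Ubuntu.

import subprocess
import cv2

# Placeholder names for the three warning sounds.
CAUTION, WARNING, DANGER = 'caution.wav', 'warning.wav', 'danger.wav'

def detect(frame):
    # Placeholder: in the real program this runs the SSD model on the frame
    # and returns detected person boxes as (x, y, w, h) tuples.
    return []

def pick_sound(boxes, frame_area):
    # Choose a warning level from how large the nearest person appears;
    # the threshold values here are illustrative, not the tuned ones.
    if not boxes:
        return None
    largest = max(w * h for (x, y, w, h) in boxes) / frame_area
    if largest > 0.30:
        return DANGER
    if largest > 0.10:
        return WARNING
    return CAUTION

cap = cv2.VideoCapture(0)                     # data acquisition from the camera
while True:
    ok, frame = cap.read()
    if not ok:
        break
    sound = pick_sound(detect(frame), frame.shape[0] * frame.shape[1])
    if sound is not None:
        subprocess.call(['aplay', sound])     # play the chosen warning sound
cap.release()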
PC Assembly
Since speed-up and miniaturization come later, an ordinary PC is used as the terminal. We built the following PC.
It ended up weighing about 20 kg. Ubuntu was installed as the OS.
We ported the program created earlier to this PC and ran tests.
End of the First Stage
As the first stage of development, we were able to build the overall algorithm and verify through experiments that the device runs correctly with a PC as the terminal.
From here, the second stage of development will tackle fixing and improving the remaining issues, speeding up the system, and miniaturizing it.