iTAC_Technical_Documents

アイタックソリューションズ株式会社

ブログ名

第3回 第一段階目の開発

今回から、実際に開発を開始します。

まず第一段階の開発として、アルゴリズム全体の作成に注力し、Jetsonではなく普通のパソコンを端末として装置が正常に動くか実験します。高速化や小型化、性能向上などは第二段階の開発に回します。

モデル、プログラム作成

まずは人認識のAIを作る必要があります。現在の人認識の主流は、SSD/YOLO/RetinaNet/R-CNNです。これらのモデルが今回のモデルの基本となります。

例えば、mobilenet-SSDのプログラム、

https://github.com/chuanqi305/MobileNet-SSD https://github.com/xiaochus/MobileNetV2/blob/master/mobilenet_v2.py https://github.com/tanakataiki/ssd_kerasV2

を参考にして、モデルを作ると以下のようになります。

import keras.backend as K
from keras.models import Model
from keras.layers import Input
from keras.layers import Conv2D, GlobalAveragePooling2D, SeparableConv2D
from keras.layers import Activation, Dropout, BatchNormalization, add, Reshape
from keras.layers import AlphaDropout,GaussianDropout
from keras.layers import Flatten

from keras.layers import Activation
from keras.layers.merge import concatenate
from keras.layers import Reshape

from keras.models import Sequential
from keras.layers import DepthwiseConv2D
from .ssd_layers import PriorBox

from keras.activations import relu

def relu6(x):
    """Apply ReLU with the output capped at 6.

    Thin wrapper around ``keras.activations.relu`` that fixes ``alpha=0.0``,
    ``max_value=6`` and ``threshold=0.0`` so the function can be handed to
    ``Activation`` as a single-argument callable.

    # Arguments
        x: Input tensor.
    # Returns
        The result of the ``relu`` call.
    """
    relu_kwargs = dict(alpha=0.0, max_value=6, threshold=0.0)
    return relu(x, **relu_kwargs)

def MobileNetV2(input_shape):
    """Build a truncated MobileNetV2 used as a feature extractor.

    The network consists of a strided 3x3 convolution, six inverted-residual
    stages, and one final 1x1 expansion followed by a 3x3 depthwise
    convolution. No classification head is attached: the returned model
    outputs the last feature map.

    Reference: https://github.com/xiaochus/MobileNetV2/blob/master/mobilenet_v2.py

    # Arguments
        input_shape: Shape tuple handed to `Input` (batch axis excluded).
    # Returns
        A Keras `Model` mapping the input tensor to the final feature map.
    """

    # Axis that holds the channels under the active Keras data format.
    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1

    def _bn_relu6(tensor):
        """Apply batch normalization followed by the relu6 activation."""
        normalized = BatchNormalization(axis=channel_axis)(tensor)
        return Activation(relu6)(normalized)

    def _conv_block(inputs, filters, kernel, strides):
        """Convolution Block
        2D convolution ('same' padding) followed by BN and relu6.
        # Arguments
            inputs: Tensor, input tensor of conv layer.
            filters: Integer, the dimensionality of the output space.
            kernel: An integer or tuple/list of 2 integers, specifying the
                width and height of the 2D convolution window.
            strides: An integer or tuple/list of 2 integers, specifying the
                strides of the convolution along the width and height.
        # Returns
            Output tensor.
        """
        convolved = Conv2D(filters, kernel, padding='same', strides=strides)(inputs)
        return _bn_relu6(convolved)

    def _expand_depthwise(inputs, kernel, t, s):
        """Expansion + depthwise stage
        1x1 convolution widening the channels by factor `t`, followed by a
        depthwise convolution with stride `s`; both are followed by BN and
        relu6.
        # Arguments
            inputs: Tensor, input tensor.
            kernel: Kernel size of the depthwise convolution.
            t: Integer, expansion factor applied to the input channel count.
            s: Integer, stride used for both spatial dimensions.
        # Returns
            Output tensor.
        """
        expanded_channels = K.int_shape(inputs)[channel_axis] * t
        expanded = _conv_block(inputs, expanded_channels, (1, 1), (1, 1))
        depthwise = DepthwiseConv2D(
            kernel, strides=(s, s), depth_multiplier=1, padding='same')(expanded)
        return _bn_relu6(depthwise)

    def _bottleneck(inputs, filters, kernel, t, s, r=False):
        """Bottleneck
        Expansion, depthwise convolution and a linear 1x1 projection
        (BN without activation), optionally with a residual connection.
        # Arguments
            inputs: Tensor, input tensor of conv layer.
            filters: Integer, the dimensionality of the output space.
            kernel: Kernel size of the depthwise convolution.
            t: Integer, expansion factor applied to the input channel count.
            s: Integer, stride of the depthwise convolution.
            r: Boolean, whether to add `inputs` to the projected output.
        # Returns
            Output tensor.
        """
        features = _expand_depthwise(inputs, kernel, t, s)
        projected = Conv2D(filters, (1, 1), strides=(1, 1), padding='same')(features)
        projected = BatchNormalization(axis=channel_axis)(projected)
        if not r:
            return projected
        return add([projected, inputs])

    def _inverted_residual_block(inputs, filters, kernel, t, strides, n):
        """Inverted Residual Block
        One bottleneck with the given stride and no residual, followed by
        `n - 1` stride-1 bottlenecks with residual connections.
        # Arguments
            inputs: Tensor, input tensor of conv layer.
            filters: Integer, the dimensionality of the output space.
            kernel: Kernel size of the depthwise convolutions.
            t: Integer, expansion factor applied to the input channel count.
            strides: Integer, stride of the first bottleneck.
            n: Integer, total number of bottlenecks in the block.
        # Returns
            Output tensor.
        """
        out = _bottleneck(inputs, filters, kernel, t, strides)
        for _ in range(n - 1):
            out = _bottleneck(out, filters, kernel, t, 1, True)
        return out

    inputs = Input(shape=input_shape)
    x = _conv_block(inputs, 32, (3, 3), strides=(2, 2))

    # (filters, expansion factor t, stride of first bottleneck, repeats n).
    # The 320-filter stage (t=6, strides=1, n=1) of the reference network is
    # left out here.
    stage_settings = (
        (16, 1, 1, 1),
        (24, 6, 2, 2),
        (32, 6, 2, 3),
        (64, 6, 2, 4),
        (96, 6, 1, 3),
        (160, 6, 2, 3),
    )
    for filters, t, strides, n in stage_settings:
        x = _inverted_residual_block(x, filters, (3, 3), t=t, strides=strides, n=n)

    # Final x6 expansion and 3x3 depthwise convolution. The classification
    # top of the reference network (1280-filter conv, global pooling,
    # dropout, softmax) is not built.
    x = _expand_depthwise(x, (3, 3), 6, 1)

    return Model(inputs, x)


def SSD(input_shape, num_classes):
    """Build an SSD detector on top of the MobileNetV2 feature extractor.

    # Arguments
        input_shape: Shape of the input image. Only the first two entries
            are read: they are swapped to form both the network input shape
            and the `img_size` handed to `PriorBox`, and the channel count
            is fixed to 3. Expected to be (300, 300, 3) per the reference
            implementation.
        num_classes: Number of classes including background.

    # Returns
        A Keras `Model` whose output concatenates, along the last axis, the
        box offsets (4 values), the softmax class scores (`num_classes`
        values) and the `PriorBox` output for every prior box.

    # References
        https://github.com/chuanqi305/MobileNet-SSD
        https://github.com/tanakataiki/ssd_kerasV2
    """

    # NOTE(review): the first two entries are swapped for both the prior-box
    # image size and the network input. This is only unambiguous for square
    # inputs -- confirm the intended (height, width) order before feeding
    # non-square images.
    img_size = (input_shape[1], input_shape[0])
    input_shape = (input_shape[1], input_shape[0], 3)

    net = {}
    net['input'] = Input(input_shape)
    FeatureExtractor = MobileNetV2(input_shape)
    net['mobilenet_conv_dw_11_relu'] = FeatureExtractor(net['input'])

    # Extra feature layers stacked on the backbone output, applied in order.
    # Each entry: (layer class, layer name, filters, kernel size, strides).
    extra_layers = [
        (Conv2D, 'conv11', 512, (1, 1), (1, 1)),
        (SeparableConv2D, 'conv12dw', 512, (3, 3), (2, 2)),
        (Conv2D, 'conv12', 1024, (1, 1), (1, 1)),
        (SeparableConv2D, 'conv13dw', 1024, (3, 3), (1, 1)),
        (Conv2D, 'conv13', 1024, (1, 1), (1, 1)),
        (Conv2D, 'conv14_1', 256, (1, 1), (1, 1)),
        (Conv2D, 'conv14_2', 512, (3, 3), (2, 2)),
        (Conv2D, 'conv15_1', 128, (1, 1), (1, 1)),
        (Conv2D, 'conv15_2', 256, (3, 3), (2, 2)),
        (Conv2D, 'conv16_1', 128, (1, 1), (1, 1)),
        (Conv2D, 'conv16_2', 256, (3, 3), (2, 2)),
        (Conv2D, 'conv17_1', 64, (1, 1), (1, 1)),
        (Conv2D, 'conv17_2', 128, (3, 3), (2, 2)),
    ]
    x = net['mobilenet_conv_dw_11_relu']
    for layer_cls, layer_name, filters, kernel, strides in extra_layers:
        x = layer_cls(filters, kernel, strides=strides, padding='same',
                      name=layer_name)(x)
        # 'convXX' is paired with a batch-norm layer named 'bnXX'.
        bn_name = 'bn' + layer_name[len('conv'):]
        x = BatchNormalization(momentum=0.99, name=bn_name)(x)
        x = Activation('relu')(x)
        net[layer_name] = x

    # Prediction heads.
    # Each entry: (source feature map, priors per location, PriorBox
    # min_size, PriorBox max_size, PriorBox aspect_ratios).
    prediction_heads = [
        ('conv11', 3, 60, None, [2]),
        ('conv13', 6, 105.0, 150.0, [2, 3]),
        ('conv14_2', 6, 150, 195.0, [2, 3]),
        ('conv15_2', 6, 195.0, 240.0, [2, 3]),
        ('conv16_2', 6, 240.0, 285.0, [2, 3]),
        ('conv17_2', 6, 285.0, 300.0, [2, 3]),
    ]
    for source, num_priors, min_size, max_size, aspect_ratios in prediction_heads:
        feature_map = net[source]

        # Box offset regression: 4 values per prior.
        loc_key = source + '_mbox_loc'
        net[loc_key] = Conv2D(num_priors * 4, (1, 1), padding='same',
                              name=loc_key)(feature_map)
        net[loc_key + '_flat'] = Flatten(name=loc_key + '_flat')(net[loc_key])

        # Class confidences: num_classes values per prior. The layer name
        # gets the class count appended whenever it differs from 21
        # (presumably so 21-class weights loaded by name skip this layer --
        # TODO confirm).
        conf_key = source + '_mbox_conf'
        conf_layer_name = conf_key
        if num_classes != 21:
            conf_layer_name += '_{}'.format(num_classes)
        net[conf_key] = Conv2D(num_priors * num_classes, (1, 1), padding='same',
                               name=conf_layer_name)(feature_map)
        net[conf_key + '_flat'] = Flatten(name=conf_key + '_flat')(net[conf_key])

        # Prior (anchor) boxes for this feature map.
        prior_key = source + '_mbox_priorbox'
        priorbox = PriorBox(img_size, min_size, max_size=max_size,
                            aspect_ratios=aspect_ratios,
                            variances=[0.1, 0.1, 0.2, 0.2],
                            name=prior_key)
        net[prior_key] = priorbox(feature_map)

    # Gather all predictions, keeping the head order above.
    sources = [head[0] for head in prediction_heads]
    net['mbox_loc'] = concatenate(
        [net[source + '_mbox_loc_flat'] for source in sources],
        axis=1, name='mbox_loc')
    net['mbox_conf'] = concatenate(
        [net[source + '_mbox_conf_flat'] for source in sources],
        axis=1, name='mbox_conf')
    net['mbox_priorbox'] = concatenate(
        [net[source + '_mbox_priorbox'] for source in sources],
        axis=1, name='mbox_priorbox')

    # The flattened localization vector holds 4 values per box. Reading the
    # static shape through the backend always binds num_boxes; the previous
    # attribute probing on the tensor could leave it undefined.
    num_boxes = K.int_shape(net['mbox_loc'])[-1] // 4

    net['mbox_loc'] = Reshape((num_boxes, 4), name='mbox_loc_final')(net['mbox_loc'])
    net['mbox_conf'] = Reshape((num_boxes, num_classes),
                               name='mbox_conf_logits')(net['mbox_conf'])
    net['mbox_conf'] = Activation('softmax', name='mbox_conf_final')(net['mbox_conf'])
    net['predictions'] = concatenate(
        [net['mbox_loc'], net['mbox_conf'], net['mbox_priorbox']],
        axis=2, name='predictions')
    model = Model(inputs=net['input'], outputs=net['predictions'])
    return model


if __name__ == "__main__":
    # Smoke test: build a 2-class detector for 224x224 RGB input and print
    # its layer summary.
    demo_input_shape = (224, 224, 3)
    demo_num_classes = 2
    model = SSD(demo_input_shape, demo_num_classes)
    model.summary()

しかし、上記のモデルではOutputのサイズが間違っていたので、一旦他のモデルを代用して学習させることにしました。

学習したモデルで実際にテストしてみると、カラー画像の場合ですが、雨の中暗い場合でも、小さな人でも、よく検知できていました。実用的なモデルであることが確認できました。

また、カメラからのデータ取得などの全体のプログラムの作成、3種類の警告音の作成を行い、安全装置としてのアルゴリズムを作成しました。

パソコンの組み立て

高速化、小型化は後回しにするので、普通のパソコンを端末として使用します。以下のようなパソコンを作成しました。

f:id:iTD_GRP:20191130035740j:plain

f:id:iTD_GRP:20191130035752j:plain

重さは約20kgほどになりました。OSはUbuntuをインストールしました。

このパソコンに、前回作成したプログラムを移植してテストを行いました。

第一段階終了

第一段階の開発として、アルゴリズム全体の作成、パソコンを端末として装置が正常に動くか実験をすることが出来ました。

以降は第二段階の開発として、問題点の修正や改良、高速化、小型化などに着手していきます。


次の記事へ

前の記事へ 目次に戻る