# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
from cntk.initializer import he_normal
from cntk.layers import AveragePooling, MaxPooling, BatchNormalization, Convolution, Dense, Dropout
from cntk.ops import minus, element_times, relu, splice

#
# This file contains the basic build blocks of the Inception network as defined in:
#
#   https://arxiv.org/pdf/1512.00567.pdf
#
# and in the Tensorflow implementation.
#

#
# Convolution layer with Batch Normalization and Rectified Linear activation.
#
def conv_bn_relu_layer(input, num_filters, filter_size, strides=(1,1), pad=True, bnTimeConst=4096, init=he_normal()):
    # Convolution is bias-free because the following BatchNormalization layer
    # supplies the learned shift/scale.
    conv = Convolution(filter_size, num_filters, activation=None, init=init,
                       pad=pad, strides=strides, bias=False)(input)
    bn = BatchNormalization(map_rank=1, normalization_time_constant=bnTimeConst,
                            use_cntk_engine=False)(conv)
    return relu(bn)

#
# Figure 5 from https://arxiv.org/pdf/1512.00567.pdf
# Modified with the added 5x5 branch to match the Tensorflow implementation.
#
# num5x5    : [reduce1x1, conv5x5]
# num3x3dbl : [reduce1x1, conv3x3, conv3x3]
#
def inception_block_1(input, num1x1, num5x5, num3x3dbl, numPool, bnTimeConst):
    # 1x1 Convolution
    branch1x1 = conv_bn_relu_layer(input, num1x1, (1,1), (1,1), True, bnTimeConst)

    # 5x5 Convolution
    branch5x5_1 = conv_bn_relu_layer(input, num5x5[0], (1,1), (1,1), True, bnTimeConst)
    branch5x5   = conv_bn_relu_layer(branch5x5_1, num5x5[1], (5,5), (1,1), True, bnTimeConst)

    # Double 3x3 Convolution
    branch3x3dbl_1 = conv_bn_relu_layer(input, num3x3dbl[0], (1,1), (1,1), True, bnTimeConst)
    branch3x3dbl_2 = conv_bn_relu_layer(branch3x3dbl_1, num3x3dbl[1], (3,3), (1,1), True, bnTimeConst)
    branch3x3dbl   = conv_bn_relu_layer(branch3x3dbl_2, num3x3dbl[2], (3,3), (1,1), True, bnTimeConst)

    # Average Pooling followed by a 1x1 projection
    branchPool_avgpool = AveragePooling((3,3), strides=(1,1), pad=True)(input)
    branchPool = conv_bn_relu_layer(branchPool_avgpool, numPool, (1,1), (1,1), True, bnTimeConst)

    # Concatenate all branches along the channel axis.
    out = splice(branch1x1, branch5x5, branch3x3dbl, branchPool, axis=0)
    return out

#
# Grid-size-reduction block (strided branches halve the spatial resolution).
#
# num3x3dbl : [reduce1x1, conv3x3, conv3x3]
#
def inception_block_2(input, num3x3, num3x3dbl, bnTimeConst):
    # 3x3 Convolution, stride 2
    branch3x3 = conv_bn_relu_layer(input, num3x3, (3,3), (2,2), False, bnTimeConst)

    # Double 3x3 Convolution, final conv strided
    branch3x3dbl_1 = conv_bn_relu_layer(input, num3x3dbl[0], (1,1), (1,1), True, bnTimeConst)
    branch3x3dbl_2 = conv_bn_relu_layer(branch3x3dbl_1, num3x3dbl[1], (3,3), (1,1), True, bnTimeConst)
    branch3x3dbl   = conv_bn_relu_layer(branch3x3dbl_2, num3x3dbl[2], (3,3), (2,2), False, bnTimeConst)

    # Max Pooling, stride 2
    branchPool = MaxPooling((3,3), strides=(2,2), pad=False)(input)

    out = splice(branch3x3, branch3x3dbl, branchPool, axis=0)
    return out

#
# Figure 6 from https://arxiv.org/pdf/1512.00567.pdf
# 7x7 convolutions factorized into asymmetric 1x7 and 7x1 convolutions.
#
# num7x7    : [reduce1x1, conv1x7, conv7x1]
# num7x7dbl : [reduce1x1, conv7x1, conv1x7, conv7x1, conv1x7]
#
def inception_block_3(input, num1x1, num7x7, num7x7dbl, numPool, bnTimeConst):
    # 1x1 Convolution
    branch1x1 = conv_bn_relu_layer(input, num1x1, (1,1), (1,1), True, bnTimeConst)

    # 7x7 Convolution
    branch7x7_1 = conv_bn_relu_layer(input, num7x7[0], (1,1), (1,1), True, bnTimeConst)
    branch7x7_2 = conv_bn_relu_layer(branch7x7_1, num7x7[1], (1,7), (1,1), True, bnTimeConst)
    branch7x7   = conv_bn_relu_layer(branch7x7_2, num7x7[2], (7,1), (1,1), True, bnTimeConst)

    # Double 7x7 Convolution
    branch7x7dbl_1 = conv_bn_relu_layer(input, num7x7dbl[0], (1,1), (1,1), True, bnTimeConst)
    branch7x7dbl_2 = conv_bn_relu_layer(branch7x7dbl_1, num7x7dbl[1], (7,1), (1,1), True, bnTimeConst)
    branch7x7dbl_3 = conv_bn_relu_layer(branch7x7dbl_2, num7x7dbl[2], (1,7), (1,1), True, bnTimeConst)
    branch7x7dbl_4 = conv_bn_relu_layer(branch7x7dbl_3, num7x7dbl[3], (7,1), (1,1), True, bnTimeConst)
    branch7x7dbl   = conv_bn_relu_layer(branch7x7dbl_4, num7x7dbl[4], (1,7), (1,1), True, bnTimeConst)

    # Average Pooling followed by a 1x1 projection
    branchPool_avgpool = AveragePooling((3,3), strides=(1,1), pad=True)(input)
    branchPool = conv_bn_relu_layer(branchPool_avgpool, numPool, (1,1), (1,1), True, bnTimeConst)

    out = splice(branch1x1, branch7x7, branch7x7dbl, branchPool, axis=0)
    return out

#
# Grid-size-reduction block used between the 17x17 and 8x8 stages.
#
# num3x3     : [reduce1x1, conv3x3]
# num7x7_3x3 : [reduce1x1, conv1x7, conv7x1, conv3x3]
#
def inception_block_4(input, num3x3, num7x7_3x3, bnTimeConst):
    # 3x3 Convolution, stride 2
    branch3x3_1 = conv_bn_relu_layer(input, num3x3[0], (1,1), (1,1), True, bnTimeConst)
    branch3x3   = conv_bn_relu_layer(branch3x3_1, num3x3[1], (3,3), (2,2), False, bnTimeConst)

    # 7x7 then 3x3 Convolution, stride 2
    branch7x7_3x3_1 = conv_bn_relu_layer(input, num7x7_3x3[0], (1,1), (1,1), True, bnTimeConst)
    branch7x7_3x3_2 = conv_bn_relu_layer(branch7x7_3x3_1, num7x7_3x3[1], (1,7), (1,1), True, bnTimeConst)
    branch7x7_3x3_3 = conv_bn_relu_layer(branch7x7_3x3_2, num7x7_3x3[2], (7,1), (1,1), True, bnTimeConst)
    branch7x7_3x3   = conv_bn_relu_layer(branch7x7_3x3_3, num7x7_3x3[3], (3,3), (2,2), False, bnTimeConst)

    # Max Pooling, stride 2
    branchPool = MaxPooling((3,3), strides=(2,2), pad=False)(input)

    out = splice(branch3x3, branch7x7_3x3, branchPool, axis=0)
    return out

#
# Figure 7 from https://arxiv.org/pdf/1512.00567.pdf
# Expanded filter-bank outputs: each 3x3 is split into parallel 1x3 / 3x1.
#
# num3x3     : [reduce1x1, conv1x3, conv3x1]
# num3x3_3x3 : [reduce1x1, conv3x3, conv1x3, conv3x1]
#
def inception_block_5(input, num1x1, num3x3, num3x3_3x3, numPool, bnTimeConst):
    # 1x1 Convolution
    branch1x1 = conv_bn_relu_layer(input, num1x1, (1,1), (1,1), True, bnTimeConst)

    # 3x3 Convolution split into parallel 1x3 and 3x1 branches
    branch3x3_1 = conv_bn_relu_layer(input, num3x3[0], (1,1), (1,1), True, bnTimeConst)
    branch3x3_2 = conv_bn_relu_layer(branch3x3_1, num3x3[1], (1,3), (1,1), True, bnTimeConst)
    branch3x3_3 = conv_bn_relu_layer(branch3x3_1, num3x3[2], (3,1), (1,1), True, bnTimeConst)
    branch3x3   = splice(branch3x3_2, branch3x3_3, axis=0)

    # 3x3 then split 3x3 Convolution
    branch3x3_3x3_1 = conv_bn_relu_layer(input, num3x3_3x3[0], (1,1), (1,1), True, bnTimeConst)
    branch3x3_3x3_2 = conv_bn_relu_layer(branch3x3_3x3_1, num3x3_3x3[1], (3,3), (1,1), True, bnTimeConst)
    # BUGFIX: the (1,3) branch must take its own filter count num3x3_3x3[2];
    # the original used num3x3_3x3[1] (harmless only while [1] == [2], as in
    # the 384/384 configuration below, but wrong for general filter lists).
    branch3x3_3x3_3 = conv_bn_relu_layer(branch3x3_3x3_2, num3x3_3x3[2], (1,3), (1,1), True, bnTimeConst)
    branch3x3_3x3_4 = conv_bn_relu_layer(branch3x3_3x3_2, num3x3_3x3[3], (3,1), (1,1), True, bnTimeConst)
    branch3x3_3x3   = splice(branch3x3_3x3_3, branch3x3_3x3_4, axis=0)

    # Average Pooling followed by a 1x1 projection
    branchPool_avgpool = AveragePooling((3,3), strides=(1,1), pad=True)(input)
    branchPool = conv_bn_relu_layer(branchPool_avgpool, numPool, (1,1), (1,1), True, bnTimeConst)

    out = splice(branch1x1, branch3x3, branch3x3_3x3, branchPool, axis=0)
    return out

#
# Inception V3 model with normalized input, to use the below function
# remove "ImageNet1K_mean.xml" from each reader.
#
def inception_v3_norm_model(input, labelDim, dropRate, bnTimeConst):
    # Normalize inputs to the range [-1, 1).
    featMean = 128
    # Explicit float literal so the scale is correct under Python 2
    # integer division as well (1/128 would truncate to 0 there).
    featScale = 1.0 / 128
    input_subtracted = minus(input, featMean)
    input_scaled = element_times(input_subtracted, featScale)
    return inception_v3_model(input_scaled, labelDim, dropRate, bnTimeConst)

#
# Inception V3 model
#
# Returns a dictionary with the main classifier 'z' and the auxiliary
# classifier 'aux' (attached to the last 17x17 block, per the paper).
#
def inception_v3_model(input, labelDim, dropRate, bnTimeConst):
    # 299 x 299 x 3
    conv1 = conv_bn_relu_layer(input, 32, (3,3), (2,2), False, bnTimeConst)
    # 149 x 149 x 32
    conv2 = conv_bn_relu_layer(conv1, 32, (3,3), (1,1), False, bnTimeConst)
    # 147 x 147 x 32
    conv3 = conv_bn_relu_layer(conv2, 64, (3,3), (1,1), True, bnTimeConst)
    # 147 x 147 x 64
    pool1 = MaxPooling(filter_shape=(3,3), strides=(2,2), pad=False)(conv3)
    # 73 x 73 x 64
    conv4 = conv_bn_relu_layer(pool1, 80, (1,1), (1,1), False, bnTimeConst)
    # 73 x 73 x 80
    conv5 = conv_bn_relu_layer(conv4, 192, (3,3), (1,1), False, bnTimeConst)
    # 71 x 71 x 192
    pool2 = MaxPooling(filter_shape=(3,3), strides=(2,2), pad=False)(conv5)
    # 35 x 35 x 192

    #
    # Inception Blocks
    #
    mixed1 = inception_block_1(pool2, 64, [48, 64], [64, 96, 96], 32, bnTimeConst)
    # 35 x 35 x 256
    mixed2 = inception_block_1(mixed1, 64, [48, 64], [64, 96, 96], 64, bnTimeConst)
    # 35 x 35 x 288
    mixed3 = inception_block_1(mixed2, 64, [48, 64], [64, 96, 96], 64, bnTimeConst)
    # 35 x 35 x 288
    mixed4 = inception_block_2(mixed3, 384, [64, 96, 96], bnTimeConst)
    # 17 x 17 x 768
    mixed5 = inception_block_3(mixed4, 192, [128, 128, 192], [128, 128, 128, 128, 192], 192, bnTimeConst)
    # 17 x 17 x 768
    mixed6 = inception_block_3(mixed5, 192, [160, 160, 192], [160, 160, 160, 160, 192], 192, bnTimeConst)
    # 17 x 17 x 768
    mixed7 = inception_block_3(mixed6, 192, [160, 160, 192], [160, 160, 160, 160, 192], 192, bnTimeConst)
    # 17 x 17 x 768
    mixed8 = inception_block_3(mixed7, 192, [192, 192, 192], [192, 192, 192, 192, 192], 192, bnTimeConst)
    # 17 x 17 x 768
    mixed9 = inception_block_4(mixed8, [192, 320], [192, 192, 192, 192], bnTimeConst)
    # 8 x 8 x 1280
    mixed10 = inception_block_5(mixed9, 320, [384, 384, 384], [448, 384, 384, 384], 192, bnTimeConst)
    # 8 x 8 x 2048
    mixed11 = inception_block_5(mixed10, 320, [384, 384, 384], [448, 384, 384, 384], 192, bnTimeConst)
    # 8 x 8 x 2048

    #
    # Prediction
    #
    pool3 = AveragePooling(filter_shape=(8,8), pad=False)(mixed11)
    # 1 x 1 x 2048
    drop = Dropout(dropout_rate=dropRate)(pool3)
    # 1 x 1 x 2048
    z = Dense(labelDim, init=he_normal())(drop)

    #
    # Auxiliary classifier (training-time regularizer, attached at mixed8)
    #
    # 17 x 17 x 768
    auxPool = AveragePooling(filter_shape=(5,5), strides=(3,3), pad=False)(mixed8)
    # 5 x 5 x 768
    auxConv1 = conv_bn_relu_layer(auxPool, 128, (1,1), (1,1), True, bnTimeConst)
    # 5 x 5 x 128
    auxConv2 = conv_bn_relu_layer(auxConv1, 768, (5,5), (1,1), False, bnTimeConst)
    # 1 x 1 x 768
    aux = Dense(labelDim, init=he_normal())(auxConv2)

    return {
        'z':   z,
        'aux': aux
    }