# source/ch02/10_denseNETcifar10.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
100-Layer DenseNet model implementation on classifying the
CIFAR10 dataset.

With data augmentation:
Greater than 93.55% test accuracy in 200 epochs
225sec per epoch on GTX 1080Ti

Densely Connected Convolutional Networks
    https://arxiv.org/pdf/1608.06993.pdf
    http://openaccess.thecvf.com/content_cvpr_2017/papers/Huang_Densely_Connected_Convolutional_CVPR_2017_paper.pdf

Network below is similar to 100-Layer DenseNet-BC (k=12)

"""

######################
# required libraries #
######################

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# to suppress tensorflow-gpu debug information
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from tensorflow.keras.layers import Dense, Conv2D, BatchNormalization
from tensorflow.keras.layers import MaxPooling2D, AveragePooling2D
from tensorflow.keras.layers import Input, Flatten, Dropout
from tensorflow.keras.layers import concatenate, Activation
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical
import os
import numpy as np
import math

##############
# parameters #
##############

# training parameters
# number of samples per batch, passed to model.fit() / datagen.flow()
batch_size = 32
# number of passes over the training set, passed to model.fit()
epochs = 200
# True: train on batches from ImageDataGenerator and build the network
# without Dropout; False: train on the raw arrays and insert Dropout(0.2)
data_augmentation = True

# network parameters
# number of output classes (size of the softmax layer and one-hot labels)
num_classes = 10
# number of dense blocks; also used to derive num_bottleneck_layers
num_dense_blocks = 3
# NOTE(review): not referenced anywhere in the visible code
use_max_pool = False

##########################
# learning rate schedule #
##########################

def lr_schedule(epoch):

    """
    Learning Rate Schedule

    Starts from a base rate of 1e-3 and scales it by a single factor
    chosen from the epoch index:

        epoch <= 80   -> x 1      (1e-3)
        epoch >  80   -> x 1e-1
        epoch > 120   -> x 1e-2
        epoch > 160   -> x 1e-3
        epoch > 180   -> x 0.5e-3

    The selected rate is printed before it is returned. Intended to be
    wrapped in a LearningRateScheduler callback.

    # Arguments
        epoch (int): index of the current epoch

    # Returns
        lr (float): learning rate to use for that epoch

    """

    base_lr = 1e-3

    # (epoch threshold, multiplier) pairs, highest threshold first so that
    # the first matching entry is the only one applied
    decay_table = (
        (180, 0.5e-3),
        (160, 1e-3),
        (120, 1e-2),
        (80, 1e-1),
    )

    lr = base_lr
    for threshold, factor in decay_table:
        if epoch > threshold:
            lr *= factor
            break

    print('Learning rate: ', lr)

    return lr

##########################
# DenseNet model builder #
##########################

def denseNet(
    input_shape,
    num_filters_bef_dense_block
):

    """
    DenseNet model builder

    Builds a stack of dense blocks (each a sequence of BN-ReLU-Conv2D(1x1)
    -> BN-ReLU-Conv2D(3x3) bottleneck layers whose output is concatenated
    to their input) bridged by transition layers, adds a softmax
    classifier on top, compiles the model with RMSprop and prints its
    summary.

    Besides its arguments the function reads these module-level names:
    num_dense_blocks, num_bottleneck_layers, growth_rate,
    compression_factor, data_augmentation and num_classes. Note that
    growth_rate, num_bottleneck_layers and compression_factor are only
    assigned in the __main__ section of this file.

    # Arguments
        input_shape (tuple): shape of one input sample
        num_filters_bef_dense_block (int): number of filters of the first
            convolution; also the starting value from which the transition
            layers' filter counts are derived

    # Returns
        model (Model): compiled Keras model

    """

    # keyword arguments shared by every convolution in the network
    conv_args = dict(padding='same', kernel_initializer='he_normal')
    # Dropout is only inserted when training without data augmentation
    use_dropout = not data_augmentation
    num_filters = num_filters_bef_dense_block
    last_block = num_dense_blocks - 1

    # start model definition
    # densenet CNNs (composite function) are made of BN-ReLU-Conv2D
    inputs = Input(shape=input_shape)
    stem = BatchNormalization()(inputs)
    stem = Activation('relu')(stem)
    stem = Conv2D(num_filters, kernel_size=3, **conv_args)(stem)
    x = concatenate([inputs, stem])

    # stack of dense blocks bridged by transition layers
    for block in range(num_dense_blocks):
        # a dense block is a stack of bottleneck layers
        for _ in range(num_bottleneck_layers):
            # 1x1 bottleneck convolution producing 4 * growth_rate maps
            branch = BatchNormalization()(x)
            branch = Activation('relu')(branch)
            branch = Conv2D(
                4 * growth_rate,
                kernel_size=1,
                **conv_args
            )(branch)
            if use_dropout:
                branch = Dropout(0.2)(branch)

            # 3x3 convolution producing growth_rate new feature maps
            branch = BatchNormalization()(branch)
            branch = Activation('relu')(branch)
            branch = Conv2D(
                growth_rate,
                kernel_size=3,
                **conv_args
            )(branch)
            if use_dropout:
                branch = Dropout(0.2)(branch)

            # dense connectivity: append the new maps to the running tensor
            x = concatenate([x, branch])

        # a transition layer follows every dense block except the last one
        if block < last_block:
            # compress the number of feature maps, then halve the spatial
            # size with the default 2x2 average pooling
            num_filters += num_bottleneck_layers * growth_rate
            num_filters = int(num_filters * compression_factor)
            bridge = BatchNormalization()(x)
            bridge = Conv2D(num_filters, kernel_size=1, **conv_args)(bridge)
            if use_dropout:
                bridge = Dropout(0.2)(bridge)
            x = AveragePooling2D()(bridge)

    # add classifier on top
    # for 32x32 inputs and 3 dense blocks the feature map is 8x8 here, so
    # the pooling below reduces it to 1 x 1
    pooled = AveragePooling2D(pool_size=8)(x)
    flat = Flatten()(pooled)
    outputs = Dense(
        num_classes,
        kernel_initializer='he_normal',
        activation='softmax'
    )(flat)

    # instantiate and compile model
    # orig paper uses SGD but RMSprop works better for DenseNet
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=RMSprop(1e-3),
        metrics=['acc']
    )
    model.summary()

    return model


########
# main #
########

if __name__ == "__main__":
    # Script entry point: load CIFAR10, build the DenseNet, train it (with
    # or without real-time data augmentation) and report the test score.
    # Everything stays at module level on purpose: denseNet() reads
    # growth_rate, num_bottleneck_layers and compression_factor as globals.

    # load the CIFAR10 data
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()

    # input image dimensions (everything after the sample axis)
    input_shape = x_train.shape[1:]

    # normalize data: scale pixel values by 1/255
    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')
    print('y_train shape:', y_train.shape)

    # convert class vectors to binary class matrices.
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)

    # DenseNet-BC with dataset augmentation
    # Growth rate   | Depth |  Accuracy (paper)| Accuracy (this)      |
    # -----------------------------------------------------------------
    # 12            | 100   |  95.49%          | 93.74%               |
    # 24            | 250   |  96.38%          | requires big mem GPU |
    # 40            | 190   |  96.54%          | requires big mem GPU |
    growth_rate = 12
    depth = 100
    # bottleneck layers per dense block; each one contains two convolutions
    num_bottleneck_layers = (depth - 4) // (2 * num_dense_blocks)

    num_filters_bef_dense_block = 2 * growth_rate
    compression_factor = 0.5

    # create a densenet model
    model = denseNet(input_shape, num_filters_bef_dense_block)

    # the architecture diagram is optional: it needs pydot (pip install
    # pydot) and graphviz, so a missing dependency must not abort training
    try:
        plot_model(model, to_file="cifar10-densenet.png", show_shapes=True)
    except ImportError as err:
        print('Skipping model plot:', err)

    # prepare model saving directory
    save_dir = os.path.join(os.getcwd(), 'saved_models')
    model_name = 'cifar10_densenet_model.{epoch:02d}.h5'
    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, model_name)

    # prepare callbacks for model saving and for learning rate reducer
    # 'val_acc' matches the 'acc' metric name the model is compiled with
    checkpoint = ModelCheckpoint(
        filepath=filepath,
        monitor='val_acc',
        verbose=1,
        save_best_only=True
    )

    lr_scheduler = LearningRateScheduler(lr_schedule)

    lr_reducer = ReduceLROnPlateau(
        factor=np.sqrt(0.1),
        cooldown=0,
        patience=5,
        min_lr=0.5e-6
    )

    # NOTE(review): LearningRateScheduler assigns the rate returned by
    # lr_schedule at the start of every epoch, so a reduction made by
    # ReduceLROnPlateau is presumably overwritten on the next epoch -
    # confirm that combining both callbacks is intended.
    callbacks = [checkpoint, lr_reducer, lr_scheduler]

    # run training, with or without data augmentation
    if not data_augmentation:
        print('Not using data augmentation.')
        model.fit(
            x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(x_test, y_test),
            shuffle=True,
            callbacks=callbacks
        )
    else:
        print('Using real-time data augmentation.')
        # preprocessing and realtime data augmentation
        datagen = ImageDataGenerator(
            # set input mean to 0 over the dataset
            featurewise_center=False,
            # set each sample mean to 0
            samplewise_center=False,
            # divide inputs by std of dataset
            featurewise_std_normalization=False,
            # divide each input by its std
            samplewise_std_normalization=False,
            # apply ZCA whitening
            zca_whitening=False,
            # degree range for random rotations (0: no rotation)
            rotation_range=0,
            # randomly shift images horizontally
            width_shift_range=0.1,
            # randomly shift images vertically
            height_shift_range=0.1,
            # randomly flip images horizontally
            horizontal_flip=True,
            # randomly flip images vertically
            vertical_flip=False
        )

        # compute quantities required for featurewise normalization
        # (std, mean, and principal components if ZCA whitening is applied);
        # all of those options are disabled above, the call is kept so the
        # script keeps working if one of them is switched on
        datagen.fit(x_train)

        # round up so the final, smaller batch is part of every epoch
        steps_per_epoch = math.ceil(len(x_train) / batch_size)
        # fit the model on the batches generated by datagen.flow().
        model.fit(
            x=datagen.flow(x_train, y_train, batch_size=batch_size),
            verbose=1,
            epochs=epochs,
            validation_data=(x_test, y_test),
            steps_per_epoch=steps_per_epoch,
            callbacks=callbacks
        )

    # score trained model: scores[0] is the loss, scores[1] the 'acc' metric
    scores = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', scores[0])
    print('Test accuracy:', scores[1])