I present an example in which a tf.keras model fails to learn from very simple data. I'm using tensorflow-gpu==2.0.0, keras==2.3.0 and Python 3.7. At the end of my post, I give the Python code to reproduce the problem I observed.
The samples are Numpy arrays of shape (6, 16, 16, 16, 3). To make things very simple, I only consider arrays full of 1s and 0s. Arrays with 1s are given the label 1 and arrays with 0s are given the label 0. I can generate some samples (in the following, n_samples = 240) with this code:
def generate_fake_data(n_samples=240):
    """Yield ``(sample, one_hot_label)`` pairs of trivially separable data.

    The first half of the samples are arrays full of 1s with the one-hot
    label ``[0., 1.]``; the remaining samples are arrays full of 0s with the
    one-hot label ``[1., 0.]``.

    Args:
        n_samples: Total number of samples to yield (default 240).

    Yields:
        Tuples ``(x, y)`` where ``x`` has shape ``(6, 16, 16, 16, 3)`` and
        ``y`` has shape ``(2,)``.
    """
    # Split exactly in half. The previous `j < 120` test over range(1, 241)
    # produced 119 arrays of ones and 121 arrays of zeros.
    n_ones = n_samples // 2
    for j in range(n_samples):
        if j < n_ones:
            yield np.ones((6, 16, 16, 16, 3)), np.array([0., 1.])
        else:
            yield np.zeros((6, 16, 16, 16, 3)), np.array([1., 0.])
In order to input this data in a tf.keras model, I create an instance of tf.data.Dataset using the code below. This will essentially create shuffled batches of BATCH_SIZE = 12 samples.
def make_tfdataset(for_training=True):
    """Build an endlessly repeating, batched ``tf.data.Dataset`` of fake data.

    Args:
        for_training: When true, samples are shuffled before batching.

    Returns:
        A dataset yielding ``(features, labels)`` float32 batches of
        ``BATCH_SIZE`` samples, with per-sample feature shape
        ``(6, 16, 16, 16, 3)`` and per-sample label shape ``(2,)``.
    """
    # The generator function is passed directly; wrapping it in a lambda that
    # only forwards the call adds nothing.
    dataset = tf.data.Dataset.from_generator(
        generator=generate_fake_data,
        output_types=(tf.float32, tf.float32),
        output_shapes=(tf.TensorShape([6, 16, 16, 16, 3]),
                       tf.TensorShape([2])))
    if for_training:
        # Shuffle *before* repeat. With repeat() first, the shuffle buffer
        # blends samples from different passes over the generator, so one
        # epoch of steps_per_epoch batches could see some samples twice and
        # others not at all.
        dataset = dataset.shuffle(buffer_size=1000)
    dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
I propose the following model to classify my samples:
def create_model(in_shape=(6, 16, 16, 16, 3)):
    """Assemble the Conv3D -> pooling -> Conv1D -> softmax classifier.

    Args:
        in_shape: Per-sample input shape. The leading entry is folded into
            the batch axis before the 3-D convolution and unfolded again
            after pooling.

    Returns:
        An uncompiled ``Model`` mapping inputs of shape ``in_shape`` to a
        2-way softmax output.
    """
    n_filters = 64
    volume_shape = in_shape[1:]
    flat_width = in_shape[0] * n_filters

    inputs = Input(shape=in_shape)
    # Merge the leading sample axis into the batch axis: (None, 6, 16, 16, 16, 3)
    # becomes (None, 16, 16, 16, 3) for the default shape.
    x = Lambda(lambda t: K.reshape(t, (-1, *volume_shape)))(inputs)
    x = Conv3D(filters=n_filters, kernel_size=8, strides=(2, 2, 2), padding='same')(x)
    x = ReLU()(x)
    x = GlobalAveragePooling3D()(x)
    # Regroup the pooled features so each row has in_shape[0] * 64 entries.
    x = Lambda(lambda t: K.reshape(t, (-1, flat_width)))(x)
    # Conv1D needs a length axis; add one of size 1 and drop it afterwards.
    x = Lambda(lambda t: K.expand_dims(t, 1))(x)
    x = Conv1D(filters=1, kernel_size=1)(x)
    x = ReLU()(x)
    x = Lambda(lambda t: K.squeeze(t, 1))(x)
    outputs = Dense(units=2, activation='softmax')(x)
    return Model(inputs=[inputs], outputs=[outputs])
The model is optimized using Adam (with default parameters) and with the categorical_crossentropy loss:
# Build the network with its default input shape, then attach the optimizer,
# the loss and the metrics reported during training.
clf_model = create_model()
compile_options = dict(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy', 'categorical_crossentropy'],
)
clf_model.compile(**compile_options)
The output of clf_model.summary() is:
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_1 (InputLayer)         [(None, 6, 16, 16, 16, 3) 0         
_________________________________________________________________
lambda (Lambda)              (None, 16, 16, 16, 3)     0         
_________________________________________________________________
conv3d (Conv3D)              (None, 8, 8, 8, 64)       98368     
_________________________________________________________________
re_lu (ReLU)                 (None, 8, 8, 8, 64)       0         
_________________________________________________________________
global_average_pooling3d (Gl (None, 64)                0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 384)               0         
_________________________________________________________________
lambda_2 (Lambda)            (None, 1, 384)            0         
_________________________________________________________________
conv1d (Conv1D)              (None, 1, 1)              385       
_________________________________________________________________
re_lu_1 (ReLU)               (None, 1, 1)              0         
_________________________________________________________________
lambda_3 (Lambda)            (None, 1)                 0         
_________________________________________________________________
dense (Dense)                (None, 2)                 4         
=================================================================
Total params: 98,757
Trainable params: 98,757
Non-trainable params: 0
The model is trained for 500 epochs as follows:
# The dataset repeats forever, so the number of batches that make up one
# epoch is given explicitly: enough batches to cover 240 samples.
train_ds = make_tfdataset(for_training=True)
batches_per_epoch = ceil(240 / BATCH_SIZE)
history = clf_model.fit(
    train_ds,
    epochs=500,
    steps_per_epoch=batches_per_epoch,
    verbose=1,
)
During the 500 epochs, the model loss stays around 0.69 and never goes below 0.69. This is also true if I set the learning rate to 1e-2 instead of 1e-3. The data is very simple (just 0s and 1s). Naively, I would expect the model to have a better accuracy than just 0.6. In fact, I would expect it to reach 100% accuracy quickly. What am I doing wrong?
The full code to reproduce the problem is:
import numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from math import ceil
from tensorflow.keras.layers import Input, Dense, Lambda, Conv1D, GlobalAveragePooling3D, Conv3D, ReLU
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
# Number of samples per batch; used when batching the dataset and when
# computing steps_per_epoch for training.
BATCH_SIZE = 12
def generate_fake_data(n_samples=240):
    """Yield ``(sample, one_hot_label)`` pairs of trivially separable data.

    The first half of the samples are arrays full of 1s with the one-hot
    label ``[0., 1.]``; the remaining samples are arrays full of 0s with the
    one-hot label ``[1., 0.]``.

    Args:
        n_samples: Total number of samples to yield (default 240).

    Yields:
        Tuples ``(x, y)`` where ``x`` has shape ``(6, 16, 16, 16, 3)`` and
        ``y`` has shape ``(2,)``.
    """
    # Split exactly in half. The previous `j < 120` test over range(1, 241)
    # produced 119 arrays of ones and 121 arrays of zeros.
    n_ones = n_samples // 2
    for j in range(n_samples):
        if j < n_ones:
            yield np.ones((6, 16, 16, 16, 3)), np.array([0., 1.])
        else:
            yield np.zeros((6, 16, 16, 16, 3)), np.array([1., 0.])
def make_tfdataset(for_training=True):
    """Build an endlessly repeating, batched ``tf.data.Dataset`` of fake data.

    Args:
        for_training: When true, samples are shuffled before batching.

    Returns:
        A dataset yielding ``(features, labels)`` float32 batches of
        ``BATCH_SIZE`` samples, with per-sample feature shape
        ``(6, 16, 16, 16, 3)`` and per-sample label shape ``(2,)``.
    """
    # The generator function is passed directly; wrapping it in a lambda that
    # only forwards the call adds nothing.
    dataset = tf.data.Dataset.from_generator(
        generator=generate_fake_data,
        output_types=(tf.float32, tf.float32),
        output_shapes=(tf.TensorShape([6, 16, 16, 16, 3]),
                       tf.TensorShape([2])))
    if for_training:
        # Shuffle *before* repeat. With repeat() first, the shuffle buffer
        # blends samples from different passes over the generator, so one
        # epoch of steps_per_epoch batches could see some samples twice and
        # others not at all.
        dataset = dataset.shuffle(buffer_size=1000)
    dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
def create_model(in_shape=(6, 16, 16, 16, 3)):
    """Assemble the Conv3D -> pooling -> Conv1D -> softmax classifier.

    Args:
        in_shape: Per-sample input shape. The leading entry is folded into
            the batch axis before the 3-D convolution and unfolded again
            after pooling.

    Returns:
        An uncompiled ``Model`` mapping inputs of shape ``in_shape`` to a
        2-way softmax output.
    """
    n_filters = 64
    volume_shape = in_shape[1:]
    flat_width = in_shape[0] * n_filters

    inputs = Input(shape=in_shape)
    # Merge the leading sample axis into the batch axis so Conv3D receives
    # tensors shaped (-1, *in_shape[1:]).
    x = Lambda(lambda t: K.reshape(t, (-1, *volume_shape)))(inputs)
    x = Conv3D(filters=n_filters, kernel_size=8, strides=(2, 2, 2), padding='same')(x)
    x = ReLU()(x)
    x = GlobalAveragePooling3D()(x)
    # Regroup the pooled features so each row has in_shape[0] * 64 entries.
    x = Lambda(lambda t: K.reshape(t, (-1, flat_width)))(x)
    # Conv1D needs a length axis; add one of size 1 and drop it afterwards.
    x = Lambda(lambda t: K.expand_dims(t, 1))(x)
    x = Conv1D(filters=1, kernel_size=1)(x)
    x = ReLU()(x)
    x = Lambda(lambda t: K.squeeze(t, 1))(x)
    outputs = Dense(units=2, activation='softmax')(x)
    return Model(inputs=[inputs], outputs=[outputs])
# Build the shuffled training dataset and the classifier, then train.
train_ds = make_tfdataset(for_training=True)
clf_model = create_model(in_shape=(6, 16, 16, 16, 3))
clf_model.summary()
# `learning_rate` is the supported argument name; `lr` is only accepted as a
# deprecated backward-compatibility alias.
clf_model.compile(optimizer=Adam(learning_rate=1e-3),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', 'categorical_crossentropy'])
# The dataset repeats forever, so steps_per_epoch defines the length of an
# epoch: enough batches to cover 240 samples.
history = clf_model.fit(train_ds,
                        epochs=500,
                        steps_per_epoch=ceil(240 / BATCH_SIZE),
                        verbose=1)
Your code has a single critical problem: dimensionality shuffling. The one dimension you should never touch is the batch dimension, as it, by definition, holds independent samples of your data. In your first reshape, you mix feature dimensions with the batch dimension:
Tensor("input_1:0", shape=(12, 6, 16, 16, 16, 3), dtype=float32)
Tensor("lambda/Reshape:0", shape=(72, 16, 16, 16, 3), dtype=float32)
This is like feeding 72 independent samples of shape (16,16,16,3). Further layers suffer similar problems.
Suggestions:
- Rather than reshaping manually (if you must reshape, use Keras' Reshape), shape your existing Conv and pooling layers to make everything work out directly.
- GlobalAveragePooling is intended to be the final layer, as it collapses feature dimensions - in your case, like so: (12,16,16,16,3) --> (12,3); a Conv afterwards serves little purpose.
- Accordingly, I replaced Conv1D with Conv3D.
- Prefer batch_shape= vs. shape=, as you can inspect layer dimensions in full (very helpful).
- batch_size here is 6, deducing from your comment reply.
- kernel_size=1 and (especially) filters=1 is a very weak convolution; I replaced it accordingly - you can revert if you wish.
- With only two classes, consider Dense(1, 'sigmoid') with binary_crossentropy loss.

As a last note: you can toss all of the above out except for the dimensionality shuffling advice, and still get perfect train set performance; it was the root of the problem.
def create_model(batch_size, input_shape):
    """Build a two-stage Conv3D classifier with a fixed batch dimension.

    Args:
        batch_size: Number of samples per batch, baked into the input layer.
        input_shape: Per-sample shape, appended after ``batch_size`` to form
            the input layer's ``batch_shape``.

    Returns:
        An uncompiled ``Model`` ending in a 2-unit softmax layer.
    """
    # Both convolutions share stride, activation and padding settings.
    conv_kwargs = dict(strides=(2, 2, 2), activation='relu', padding='same')

    inputs = Input(batch_shape=(batch_size,) + tuple(input_shape))
    features = Conv3D(filters=64, kernel_size=8, **conv_kwargs)(inputs)
    features = Conv3D(filters=8, kernel_size=4, **conv_kwargs)(features)
    pooled = GlobalAveragePooling3D()(features)
    probabilities = Dense(units=2, activation='softmax')(pooled)
    return Model(inputs=inputs, outputs=probabilities)
# Batch size, per-sample shape, and the two combined into a full batch shape.
BATCH_SIZE = 6
INPUT_SHAPE = (16, 16, 16, 3)
BATCH_SHAPE = (BATCH_SIZE,) + INPUT_SHAPE
def generate_fake_data(n_samples=240):
    """Yield ``(sample, one_hot_label)`` pairs of trivially separable data.

    The first half of the samples are arrays full of 1s with the one-hot
    label ``[0., 1.]``; the remaining samples are arrays full of 0s with the
    one-hot label ``[1., 0.]``. Every sample has shape ``INPUT_SHAPE``.

    Args:
        n_samples: Total number of samples to yield (default 240).

    Yields:
        Tuples ``(x, y)`` where ``x`` has shape ``INPUT_SHAPE`` and ``y`` has
        shape ``(2,)``.
    """
    # Split exactly in half. The previous `j < 120` test over range(1, 241)
    # produced 119 arrays of ones and 121 arrays of zeros.
    n_ones = n_samples // 2
    for j in range(n_samples):
        if j < n_ones:
            yield np.ones(INPUT_SHAPE), np.array([0., 1.])
        else:
            yield np.zeros(INPUT_SHAPE), np.array([1., 0.])
def make_tfdataset(for_training=True):
    """Build an endlessly repeating, batched ``tf.data.Dataset`` of fake data.

    Args:
        for_training: When true, samples are shuffled before batching.

    Returns:
        A dataset yielding ``(features, labels)`` float32 batches of
        ``BATCH_SIZE`` samples, with per-sample feature shape ``INPUT_SHAPE``
        and per-sample label shape ``(2,)``.
    """
    # The generator function is passed directly; wrapping it in a lambda that
    # only forwards the call adds nothing.
    dataset = tf.data.Dataset.from_generator(
        generator=generate_fake_data,
        output_types=(tf.float32, tf.float32),
        output_shapes=(tf.TensorShape(INPUT_SHAPE),
                       tf.TensorShape([2])))
    if for_training:
        # Shuffle *before* repeat. With repeat() first, the shuffle buffer
        # blends samples from different passes over the generator, so one
        # epoch of steps_per_epoch batches could see some samples twice and
        # others not at all.
        dataset = dataset.shuffle(buffer_size=1000)
    dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
RESULTS:
Epoch 28/500
40/40 [==============================] - 0s 3ms/step - loss: 0.0808 - acc: 1.0000
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With