Usually, the weights of a neural network are initialized randomly so that different nodes receive different gradients and learn different features. In theory, if all weights are initialized to the same value, every node in a layer computes the same output, receives the same gradient update, and therefore keeps identical weights no matter how long you train. Training should thus not work at all.
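
To make the symmetry argument concrete, here is a minimal NumPy sketch (an illustration, separate from the script below) of a one-hidden-layer network whose two hidden units start with identical weights. Their activations and their gradients come out identical, so a gradient step preserves the symmetry:

import numpy as np

# Two hidden units with identical initial weights.
rng = np.random.default_rng(0)
x = rng.normal(size=3)                 # one input sample
t = 1.0                                # regression target
W1 = np.full((2, 3), 0.5)              # both rows identical
w2 = np.full(2, 0.5)

h = np.tanh(W1 @ x)                    # hidden activations: equal entries
y = w2 @ h
err = y - t                            # dL/dy for L = 0.5 * (y - t)**2

grad_w2 = err * h                      # equal entries
grad_W1 = np.outer(err * w2 * (1 - h**2), x)  # identical rows

print(grad_W1[0] == grad_W1[1])        # [ True  True  True ]

Since both rows of grad_W1 are equal, identical updates keep the two units exact copies of each other forever.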

However, the code below gives 56% accuracy on MNIST after 7000 epochs. Why is that the case?

Code

#!/usr/bin/env python
"""MNIST with TensorFlow."""
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
import os
import numpy as np

epochs = 20000
model_checkpoint_path = 'checkpoints/mnist_tf_model.ckpt'


def weight_variable(shape):
    # All weights start at exactly 0.0 -- this is the initialization in question.
    # initial = tf.truncated_normal(shape, stddev=0.01)
    initial = tf.constant(0.0, shape=shape)
    return tf.get_variable(initializer=initial, name='weights')


def bias_variable(shape):
    # Biases start at 0.1, not at zero.
    initial = tf.constant(0.1, shape=shape)
    return tf.get_variable(initializer=initial, name='biases')


def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')


def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1], padding='SAME')


def eval_network(sess, summary_writer, dataset, correct_prediction, epoch):
    """Accuracy over a dataset, evaluated in chunks of 1000 samples."""
    correct_sum = 0
    total_test = 0
    training_summary = tf.get_default_graph().get_tensor_by_name("training_accuracy:0")
    loss_summary = tf.get_default_graph().get_tensor_by_name("loss:0")
    for i in range(dataset.labels.shape[0] // 1000):
        feed_dict = {x: dataset.images[i * 1000:(i + 1) * 1000],
                     y_: dataset.labels[i * 1000:(i + 1) * 1000]}
        test_correct, train_summ, loss_summ = sess.run(
            [correct_prediction, training_summary, loss_summary],
            feed_dict=feed_dict)
        summary_writer.add_summary(train_summ, epoch)
        summary_writer.add_summary(loss_summ, epoch)
        correct_sum += sum(test_correct)
        total_test += len(test_correct)
    return float(correct_sum) / total_test


def log_score(sess, summary_writer, filename, mnist, scoring, epoch):
    """Append train and test accuracy to a CSV file."""
    with open(filename, "a") as myfile:
        train = eval_network(sess, summary_writer, mnist.train, scoring, epoch)
        test = eval_network(sess, summary_writer, mnist.test, scoring, epoch)
        myfile.write("%i;%0.6f;%0.6f\n" % (epoch, train, test))


mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

with tf.Session() as sess:
    x = tf.placeholder(tf.float32, shape=[None, 784])
    y_ = tf.placeholder(tf.float32, shape=[None, 10])
    x_image = tf.reshape(x, [-1, 28, 28, 1])

    with tf.variable_scope('conv1'):
        W_conv1 = weight_variable([5, 5, 1, 32])
        b_conv1 = bias_variable([32])
        h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1, name='ReLU1')
        h_pool1 = max_pool_2x2(h_conv1)

    with tf.variable_scope('conv2'):
        W_conv2 = weight_variable([5, 5, 32, 64])
        b_conv2 = bias_variable([64])
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2, name='ReLU2')
        h_pool2 = max_pool_2x2(h_conv2)

    with tf.variable_scope('fc1'):
        W_fc1 = weight_variable([7 * 7 * 64, 1024])
        b_fc1 = bias_variable([1024])
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

    with tf.variable_scope('softmax'):
        W_fc2 = weight_variable([1024, 10])
        b_fc2 = bias_variable([10])
        y_conv = tf.nn.softmax(tf.matmul(h_fc1, W_fc2) + b_fc2)

    # The 10**-7 epsilon keeps tf.log away from log(0).
    cross_entropy = tf.reduce_mean(
        -tf.reduce_sum(y_ * tf.log(y_conv + 10**-7), reduction_indices=[1]))
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.scalar_summary("training_accuracy", accuracy, name="training_accuracy")
    tf.scalar_summary("loss", cross_entropy, name="loss")
    summary_writer = tf.train.SummaryWriter('summary_dir', sess.graph)

    sess.run(tf.initialize_all_variables())
    for i in range(epochs):
        batch = mnist.train.next_batch(50)
        if i % 100 == 0:
            log_score(sess, summary_writer, 'validation-curve-accuracy.csv',
                      mnist, correct_prediction, i)
        train_step.run(feed_dict={x: batch[0], y_: batch[1]})

    log_score(sess, summary_writer, 'validation-curve-accuracy.csv',
              mnist, correct_prediction, epochs)

Plots

Nr 1

After adding 10**-7 inside the tf.log(...) call, the NaNs are gone.
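
In isolation, the fix looks like this. It is a sketch that reuses the graph variables from the script above (h_fc1, W_fc2, b_fc2, y_); the fused softmax-plus-log op from the same TensorFlow generation would be the more robust choice, since it never takes the log of a stored probability:

logits = tf.matmul(h_fc1, W_fc2) + b_fc2  # pre-softmax scores

# Hand-rolled cross-entropy: the epsilon avoids log(0) = -inf -> NaN gradients.
y_conv = tf.nn.softmax(logits)
cross_entropy = tf.reduce_mean(
    -tf.reduce_sum(y_ * tf.log(y_conv + 10**-7), reduction_indices=[1]))

# Numerically stable alternative: softmax and log combined in a single op.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_))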

Nr 2

This is an old plot, from a run that hit log(0) after 16k epochs.

The loss is plotted here; the triangles mark NaNs.

Here is the accuracy; due to the smoothing, it does not immediately fall to ~10%.
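
Assuming the curve uses TensorBoard-style exponential smoothing (a guess; the smoothing method is not stated anywhere above), a sudden crash to ~10% accuracy only shows up in the plot gradually:

def smooth(values, weight=0.6):
    """Exponential moving average, as TensorBoard applies to scalar plots."""
    smoothed, last = [], values[0]
    for v in values:
        last = weight * last + (1 - weight) * v
        smoothed.append(last)
    return smoothed

# A crash from 95% to 10% accuracy decays over several points:
print(smooth([0.95, 0.95, 0.1, 0.1, 0.1]))
# -> [0.95, 0.95, 0.61, 0.406, 0.284] (approximately)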