Code

Imports

Load NumPy for linear algebra, Keras for the MNIST dataset, and time for benchmarking.


# imports
import numpy as np
import keras.datasets.mnist as mnist
import time

Math Class

Static helper methods for activation functions and their derivatives used during forward and backward passes.


# Math class

class Math:
    """Static numerical helpers: activations, loss functions, and one-hot
    encoding.

    All functions expect NumPy arrays laid out as (features/classes, samples),
    i.e. one column per sample.
    """

    # -----------------------------
    # Activations
    # -----------------------------
    @staticmethod
    def sigmoid(Z):
        """Logistic sigmoid, computed via tanh for numerical stability.

        sigmoid(z) == 0.5 * (1 + tanh(z / 2)) identically; unlike the naive
        1 / (1 + exp(-z)) form, tanh never overflows for large |z|.
        """
        return 0.5 * (1.0 + np.tanh(0.5 * Z))

    @staticmethod
    def deriv_sigmoid(Z):
        """Sigmoid derivative: s * (1 - s) with s = sigmoid(Z)."""
        s = Math.sigmoid(Z)
        return s * (1 - s)

    @staticmethod
    def reLU(Z):
        """Rectified linear unit: elementwise max(0, z)."""
        return np.maximum(0, Z)

    @staticmethod
    def deriv_reLU(Z):
        """ReLU derivative: 1.0 where z > 0, else 0.0 (0 chosen at z == 0)."""
        return (Z > 0).astype(float)

    @staticmethod
    def tanh(Z):
        """Hyperbolic tangent activation."""
        return np.tanh(Z)

    @staticmethod
    def deriv_tanh(Z):
        """tanh derivative: 1 - tanh(z)^2 (tanh computed once)."""
        t = np.tanh(Z)
        return 1 - t * t

    @staticmethod
    def softmax(Z):
        """Column-wise softmax; the per-column max is subtracted before
        exponentiation so large logits cannot overflow."""
        expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
        return expZ / np.sum(expZ, axis=0, keepdims=True)

    @staticmethod
    def linear(Z):
        """Identity activation (for regression-style outputs)."""
        return Z


    # -----------------------------
    # Loss functions
    # -----------------------------
    @staticmethod
    def cross_entropy(Y_pred, Y_true):
        """Mean cross-entropy.

        Y_pred -- (num_classes, m) predicted probabilities
        Y_true -- (m,) integer class labels
        A small epsilon inside the log guards against log(0).
        """
        m = Y_true.size
        Y_one_hot = Math.one_hot(Y_true, Y_pred.shape[0])
        return -np.sum(Y_one_hot * np.log(Y_pred + 1e-15)) / m

    @staticmethod
    def cross_entropy_grad(Y_pred, Y_true):
        """Gradient of mean cross-entropy w.r.t. softmax outputs,
        already divided by the batch size m."""
        m = Y_true.size
        Y_one_hot = Math.one_hot(Y_true, Y_pred.shape[0])
        return (Y_pred - Y_one_hot) / m

    @staticmethod
    def mse(Y_pred, Y_true):
        """Mean squared error against the one-hot encoding of Y_true."""
        m = Y_true.size
        Y_one_hot = Math.one_hot(Y_true, Y_pred.shape[0])
        return np.sum((Y_pred - Y_one_hot)**2) / m

    @staticmethod
    def mse_grad(Y_pred, Y_true):
        """Gradient of mse w.r.t. predictions, divided by the batch size m."""
        m = Y_true.size
        Y_one_hot = Math.one_hot(Y_true, Y_pred.shape[0])
        return 2 * (Y_pred - Y_one_hot) / m


    # -----------------------------
    # One-hot encoding
    # -----------------------------
    @staticmethod
    def one_hot(Y, num_classes):
        """Encode integer labels Y of shape (m,) as a (num_classes, m)
        matrix with a single 1 per column."""
        one_hot_Y = np.zeros((num_classes, Y.size))
        one_hot_Y[Y, np.arange(Y.size)] = 1
        return one_hot_Y

Classification NN

The neural network class handling layer initialization, forward propagation, backpropagation, and parameter updates.


# Classification NN class

class ClassificationNN():
    """Fully-connected feed-forward classifier trained with backpropagation.

    Architecture is given by `layers`, e.g. [784, 128, 64, 10]. Data is
    column-major: inputs X have shape (input_dim, m) for m samples, and
    labels Y are integer class indices of shape (m,).
    """

    # add more allowed activations and losses here if needed
    _hidden_activations = ['reLU', 'sigmoid', 'tanh']
    _output_activations = ['softmax', 'sigmoid', 'linear']
    _losses = ['cross_entropy', 'mse']

    def __init__(self, layers, hidden_activation='reLU', output_activation='softmax', loss='cross_entropy'):
        """Build the network and initialize weights and biases.

        layers            -- neuron counts per layer, input first, output last
        hidden_activation -- one of _hidden_activations
        output_activation -- one of _output_activations
        loss              -- one of _losses

        Raises ValueError for any unrecognized activation/loss name.
        """
        # Validate the configuration first so bad names fail fast,
        # before any parameter allocation.
        if hidden_activation not in self._hidden_activations:
            raise ValueError(f"Hidden activation must be one of {self._hidden_activations}")
        self.hidden_activation = hidden_activation

        if output_activation not in self._output_activations:
            raise ValueError(f"Output activation must be one of {self._output_activations}")
        self.output_activation = output_activation

        if loss not in self._losses:
            raise ValueError(f"Loss must be one of {self._losses}")
        self.loss_name = loss

        self.layers = layers # List defining the number of layers and number of neurons in each layer
        self.weights = [] # List to hold weights for each layer
        self.biases = [] # List to hold biases for each layer
        self.Zs = None # Linear combination per layer, cached by forward()
        self.As = None # Activation per layer, cached by forward()
        self.num_classes = layers[-1] # Number of output classes, used for one-hot encoding

        # Initialize weights and biases
        for i in range(len(layers) - 1):
            n_in = layers[i]
            n_out = layers[i + 1]

            # He initialization (good for ReLU; a reasonable default for the
            # other supported hidden activations as well)
            W = np.random.randn(n_out, n_in) * np.sqrt(2 / n_in)
            b = np.zeros((n_out, 1))
            self.weights.append(W)
            self.biases.append(b)


    # Activation functions
    def activation(self, Z, layer_index):
        """Return the activation of Z for the given layer: the configured
        output activation for the last layer, hidden activation otherwise."""
        if layer_index == len(self.weights) - 1:
            return getattr(Math, self.output_activation)(Z)
        else:
            return getattr(Math, self.hidden_activation)(Z)

    def activation_derivative(self, Z, layer_index):
        """Return the activation derivative used during backpropagation.

        The output layer returns ones because its gradient is folded into
        the loss gradient inside backward().
        """
        if layer_index == len(self.weights) - 1:
            return np.ones_like(Z)  # Output handled in loss gradient
        else:
            return getattr(Math, f'deriv_{self.hidden_activation}')(Z).astype(float)


    # Loss functions
    def compute_loss(self, Y_pred, Y_true):
        """Compute the scalar loss for predictions Y_pred vs labels Y_true."""
        return getattr(Math, self.loss_name)(Y_pred, Y_true)

    def compute_loss_grad(self, Y_pred, Y_true):
        """Compute the loss gradient w.r.t. the network output
        (already divided by the batch size m)."""
        return getattr(Math, f'{self.loss_name}_grad')(Y_pred, Y_true)


    def forward(self, X):
        """Forward pass over a batch X of shape (input_dim, m).

        Caches per-layer pre-activations (self.Zs) and activations (self.As)
        for use by backward(); returns the output layer activation.
        """
        self.Zs = []
        self.As = [X]
        A = X

        for i in range(len(self.weights)):
            W, b = self.weights[i], self.biases[i]
            Z = np.dot(W, A) + b
            self.Zs.append(Z)

            # Hidden layers use chosen hidden activation
            if i < len(self.weights) - 1:
                A = getattr(Math, self.hidden_activation)(Z)
            else:
                # Output layer uses chosen output activation
                A = getattr(Math, self.output_activation)(Z)
            self.As.append(A)

        return A


    def backward(self, Y):
        """Backpropagate gradients for integer labels Y of shape (m,).

        Requires forward() to have been called on the matching batch.
        Stores per-layer gradients in self.dWs / self.dbs (each already
        averaged over the batch).
        """
        m = Y.size
        Y_pred = self.As[-1]
        one_hot_Y = Math.one_hot(Y, self.num_classes)

        # Gradient of the loss w.r.t. the output layer's pre-activation.
        if self.output_activation == 'softmax' and self.loss_name == 'cross_entropy':
            # Canonical simplification: d(CE)/dZ = softmax(Z) - one_hot(Y).
            dZ = Y_pred - one_hot_Y
        else:
            # General case: chain the configured loss gradient through the
            # output activation. compute_loss_grad already divides by m, so
            # multiply it back out here; the /m is applied once in the
            # dW/db averages below.
            dA = self.compute_loss_grad(Y_pred, Y) * m
            if self.output_activation == 'sigmoid':
                dZ = dA * Math.deriv_sigmoid(self.Zs[-1])
            elif self.output_activation == 'softmax':
                # NOTE(review): exact softmax backprop needs the full
                # Jacobian; only the diagonal term is used here. Pair softmax
                # with cross_entropy for exact gradients.
                dZ = dA * Y_pred * (1 - Y_pred)
            else:  # 'linear' has derivative 1
                dZ = dA

        self.dWs = [None] * len(self.weights)
        self.dbs = [None] * len(self.biases)

        # Backpropagate through all layers
        for i in reversed(range(len(self.weights))):
            A_prev = self.As[i]
            self.dWs[i] = np.dot(dZ, A_prev.T) / m
            self.dbs[i] = np.sum(dZ, axis=1, keepdims=True) / m

            if i > 0:
                W = self.weights[i]
                Z_prev = self.Zs[i-1]
                dZ = np.dot(W.T, dZ) * getattr(Math, f'deriv_{self.hidden_activation}')(Z_prev).astype(float)


    def update_params(self, dWs, dbs, learning_rate):
        """Vanilla gradient-descent step: subtract learning_rate * gradient
        from every weight matrix and bias vector in place."""
        for i in range(len(self.weights)):
            self.weights[i] -= learning_rate * dWs[i]
            self.biases[i] -= learning_rate * dbs[i]


    def predict(self, X):
        """Return the most likely class index per sample (shape (m,))."""
        Y_pred = self.forward(X)
        return np.argmax(Y_pred, axis=0)

    def accuracy(self, Y_pred, Y_true):
        """Fraction of predictions that match the true labels."""
        return np.mean(Y_pred == Y_true)

Train Model

Defines the training loop: each epoch, the data is reshuffled and the network is updated with mini-batch stochastic gradient descent, with accuracy reported after every pass.


# Implement custom train method

def train(model, X, Y, epochs, learning_rate, batch_size):
    """Train `model` with mini-batch SGD.

    X is (features, m) with one sample per column; Y is (m,) integer labels.
    Accuracy over the full training set is printed after each epoch.
    """
    n_samples = X.shape[1]

    for epoch in range(epochs):
        # Reshuffle columns every epoch so batch composition varies.
        order = np.random.permutation(n_samples)
        X_shuf = X[:, order]
        Y_shuf = Y[order]

        # One forward/backward/update cycle per mini-batch.
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            model.forward(X_shuf[:, start:stop])
            model.backward(Y_shuf[start:stop])
            model.update_params(model.dWs, model.dbs, learning_rate)

        # Evaluate accuracy after each epoch
        acc = model.accuracy(model.predict(X), Y)
        print(f"Epoch {epoch+1}/{epochs} - Accuracy: {acc:.4f}")

Run Model

Loads and preprocesses the MNIST data, builds and trains the network, then evaluates it on the held-out test set, reporting the test accuracy and the total training time.


# init data and use NN

# load MNIST dataset
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten each 28x28 image into a column vector and scale pixels to [0, 1].
n_train = X_train.shape[0]
X_train_flat = X_train.reshape(n_train, -1).T / 255.0
Y_train = y_train.astype(int)

# The test split stays separate for an unbiased final evaluation.
n_test = X_test.shape[0]
X_test_flat = X_test.reshape(n_test, -1).T / 255.0
Y_test = y_test.astype(int)

model = ClassificationNN(
    layers=[784, 128, 64, 10],
    hidden_activation='reLU',
    output_activation='softmax',
    loss='cross_entropy'
)

# Train, timing the full run, then evaluate on the unseen test set.
start_time = time.time()
train(model, X_train_flat, Y_train, epochs=10, learning_rate=0.01, batch_size=64)
train_time_elapsed = time.time() - start_time

Y_pred_test = model.predict(X_test_flat)
accuracy = model.accuracy(Y_pred_test, Y_test)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Time Elapsed: {train_time_elapsed:.4f}")

Note: The code shown above is the original Python implementation, kept intact to maintain clarity and explainability. The version running behind the API has been slightly modified to work as a web service.