Advanced

Neural Networks

Building a neural network from scratch is the ultimate ML interview challenge. It tests your understanding of forward propagation, backpropagation via chain rule, weight initialization, activation functions, and optimization. We implement a complete multi-layer perceptron (MLP) with SGD and Adam optimizers.

Activation Functions

Every activation function needs both a forward pass and its derivative for backpropagation.

import numpy as np

# ---- Activation Functions and Their Derivatives ----

def relu(z):
    """Rectified linear unit: element-wise max(0, z)."""
    return np.clip(z, 0, None)

def relu_derivative(z):
    """Derivative of ReLU: 1.0 where z > 0, else 0.0 (subgradient 0 at z == 0)."""
    return np.where(z > 0, 1.0, 0.0)

def sigmoid(z):
    """Logistic function 1 / (1 + e^-z); input clipped so exp() cannot overflow."""
    safe_z = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-safe_z))

def sigmoid_derivative(z):
    """Derivative of the logistic function: s(z) * (1 - s(z))."""
    # Compute the sigmoid inline (same clipped form as sigmoid()).
    s = 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))
    return s * (1.0 - s)

def tanh(z):
    """Hyperbolic tangent, element-wise (thin wrapper over NumPy)."""
    out = np.tanh(z)
    return out

def tanh_derivative(z):
    """Derivative of tanh: 1 - tanh(z)^2 (equivalently sech^2(z))."""
    return 1.0 - np.tanh(z) ** 2

def softmax(z):
    """Row-wise softmax over axis 1, shifted by the row max for numerical stability."""
    exp_shifted = np.exp(z - z.max(axis=1, keepdims=True))
    return exp_shifted / exp_shifted.sum(axis=1, keepdims=True)

# Registry: activation name -> (forward function, derivative function).
# softmax is deliberately absent; its gradient is handled via the
# cross-entropy shortcut rather than an element-wise derivative.
ACTIVATIONS = dict(
    relu=(relu, relu_derivative),
    sigmoid=(sigmoid, sigmoid_derivative),
    tanh=(tanh, tanh_derivative),
)

Weight Initialization

Proper weight initialization is critical. Bad initialization causes vanishing or exploding gradients.

def initialize_weights(n_in, n_out, method='he'):
    """Create an (n_in, n_out) weight matrix.

    Methods:
    - 'he': He initialization (best for ReLU), std = sqrt(2 / n_in)
    - 'xavier': Xavier/Glorot (best for sigmoid/tanh), std = sqrt(2 / (n_in + n_out))
    - 'zeros': all zeros (BAD - never use for hidden layers)
    - anything else: small Gaussian values with std = 0.01
    """
    if method == 'zeros':
        return np.zeros((n_in, n_out))

    # All remaining methods differ only in the standard deviation applied
    # to a standard-normal sample.
    if method == 'he':
        std = np.sqrt(2.0 / n_in)
    elif method == 'xavier':
        std = np.sqrt(2.0 / (n_in + n_out))
    else:
        std = 0.01
    return np.random.randn(n_in, n_out) * std

Full MLP Implementation

class NeuralNetwork:
    """Multi-Layer Perceptron from scratch.

    Fully-connected layers with configurable activations, trained by
    mini-batch backpropagation using either SGD or Adam updates. The
    output layer is expected to be softmax paired with cross-entropy
    loss (see _backward's gradient shortcut).

    Example:
        nn = NeuralNetwork([2, 64, 32, 3], activations=['relu', 'relu', 'softmax'])
        nn.fit(X_train, y_train, epochs=100, lr=0.01)
        predictions = nn.predict(X_test)
    """

    def __init__(self, layer_sizes, activations=None):
        """
        Args:
            layer_sizes: list of ints, e.g., [input_dim, 64, 32, output_dim]
            activations: list of activation names for each layer (except input)
        """
        self.layer_sizes = layer_sizes
        # Number of weight layers: one fewer than the number of size entries.
        self.n_layers = len(layer_sizes) - 1

        # Default activations: ReLU for hidden, softmax for output
        if activations is None:
            activations = ['relu'] * (self.n_layers - 1) + ['softmax']
        self.activation_names = activations

        # Initialize weights and biases
        self.weights = []
        self.biases = []
        for i in range(self.n_layers):
            # He init pairs with ReLU; Xavier for everything else
            # (sigmoid/tanh/softmax layers).
            init_method = 'he' if activations[i] == 'relu' else 'xavier'
            W = initialize_weights(layer_sizes[i], layer_sizes[i+1], init_method)
            # Biases start at zero; shape (1, n_out) so they broadcast over the batch.
            b = np.zeros((1, layer_sizes[i+1]))
            self.weights.append(W)
            self.biases.append(b)

        # Per-epoch average training loss, populated by fit().
        self.loss_history = []

    def _forward(self, X):
        """Forward pass through all layers.

        Stores intermediate values for backpropagation.

        Args:
            X: input batch of shape (n_samples, input_dim).

        Returns:
            Final-layer output, shape (n_samples, output_dim) — class
            probabilities when the last activation is softmax.
        """
        self.z_values = []   # Pre-activation values
        self.a_values = [X]  # Post-activation values (input is first)

        current = X
        for i in range(self.n_layers):
            # Affine transform: (batch, n_in) @ (n_in, n_out) + (1, n_out)
            z = current @ self.weights[i] + self.biases[i]
            self.z_values.append(z)

            # softmax is handled separately because it has no entry in the
            # ACTIVATIONS registry (no element-wise derivative is needed).
            if self.activation_names[i] == 'softmax':
                a = softmax(z)
            else:
                act_fn, _ = ACTIVATIONS[self.activation_names[i]]
                a = act_fn(z)

            self.a_values.append(a)
            current = a

        return current

    def _compute_loss(self, y_pred, y_true):
        """Cross-entropy loss for classification.

        Args:
            y_pred: predicted probabilities, shape (n_samples, n_classes).
            y_true: integer labels of shape (n_samples,) or an already
                one-hot matrix of shape (n_samples, n_classes).

        Returns:
            (loss, y_onehot): mean cross-entropy and the one-hot labels
            (the latter is reused by _backward).
        """
        n_samples = y_true.shape[0]

        # One-hot encode if needed (1-D integer label vector -> one-hot matrix)
        if y_true.ndim == 1:
            y_onehot = np.zeros_like(y_pred)
            y_onehot[np.arange(n_samples), y_true] = 1
        else:
            y_onehot = y_true

        # Cross-entropy; clip probabilities away from 0 so log() stays finite
        log_probs = np.log(np.clip(y_pred, 1e-15, 1.0))
        loss = -np.mean(np.sum(y_onehot * log_probs, axis=1))
        return loss, y_onehot

    def _backward(self, y_onehot):
        """Backpropagation: compute gradients for all layers.

        Fills self.dW and self.db (lists aligned index-for-index with
        self.weights / self.biases). Relies on the z_values / a_values
        cached by the immediately preceding _forward call.

        NOTE: the output-layer delta below uses the softmax + cross-entropy
        shortcut (dL/dz = probs - y_onehot), so the final activation is
        assumed to be softmax.
        """
        n_samples = y_onehot.shape[0]
        self.dW = []
        self.db = []

        # Output layer gradient (softmax + cross-entropy shortcut)
        delta = self.a_values[-1] - y_onehot  # (n_samples, n_classes)

        # Iterate backward through layers
        for i in range(self.n_layers - 1, -1, -1):
            # Gradients for weights and biases of layer i, averaged over the batch
            dW = (self.a_values[i].T @ delta) / n_samples
            db = np.mean(delta, axis=0, keepdims=True)

            # insert(0, ...) keeps the gradient lists in forward layer order
            self.dW.insert(0, dW)
            self.db.insert(0, db)

            # Propagate gradient to previous layer (if not input)
            if i > 0:
                delta = delta @ self.weights[i].T
                # Multiply by activation derivative
                _, act_deriv = ACTIVATIONS[self.activation_names[i-1]]
                delta = delta * act_deriv(self.z_values[i-1])

    def _update_sgd(self, lr):
        """Stochastic Gradient Descent update: W -= lr * dW for every layer."""
        for i in range(self.n_layers):
            self.weights[i] -= lr * self.dW[i]
            self.biases[i] -= lr * self.db[i]

    def fit(self, X, y, epochs=100, lr=0.01, batch_size=32,
            optimizer='sgd', verbose=True):
        """Train the neural network with mini-batch gradient descent.

        Args:
            X: training inputs, shape (n_samples, input_dim).
            y: class labels, shape (n_samples,) (integer labels).
            epochs: number of full passes over the data.
            lr: learning rate.
            batch_size: mini-batch size (the last batch may be smaller).
            optimizer: 'sgd' or 'adam'.
            verbose: if True, print loss and accuracy every 10 epochs.

        Returns:
            self, so calls can be chained.
        """
        n_samples = X.shape[0]

        # Initialize Adam parameters if needed
        if optimizer == 'adam':
            self._init_adam()

        self.loss_history = []

        for epoch in range(epochs):
            # Shuffle data so each epoch sees batches in a fresh order
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            epoch_loss = 0
            n_batches = 0

            for start in range(0, n_samples, batch_size):
                end = min(start + batch_size, n_samples)
                X_batch = X_shuffled[start:end]
                y_batch = y_shuffled[start:end]

                # Forward pass
                y_pred = self._forward(X_batch)

                # Compute loss
                loss, y_onehot = self._compute_loss(y_pred, y_batch)
                epoch_loss += loss
                n_batches += 1

                # Backward pass
                self._backward(y_onehot)

                # Update weights (one step per mini-batch)
                if optimizer == 'sgd':
                    self._update_sgd(lr)
                elif optimizer == 'adam':
                    self._update_adam(lr)

            avg_loss = epoch_loss / n_batches
            self.loss_history.append(avg_loss)

            if verbose and (epoch + 1) % 10 == 0:
                acc = self.accuracy(X, y)
                print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f} - Acc: {acc:.4f}")

        return self

    def _init_adam(self):
        """Initialize Adam optimizer state.

        m_* hold first-moment (mean) estimates, v_* hold second-moment
        (uncentered variance) estimates, and adam_t counts update steps
        for bias correction.
        """
        self.m_W = [np.zeros_like(W) for W in self.weights]
        self.v_W = [np.zeros_like(W) for W in self.weights]
        self.m_b = [np.zeros_like(b) for b in self.biases]
        self.v_b = [np.zeros_like(b) for b in self.biases]
        self.adam_t = 0

    def _update_adam(self, lr, beta1=0.9, beta2=0.999, eps=1e-8):
        """Adam optimizer update.

        Args:
            lr: learning rate.
            beta1: decay rate for the first-moment estimates.
            beta2: decay rate for the second-moment estimates.
            eps: small constant added to the denominator for stability.
        """
        self.adam_t += 1

        for i in range(self.n_layers):
            # Update moment estimates for weights
            self.m_W[i] = beta1 * self.m_W[i] + (1 - beta1) * self.dW[i]
            self.v_W[i] = beta2 * self.v_W[i] + (1 - beta2) * self.dW[i]**2

            # Bias correction (counters the zero-initialization of m/v)
            m_hat = self.m_W[i] / (1 - beta1**self.adam_t)
            v_hat = self.v_W[i] / (1 - beta2**self.adam_t)

            self.weights[i] -= lr * m_hat / (np.sqrt(v_hat) + eps)

            # Same for biases
            self.m_b[i] = beta1 * self.m_b[i] + (1 - beta1) * self.db[i]
            self.v_b[i] = beta2 * self.v_b[i] + (1 - beta2) * self.db[i]**2

            m_hat_b = self.m_b[i] / (1 - beta1**self.adam_t)
            v_hat_b = self.v_b[i] / (1 - beta2**self.adam_t)

            self.biases[i] -= lr * m_hat_b / (np.sqrt(v_hat_b) + eps)

    def predict(self, X):
        """Return the most probable class index for each row of X."""
        probs = self._forward(X)
        return np.argmax(probs, axis=1)

    def accuracy(self, X, y):
        """Fraction of samples whose predicted class equals the label in y."""
        return np.mean(self.predict(X) == y)


# ---- Test: XOR problem (non-linearly separable) ----
# XOR cannot be solved by a linear model, so this is a minimal sanity
# check that the hidden layers learn a non-linear decision boundary.
np.random.seed(42)  # fixes both the data noise and the weight initialization
X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_xor = np.array([0, 1, 1, 0])

# Repeat data for training
# (4 base points tiled 50x -> 200 samples, jittered with Gaussian noise, std 0.1)
X_train = np.tile(X_xor, (50, 1)) + np.random.randn(200, 2) * 0.1
y_train = np.tile(y_xor, 50)

# Two ReLU hidden layers (16 and 8 units) feeding a 2-class softmax output.
nn = NeuralNetwork([2, 16, 8, 2], activations=['relu', 'relu', 'softmax'])
nn.fit(X_train, y_train, epochs=100, lr=0.01, batch_size=32,
       optimizer='adam', verbose=True)

print(f"\nFinal accuracy: {nn.accuracy(X_train, y_train):.4f}")
print(f"XOR predictions: {nn.predict(X_xor)}")  # Should be [0, 1, 1, 0]
💡
Interview tip: The key insight of backpropagation is the chain rule. For any layer, the gradient flowing back is: (1) multiply by the transpose of that layer's weights, then (2) element-wise multiply by the activation derivative. If you can explain this clearly, you demonstrate deep understanding. Also note that softmax + cross-entropy has a clean gradient shortcut: dL/dz = probs - y_onehot.

Understanding Backpropagation

The chain rule applied layer by layer:

# For a 3-layer network: Input -> Hidden1 -> Hidden2 -> Output
#
# Forward:
#   z1 = X @ W1 + b1
#   a1 = relu(z1)
#   z2 = a1 @ W2 + b2
#   a2 = relu(z2)
#   z3 = a2 @ W3 + b3
#   a3 = softmax(z3)         # output probabilities
#   L = cross_entropy(a3, y)  # loss
#
# Backward (chain rule):
#   delta3 = a3 - y_onehot                    # dL/dz3 (softmax+CE shortcut)
#   dW3 = a2.T @ delta3 / n                   # dL/dW3
#   db3 = mean(delta3)                        # dL/db3
#
#   delta2 = (delta3 @ W3.T) * relu'(z2)      # dL/dz2
#   dW2 = a1.T @ delta2 / n                   # dL/dW2
#   db2 = mean(delta2)                        # dL/db2
#
#   delta1 = (delta2 @ W2.T) * relu'(z1)      # dL/dz1
#   dW1 = X.T @ delta1 / n                    # dL/dW1
#   db1 = mean(delta1)                        # dL/db1
#
# Each delta is: (gradient from above) @ (W.T) * activation_derivative
# Each dW is: (input to this layer).T @ delta / n_samples

Key Takeaways

💡
  • Forward pass: z = a_prev @ W + b, then a = activation(z)
  • Backward pass: delta propagates back via delta @ W.T * act_deriv(z)
  • Use He initialization for ReLU, Xavier for sigmoid/tanh
  • Adam optimizer: maintains running averages of gradients and squared gradients with bias correction
  • Softmax + cross-entropy gradient simplifies to probs - y_onehot
  • Mini-batch training: shuffle data, split into batches, update after each batch