Advanced

Implement Neural Networks

The most challenging ML coding interview question. Build a complete multi-layer perceptron from scratch with NumPy — including forward pass, backpropagation, activation functions, and SGD optimizer. Understanding every line of this code sets you apart from other candidates.

Activation Functions: Implement From Scratch

Before building the full network, you need to implement activation functions and their derivatives. Interviewers often start by asking you to code these individually.

📝
Interview Question #1: “Implement the sigmoid, ReLU, and softmax activation functions along with their derivatives. Explain when you would use each one.”
import numpy as np

class Activations:
    """Common neural-network activations together with their derivatives.

    All functions are vectorized NumPy operations, exposed as static
    methods so no instance state is needed.
    """

    @staticmethod
    def sigmoid(z):
        """Map values into (0, 1); the usual binary-output activation."""
        # Clamp extreme inputs so np.exp cannot overflow.
        safe = np.clip(z, -500, 500)
        return 1.0 / (1.0 + np.exp(-safe))

    @staticmethod
    def sigmoid_derivative(a):
        """Gradient of sigmoid, expressed via its own output a = sigmoid(z)."""
        return a * (1 - a)

    @staticmethod
    def relu(z):
        """Rectified linear unit: elementwise max(z, 0). Default hidden activation."""
        return np.maximum(z, 0)

    @staticmethod
    def relu_derivative(z):
        """Gradient of ReLU w.r.t. z; the kink at 0 is assigned gradient 0."""
        return np.where(z > 0, 1.0, 0.0)

    @staticmethod
    def tanh(z):
        """Hyperbolic tangent: zero-centered squashing into (-1, 1)."""
        return np.tanh(z)

    @staticmethod
    def tanh_derivative(a):
        """Gradient of tanh, expressed via its own output a = tanh(z)."""
        return 1 - a ** 2

    @staticmethod
    def softmax(z):
        """Row-wise softmax turning logits into probabilities that sum to 1."""
        # Shifting by the row max leaves the result unchanged but
        # keeps np.exp numerically stable.
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_shifted = np.exp(shifted)
        return exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)


# Sanity-check each activation on a small symmetric range of inputs.
z = np.array([[-2.0, -1.0, 0.0, 1.0, 2.0]])
for label, activation in (("Sigmoid", Activations.sigmoid),
                          ("ReLU", Activations.relu),
                          ("Tanh", Activations.tanh),
                          ("Softmax", Activations.softmax)):
    # Pad the label to column 9 so outputs line up as before.
    print(f"{label + ':':<9}{activation(z)}")
| Activation | Range | Use Case | Problem |
|---|---|---|---|
| Sigmoid | (0, 1) | Binary output layer | Vanishing gradients |
| ReLU | [0, ∞) | Hidden layers (default) | Dead neurons (dying ReLU) |
| Tanh | (-1, 1) | Hidden layers, RNNs | Vanishing gradients |
| Softmax | (0, 1), rows sum to 1 | Multi-class output | Numerical overflow |

📝
Interview Question #2: “Implement a 2-layer neural network (MLP) from scratch using only NumPy. It should handle binary classification with backpropagation and SGD. Walk me through the forward and backward passes.”

Full MLP: Complete Solution

import numpy as np

class NeuralNetwork:
    """
    Multi-layer Perceptron from scratch (NumPy only).

    Architecture: Input -> Hidden layers (ReLU) -> Output (Sigmoid)
    Loss: Binary Cross-Entropy (full-batch)
    Optimizer: SGD with optional momentum

    forward() caches activations and pre-activations on the instance;
    backward() consumes those caches, so it must follow a forward pass.
    """

    def __init__(self, layer_sizes, learning_rate=0.01, momentum=0.0):
        """
        Args:
            layer_sizes: list, e.g. [2, 64, 32, 1] for 2 inputs,
                         two hidden layers (64, 32), 1 output
            learning_rate: float, SGD step size
            momentum: float, SGD momentum (0 = vanilla SGD)
        """
        self.lr = learning_rate
        self.momentum = momentum
        self.weights = []       # weights[i] has shape (layer_sizes[i], layer_sizes[i+1])
        self.biases = []        # biases[i] has shape (1, layer_sizes[i+1]); broadcasts over batch
        self.velocities_w = []  # momentum buffers, same shapes as weights
        self.velocities_b = []  # momentum buffers, same shapes as biases

        # He initialization: scale = sqrt(2 / fan_in), suited to ReLU hidden
        # layers. Random (non-zero) init is required to break symmetry between
        # neurons in the same layer.
        for i in range(len(layer_sizes) - 1):
            # He initialization for ReLU layers
            scale = np.sqrt(2.0 / layer_sizes[i])
            W = np.random.randn(layer_sizes[i], layer_sizes[i+1]) * scale
            b = np.zeros((1, layer_sizes[i+1]))
            self.weights.append(W)
            self.biases.append(b)
            self.velocities_w.append(np.zeros_like(W))
            self.velocities_b.append(np.zeros_like(b))

        self.loss_history = []  # per-epoch training loss, appended by fit()

    def _sigmoid(self, z):
        """Sigmoid activation; clipping keeps np.exp from overflowing."""
        z = np.clip(z, -500, 500)
        return 1.0 / (1.0 + np.exp(-z))

    def _relu(self, z):
        """ReLU activation used on all hidden layers."""
        return np.maximum(0, z)

    def _relu_derivative(self, z):
        """ReLU gradient w.r.t. pre-activation z (gradient at z == 0 taken as 0)."""
        return (z > 0).astype(float)

    def forward(self, X):
        """
        Forward pass through the network.
        Stores intermediate values for backpropagation:
          self.activations — [X, a1, ..., output], post-activation per layer
          self.z_values    — [z1, ..., zL], pre-activation per layer

        Returns: output of the last layer, shape (n_samples, 1)
        """
        self.activations = [X]  # the input acts as "activation 0"
        self.z_values = []       # pre-activation values, needed by backward()

        current = X
        n_layers = len(self.weights)

        for i in range(n_layers):
            z = current @ self.weights[i] + self.biases[i]
            self.z_values.append(z)

            if i == n_layers - 1:
                # Output layer: sigmoid for binary classification
                current = self._sigmoid(z)
            else:
                # Hidden layers: ReLU
                current = self._relu(z)

            self.activations.append(current)

        return current

    def backward(self, X, y):
        """
        Backpropagation: compute gradients for all weights and biases.

        Must be called after forward(X) so self.activations / self.z_values
        hold the matching cached values.

        Returns:
            (gradients_w, gradients_b): lists aligned with self.weights /
            self.biases, each entry the mean gradient over the batch.
        """
        n_samples = X.shape[0]
        n_layers = len(self.weights)

        # Output layer gradient.
        # For sigmoid + binary cross-entropy, dL/dz simplifies to (output - y).
        output = self.activations[-1]
        delta = output - y.reshape(-1, 1)  # (n_samples, 1)

        gradients_w = []
        gradients_b = []

        # Traverse layers in reverse, reusing delta as "gradient w.r.t. this
        # layer's pre-activation".
        for i in range(n_layers - 1, -1, -1):
            # Mean gradient over the batch for weights and biases at layer i.
            dW = (1 / n_samples) * (self.activations[i].T @ delta)
            db = (1 / n_samples) * np.sum(delta, axis=0, keepdims=True)

            # insert(0, ...) keeps the gradient lists in forward-layer order.
            gradients_w.insert(0, dW)
            gradients_b.insert(0, db)

            if i > 0:
                # Chain rule: push delta through W_i, then through the ReLU
                # of the previous layer's pre-activation.
                delta = (delta @ self.weights[i].T) * \
                        self._relu_derivative(self.z_values[i-1])

        return gradients_w, gradients_b

    def _update_weights(self, gradients_w, gradients_b):
        """Apply one SGD step (with optional momentum) to all parameters."""
        for i in range(len(self.weights)):
            # Classical momentum: v <- momentum * v - lr * grad; param += v.
            # With momentum == 0 this reduces to vanilla SGD.
            self.velocities_w[i] = (self.momentum * self.velocities_w[i] -
                                     self.lr * gradients_w[i])
            self.velocities_b[i] = (self.momentum * self.velocities_b[i] -
                                     self.lr * gradients_b[i])

            self.weights[i] += self.velocities_w[i]
            self.biases[i] += self.velocities_b[i]

    def fit(self, X, y, epochs=1000, verbose=True):
        """Train with full-batch gradient descent; returns self for chaining."""
        for epoch in range(epochs):
            # Forward pass (also caches values needed by backward()).
            output = self.forward(X)

            # Binary cross-entropy; eps keeps log() away from exact 0.
            eps = 1e-15
            loss = -np.mean(
                y.reshape(-1, 1) * np.log(output + eps) +
                (1 - y.reshape(-1, 1)) * np.log(1 - output + eps)
            )
            self.loss_history.append(loss)

            # Backward pass
            gradients_w, gradients_b = self.backward(X, y)

            # Update weights
            self._update_weights(gradients_w, gradients_b)

            if verbose and epoch % 200 == 0:
                print(f"Epoch {epoch}: loss = {loss:.4f}")

        return self

    def predict(self, X, threshold=0.5):
        """Predict class labels (0/1) by thresholding the sigmoid output."""
        output = self.forward(X)
        return (output >= threshold).astype(int).flatten()


# ---- Test: Learn XOR (the classic neural network test) ----
# XOR is not linearly separable, so at least one hidden layer is required.
X = np.array([[0.0, 0.0],
              [0.0, 1.0],
              [1.0, 0.0],
              [1.0, 1.0]])
y = np.array([0.0, 1.0, 1.0, 0.0])  # XOR truth table

xor_net = NeuralNetwork(
    layer_sizes=[2, 8, 1],  # 2 inputs -> 8 hidden neurons -> 1 output
    learning_rate=0.5,
    momentum=0.9,
)
xor_net.fit(X, y, epochs=2000, verbose=True)

preds = xor_net.predict(X)
print(f"\nPredictions: {preds}")
print(f"Actual:      {y.astype(int)}")
print(f"Accuracy:    {np.mean(preds == y):.0%}")

What interviewers look for:

  • Correct forward pass — matrix multiplication, adding bias, applying activation
  • Correct backpropagation — computing gradients layer by layer in reverse order
  • Weight initialization — He initialization for ReLU, not zeros (all-zero weights fail to break symmetry, so every neuron in a layer stays identical)
  • Shape tracking — being able to explain the shape of every intermediate tensor
  • Stored activations — understanding that backprop needs the forward pass values
  • XOR test — knowing that XOR requires a hidden layer (not linearly separable)
Critical detail interviewers test: “Why can't you initialize all weights to zero?” Answer: All neurons in a layer would compute the same output, receive the same gradient, and update identically. The network would never learn different features — this is known as the symmetry problem, and random initialization is precisely what breaks the symmetry.

📝
Interview Question #3: “Extend your neural network to support multi-class classification using softmax and cross-entropy loss. Implement the backward pass for softmax.”

Multi-Class Extension with Softmax

class MultiClassNN:
    """
    Neural network with softmax output for multi-class classification.

    Architecture: Input -> Hidden layers (ReLU) -> Output (Softmax)
    Loss: categorical cross-entropy (full-batch)
    Optimizer: vanilla SGD

    Records per-epoch training loss in self.loss_history, mirroring
    NeuralNetwork for a consistent API across both implementations.
    """

    def __init__(self, layer_sizes, learning_rate=0.01):
        """
        Args:
            layer_sizes: list of layer widths, e.g. [2, 32, 3] for 2 inputs,
                         one hidden layer of 32, 3 output classes
            learning_rate: float, SGD step size
        """
        self.lr = learning_rate
        self.weights = []      # weights[i]: (layer_sizes[i], layer_sizes[i+1])
        self.biases = []       # biases[i]: (1, layer_sizes[i+1])
        self.loss_history = []  # per-epoch loss, appended by fit() (matches NeuralNetwork)

        for i in range(len(layer_sizes) - 1):
            # He initialization (sqrt(2 / fan_in)), suited to ReLU hidden layers.
            scale = np.sqrt(2.0 / layer_sizes[i])
            W = np.random.randn(layer_sizes[i], layer_sizes[i+1]) * scale
            b = np.zeros((1, layer_sizes[i+1]))
            self.weights.append(W)
            self.biases.append(b)

    def _relu(self, z):
        """ReLU activation for hidden layers."""
        return np.maximum(0, z)

    def _relu_derivative(self, z):
        """ReLU gradient w.r.t. pre-activation z (0 at z == 0)."""
        return (z > 0).astype(float)

    def _softmax(self, z):
        """Row-wise softmax; subtracting the row max guards against overflow."""
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def _one_hot(self, y, n_classes):
        """Convert integer labels to a (len(y), n_classes) one-hot matrix."""
        one_hot = np.zeros((len(y), n_classes))
        one_hot[np.arange(len(y)), y.astype(int)] = 1
        return one_hot

    def forward(self, X):
        """Forward pass; caches activations and pre-activations for backprop.

        Returns: class probabilities, shape (n_samples, n_classes).
        """
        self.activations = [X]
        self.z_values = []
        current = X

        for i in range(len(self.weights)):
            z = current @ self.weights[i] + self.biases[i]
            self.z_values.append(z)

            if i == len(self.weights) - 1:
                current = self._softmax(z)  # Softmax for output
            else:
                current = self._relu(z)

            self.activations.append(current)
        return current

    def fit(self, X, y, epochs=1000, verbose=True):
        """Train with full-batch SGD; appends each epoch's loss to loss_history.

        Args:
            X: (n_samples, n_features) inputs
            y: integer class labels (any numeric dtype)
            epochs: number of full-batch passes
            verbose: print loss every 200 epochs

        Returns: self, for chaining.
        """
        n_classes = len(np.unique(y))
        y_one_hot = self._one_hot(y, n_classes)
        n = X.shape[0]  # loop-invariant: hoisted out of the epoch loop

        for epoch in range(epochs):
            # Forward
            output = self.forward(X)

            # Cross-entropy loss; eps keeps log() away from exact 0.
            eps = 1e-15
            loss = -np.mean(np.sum(y_one_hot * np.log(output + eps), axis=1))
            self.loss_history.append(loss)

            # Backward - softmax + cross-entropy gradient simplifies to:
            delta = (output - y_one_hot) / n  # Same elegant form!

            for i in range(len(self.weights) - 1, -1, -1):
                dW = self.activations[i].T @ delta
                db = np.sum(delta, axis=0, keepdims=True)

                if i > 0:
                    # Propagate delta through the *pre-update* weights —
                    # this is the correct backprop order.
                    delta = (delta @ self.weights[i].T) * \
                            self._relu_derivative(self.z_values[i-1])

                self.weights[i] -= self.lr * dW
                self.biases[i] -= self.lr * db

            if verbose and epoch % 200 == 0:
                print(f"Epoch {epoch}: loss = {loss:.4f}")

        return self

    def predict(self, X):
        """Return the most probable class index for each row of X."""
        probs = self.forward(X)
        return np.argmax(probs, axis=1)


# ---- Test: 3-class classification ----
np.random.seed(42)
# Three Gaussian blobs centered at distinct points in the plane.
cluster_centers = ([0, 0], [3, 3], [6, 0])
X = np.vstack([np.random.randn(50, 2) + center for center in cluster_centers])
y = np.repeat(np.arange(3, dtype=float), 50)  # 50 labels per class: 0, 1, 2

nn = MultiClassNN(layer_sizes=[2, 32, 3], learning_rate=0.05)
nn.fit(X, y, epochs=1000)
preds = nn.predict(X)
print(f"Accuracy: {np.mean(preds == y):.2%}")
💡
Key insight to share with the interviewer: The gradient of softmax + cross-entropy loss simplifies to (predicted - actual), which is the same elegant form as sigmoid + binary cross-entropy. This is not a coincidence — both are special cases of the exponential family, and this simplification is why these loss-activation pairs are preferred in practice.

📝
Interview Question #4: “Walk me through the shapes of all tensors during a forward pass through a network with input size 10, hidden layer size 64, and output size 3. What is the total number of trainable parameters?”

Shape Analysis (Frequently Asked)

# Network: [10, 64, 3]
# Input X: (batch_size, 10)

# Layer 1: Input -> Hidden
# W1: (10, 64)    -> 640 parameters
# b1: (1, 64)     -> 64 parameters
# z1 = X @ W1 + b1  -> (batch_size, 64)
# a1 = relu(z1)     -> (batch_size, 64)

# Layer 2: Hidden -> Output
# W2: (64, 3)     -> 192 parameters
# b2: (1, 3)      -> 3 parameters
# z2 = a1 @ W2 + b2 -> (batch_size, 3)
# a2 = softmax(z2)  -> (batch_size, 3)

# Total parameters: 640 + 64 + 192 + 3 = 899

def count_parameters(layer_sizes):
    """Count total trainable parameters in an MLP.

    Each consecutive pair of layers contributes a weight matrix
    (n_in * n_out) plus a bias vector (n_out). Prints a per-layer
    breakdown and returns the grand total.
    """
    total = 0
    for idx, (n_in, n_out) in enumerate(zip(layer_sizes, layer_sizes[1:]), start=1):
        layer_params = n_in * n_out + n_out
        total += layer_params
        print(f"Layer {idx}: W({n_in}x{n_out}) + "
              f"b({n_out}) = {layer_params} params")
    print(f"Total: {total} parameters")
    return total

count_parameters([10, 64, 3])