Advanced
Implement Neural Networks
The most challenging ML coding interview question. Build a complete multi-layer perceptron from scratch with NumPy — including forward pass, backpropagation, activation functions, and SGD optimizer. Understanding every line of this code sets you apart from other candidates.
Activation Functions: Implement From Scratch
Before building the full network, you need to implement activation functions and their derivatives. Interviewers often start by asking you to code these individually.
Interview Question #1: “Implement the sigmoid, ReLU, and softmax activation functions along with their derivatives. Explain when you would use each one.”
import numpy as np
class Activations:
    """Common neural-network activation functions and their derivatives."""

    @staticmethod
    def sigmoid(z):
        """Sigmoid: squashes values to (0, 1). Used for binary output."""
        safe = np.clip(z, -500, 500)  # keep exp() from overflowing
        return 1.0 / (1.0 + np.exp(-safe))

    @staticmethod
    def sigmoid_derivative(a):
        """Derivative of sigmoid given its output a = sigmoid(z)."""
        return a * (1.0 - a)

    @staticmethod
    def relu(z):
        """ReLU: max(0, z). Default choice for hidden layers."""
        return np.maximum(z, 0)

    @staticmethod
    def relu_derivative(z):
        """Derivative of ReLU. Note: undefined at 0, we use 0."""
        return (z > 0).astype(float)

    @staticmethod
    def tanh(z):
        """Tanh: squashes values to (-1, 1). Centered at zero."""
        return np.tanh(z)

    @staticmethod
    def tanh_derivative(a):
        """Derivative of tanh given its output a = tanh(z)."""
        return 1.0 - a ** 2

    @staticmethod
    def softmax(z):
        """Softmax: converts logits to probabilities. Used for multi-class output."""
        # Shift each row by its max so exp() stays numerically stable
        shifted = np.exp(z - np.max(z, axis=1, keepdims=True))
        return shifted / np.sum(shifted, axis=1, keepdims=True)
# Sanity-check each activation on a small symmetric range of inputs
sample = np.array([[-2.0, -1.0, 0.0, 1.0, 2.0]])
for label, fn in [("Sigmoid", Activations.sigmoid),
                  ("ReLU", Activations.relu),
                  ("Tanh", Activations.tanh),
                  ("Softmax", Activations.softmax)]:
    print(f"{label}: {fn(sample)}")
| Activation | Range | Use Case | Problem |
|---|---|---|---|
| Sigmoid | (0, 1) | Binary output layer | Vanishing gradients |
| ReLU | [0, infinity) | Hidden layers (default) | Dead neurons (dying ReLU) |
| Tanh | (-1, 1) | Hidden layers, RNNs | Vanishing gradients |
| Softmax | (0, 1), sums to 1 | Multi-class output | Numerical overflow |
Interview Question #2: “Implement a 2-layer neural network (MLP) from scratch using only NumPy. It should handle binary classification with backpropagation and SGD. Walk me through the forward and backward passes.”
Full MLP: Complete Solution
import numpy as np
class NeuralNetwork:
    """
    Multi-layer Perceptron built from scratch with NumPy.
    Architecture: Input -> Hidden (ReLU) -> Output (Sigmoid)
    Loss: Binary Cross-Entropy
    Optimizer: SGD with optional momentum
    """

    def __init__(self, layer_sizes, learning_rate=0.01, momentum=0.0):
        """
        Args:
            layer_sizes: list, e.g. [2, 64, 32, 1] for 2 inputs,
                         two hidden layers (64, 32), 1 output
            learning_rate: float
            momentum: float, SGD momentum (0 = vanilla SGD)
        """
        self.lr = learning_rate
        self.momentum = momentum
        self.weights = []
        self.biases = []
        self.velocities_w = []
        self.velocities_b = []
        # He initialization (scale = sqrt(2 / fan_in)), suited to ReLU layers
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            scale = np.sqrt(2.0 / fan_in)
            W = np.random.randn(fan_in, fan_out) * scale
            b = np.zeros((1, fan_out))
            self.weights.append(W)
            self.biases.append(b)
            self.velocities_w.append(np.zeros_like(W))
            self.velocities_b.append(np.zeros_like(b))
        self.loss_history = []

    def _sigmoid(self, z):
        # Clip to keep exp() from overflowing on extreme logits
        safe = np.clip(z, -500, 500)
        return 1.0 / (1.0 + np.exp(-safe))

    def _relu(self, z):
        return np.maximum(z, 0)

    def _relu_derivative(self, z):
        return (z > 0).astype(float)

    def forward(self, X):
        """
        Run the forward pass, caching pre-activations (z) and activations
        so that backward() can reuse them.
        Returns: output of the last layer.
        """
        self.activations = [X]  # input counts as the first "activation"
        self.z_values = []
        out = X
        last = len(self.weights) - 1
        for idx, (W, b) in enumerate(zip(self.weights, self.biases)):
            z = out @ W + b
            self.z_values.append(z)
            # Sigmoid on the output layer (binary classification), ReLU elsewhere
            out = self._sigmoid(z) if idx == last else self._relu(z)
            self.activations.append(out)
        return out

    def backward(self, X, y):
        """
        Backpropagation: compute gradients for every weight matrix and bias.
        Returns (gradients_w, gradients_b), ordered from first to last layer.
        """
        n_samples = X.shape[0]
        # For sigmoid + binary cross-entropy the output-layer error
        # collapses to (prediction - target)
        delta = self.activations[-1] - y.reshape(-1, 1)  # (n_samples, 1)
        gradients_w = []
        gradients_b = []
        # Walk the layers back-to-front, prepending so order matches weights
        for i in reversed(range(len(self.weights))):
            gradients_w.insert(0, (1 / n_samples) * (self.activations[i].T @ delta))
            gradients_b.insert(0, (1 / n_samples) * np.sum(delta, axis=0, keepdims=True))
            if i > 0:
                # Push the error through the weights and the ReLU derivative
                delta = (delta @ self.weights[i].T) * \
                    self._relu_derivative(self.z_values[i - 1])
        return gradients_w, gradients_b

    def _update_weights(self, gradients_w, gradients_b):
        """Apply one SGD step (with optional momentum) to every layer."""
        for i, (dW, db) in enumerate(zip(gradients_w, gradients_b)):
            self.velocities_w[i] = self.momentum * self.velocities_w[i] - self.lr * dW
            self.velocities_b[i] = self.momentum * self.velocities_b[i] - self.lr * db
            self.weights[i] += self.velocities_w[i]
            self.biases[i] += self.velocities_b[i]

    def fit(self, X, y, epochs=1000, verbose=True):
        """Train with full-batch gradient descent; returns self."""
        targets = y.reshape(-1, 1)  # column vector, invariant across epochs
        for epoch in range(epochs):
            output = self.forward(X)
            # Binary cross-entropy; eps keeps log() away from zero
            eps = 1e-15
            loss = -np.mean(
                targets * np.log(output + eps) +
                (1 - targets) * np.log(1 - output + eps)
            )
            self.loss_history.append(loss)
            grads_w, grads_b = self.backward(X, y)
            self._update_weights(grads_w, grads_b)
            if verbose and epoch % 200 == 0:
                print(f"Epoch {epoch}: loss = {loss:.4f}")
        return self

    def predict(self, X, threshold=0.5):
        """Return hard 0/1 labels by thresholding the network output."""
        probs = self.forward(X)
        return (probs >= threshold).astype(int).flatten()
# ---- Test: Learn XOR (the classic neural network test) ----
# XOR is not linearly separable, so a hidden layer is required.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
y = np.array([0, 1, 1, 0], dtype=float)  # XOR truth table
nn = NeuralNetwork(layer_sizes=[2, 8, 1],  # 2 inputs, 8 hidden neurons, 1 output
                   learning_rate=0.5,
                   momentum=0.9)
nn.fit(X, y, epochs=2000, verbose=True)
predictions = nn.predict(X)
print(f"\nPredictions: {predictions}")
print(f"Actual: {y.astype(int)}")
print(f"Accuracy: {np.mean(predictions == y):.0%}")
What interviewers look for:
- Correct forward pass — matrix multiplication, adding bias, applying activation
- Correct backpropagation — computing gradients layer by layer in reverse order
- Weight initialization — He initialization for ReLU, not zeros (all-zero weights fail to break symmetry, so neurons never differentiate)
- Shape tracking — being able to explain the shape of every intermediate tensor
- Stored activations — understanding that backprop needs the forward pass values
- XOR test — knowing that XOR requires a hidden layer (not linearly separable)
Critical detail interviewers test: “Why can't you initialize all weights to zero?” Answer: All neurons in a layer would compute the same output, receive the same gradient, and update identically. The network would never learn different features — this is called the symmetry breaking problem. Random initialization breaks this symmetry.
Interview Question #3: “Extend your neural network to support multi-class classification using softmax and cross-entropy loss. Implement the backward pass for softmax.”
Multi-Class Extension with Softmax
class MultiClassNN:
    """
    Neural network with softmax output for multi-class classification.
    Hidden layers use ReLU; training minimizes categorical cross-entropy.
    """

    def __init__(self, layer_sizes, learning_rate=0.01):
        self.lr = learning_rate
        self.weights = []
        self.biases = []
        # He initialization: one (W, b) pair per consecutive layer pair
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            scale = np.sqrt(2.0 / fan_in)
            self.weights.append(np.random.randn(fan_in, fan_out) * scale)
            self.biases.append(np.zeros((1, fan_out)))

    def _relu(self, z):
        return np.maximum(z, 0)

    def _relu_derivative(self, z):
        return (z > 0).astype(float)

    def _softmax(self, z):
        # Shift by the row max so exp() cannot overflow
        shifted = np.exp(z - np.max(z, axis=1, keepdims=True))
        return shifted / np.sum(shifted, axis=1, keepdims=True)

    def _one_hot(self, y, n_classes):
        """Convert integer labels to one-hot encoded rows."""
        encoded = np.zeros((len(y), n_classes))
        encoded[np.arange(len(y)), y.astype(int)] = 1
        return encoded

    def forward(self, X):
        """Forward pass; caches z values and activations for backprop."""
        self.activations = [X]
        self.z_values = []
        out = X
        last = len(self.weights) - 1
        for idx, (W, b) in enumerate(zip(self.weights, self.biases)):
            z = out @ W + b
            self.z_values.append(z)
            # Softmax on the output layer, ReLU on hidden layers
            out = self._softmax(z) if idx == last else self._relu(z)
            self.activations.append(out)
        return out

    def fit(self, X, y, epochs=1000, verbose=True):
        """Train with full-batch gradient descent; returns self."""
        n_classes = len(np.unique(y))
        y_one_hot = self._one_hot(y, n_classes)
        for epoch in range(epochs):
            output = self.forward(X)
            n = X.shape[0]
            # Categorical cross-entropy; eps keeps log() away from zero
            eps = 1e-15
            loss = -np.mean(np.sum(y_one_hot * np.log(output + eps), axis=1))
            # Softmax + cross-entropy gradient collapses to (pred - target) / n
            delta = (output - y_one_hot) / n
            for i in reversed(range(len(self.weights))):
                dW = self.activations[i].T @ delta
                db = np.sum(delta, axis=0, keepdims=True)
                if i > 0:
                    # Propagate error before this layer's weights are updated
                    delta = (delta @ self.weights[i].T) * \
                        self._relu_derivative(self.z_values[i - 1])
                self.weights[i] -= self.lr * dW
                self.biases[i] -= self.lr * db
            if verbose and epoch % 200 == 0:
                print(f"Epoch {epoch}: loss = {loss:.4f}")
        return self

    def predict(self, X):
        """Return the most probable class index for each sample."""
        return np.argmax(self.forward(X), axis=1)
# ---- Test: 3-class classification ----
# Three Gaussian blobs centered at distinct points in the plane
np.random.seed(42)
centers = [[0, 0], [3, 3], [6, 0]]
X = np.vstack([np.random.randn(50, 2) + c for c in centers])
y = np.repeat(np.arange(3, dtype=float), 50)
nn = MultiClassNN(layer_sizes=[2, 32, 3], learning_rate=0.05)
nn.fit(X, y, epochs=1000)
preds = nn.predict(X)
print(f"Accuracy: {np.mean(preds == y):.2%}")
Key insight to share with the interviewer: The gradient of softmax + cross-entropy loss simplifies to
(predicted - actual), which is the same elegant form as sigmoid + binary cross-entropy. This is not a coincidence — both are special cases of the exponential family, and this simplification is why these loss-activation pairs are preferred in practice.

Interview Question #4: “Walk me through the shapes of all tensors during a forward pass through a network with input size 10, hidden layer size 64, and output size 3. What is the total number of trainable parameters?”
Shape Analysis (Frequently Asked)
# Network: [10, 64, 3]
# Input X: (batch_size, 10)
# Layer 1: Input -> Hidden
# W1: (10, 64) -> 640 parameters
# b1: (1, 64) -> 64 parameters
# z1 = X @ W1 + b1 -> (batch_size, 64)
# a1 = relu(z1) -> (batch_size, 64)
# Layer 2: Hidden -> Output
# W2: (64, 3) -> 192 parameters
# b2: (1, 3) -> 3 parameters
# z2 = a1 @ W2 + b2 -> (batch_size, 3)
# a2 = softmax(z2) -> (batch_size, 3)
# Total parameters: 640 + 64 + 192 + 3 = 899
def count_parameters(layer_sizes):
    """Count total trainable parameters in an MLP."""
    total = 0
    for i, (n_in, n_out) in enumerate(zip(layer_sizes[:-1], layer_sizes[1:])):
        layer_params = n_in * n_out + n_out  # weight matrix plus bias vector
        total += layer_params
        print(f"Layer {i+1}: W({n_in}x{n_out}) + "
              f"b({n_out}) = {layer_params} params")
    print(f"Total: {total} parameters")
    return total
count_parameters([10, 64, 3])
Lilly Tech Systems