Advanced
Neural Networks
Building a neural network from scratch is the ultimate ML interview challenge. It tests your understanding of forward propagation, backpropagation via chain rule, weight initialization, activation functions, and optimization. We implement a complete multi-layer perceptron (MLP) with SGD and Adam optimizers.
Activation Functions
Every activation function needs both a forward pass and its derivative for backpropagation.
import numpy as np
# ---- Activation Functions and Their Derivatives ----
def relu(z):
    """Rectified linear unit, element-wise: returns z where z > 0, else 0."""
    return np.clip(z, 0, None)
def relu_derivative(z):
    """Subgradient of ReLU: 1.0 where z > 0, 0.0 elsewhere (0 at z == 0)."""
    return np.where(z > 0, 1.0, 0.0)
def sigmoid(z):
    """Logistic function 1 / (1 + e^-z), element-wise.

    Inputs are clipped to [-500, 500] first so np.exp cannot overflow.
    """
    safe_z = np.clip(z, -500, 500)
    return 1.0 / (1.0 + np.exp(-safe_z))
def sigmoid_derivative(z):
    """Derivative of the logistic function: sigmoid(z) * (1 - sigmoid(z))."""
    sig = sigmoid(z)
    return (1.0 - sig) * sig
def tanh(z):
    """Hyperbolic tangent, element-wise (thin wrapper over np.tanh)."""
    return np.tanh(z)
def tanh_derivative(z):
    """Derivative of tanh: 1 - tanh(z)^2, element-wise."""
    th = np.tanh(z)
    return 1.0 - th * th
def softmax(z):
    """Row-wise softmax over a 2-D array, numerically stable.

    Subtracting each row's max before exponentiating leaves the result
    mathematically unchanged but prevents np.exp from overflowing.
    """
    shifted = z - z.max(axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)
# Map names to functions for easy lookup: name -> (forward_fn, derivative_fn).
# NOTE: 'softmax' is intentionally absent — the network's forward pass
# special-cases it, and its gradient is folded into the cross-entropy
# shortcut in the backward pass.
ACTIVATIONS = {
    'relu': (relu, relu_derivative),
    'sigmoid': (sigmoid, sigmoid_derivative),
    'tanh': (tanh, tanh_derivative),
}
Weight Initialization
Proper weight initialization is critical. Bad initialization causes vanishing or exploding gradients.
def initialize_weights(n_in, n_out, method='he'):
    """Build an (n_in, n_out) weight matrix.

    Methods:
    - 'he': He initialization (best for ReLU) - W ~ N(0, sqrt(2/n_in))
    - 'xavier': Xavier/Glorot (best for sigmoid/tanh) - W ~ N(0, sqrt(2/(n_in+n_out)))
    - 'zeros': Zero initialization (BAD - never use for hidden layers)
    - any other string: small random values scaled by 0.01
    """
    if method == 'zeros':
        return np.zeros((n_in, n_out))
    if method == 'he':
        scale = np.sqrt(2.0 / n_in)
    elif method == 'xavier':
        scale = np.sqrt(2.0 / (n_in + n_out))
    else:
        scale = 0.01
    return np.random.randn(n_in, n_out) * scale
Full MLP Implementation
class NeuralNetwork:
    """Multi-Layer Perceptron from scratch.

    Fully-connected feed-forward classifier trained with mini-batch
    gradient descent (plain SGD or Adam) on a cross-entropy loss.
    The backward pass assumes the final layer is softmax: it uses the
    softmax + cross-entropy gradient shortcut (probs - y_onehot).

    Example:
        nn = NeuralNetwork([2, 64, 32, 3], activations=['relu', 'relu', 'softmax'])
        nn.fit(X_train, y_train, epochs=100, lr=0.01)
        predictions = nn.predict(X_test)
    """
    def __init__(self, layer_sizes, activations=None):
        """
        Args:
            layer_sizes: list of ints, e.g., [input_dim, 64, 32, output_dim]
            activations: list of activation names for each layer (except input)
        """
        self.layer_sizes = layer_sizes
        # Number of weight layers = number of transitions between sizes.
        self.n_layers = len(layer_sizes) - 1
        # Default activations: ReLU for hidden, softmax for output
        if activations is None:
            activations = ['relu'] * (self.n_layers - 1) + ['softmax']
        self.activation_names = activations
        # Initialize weights and biases: He init for ReLU layers, Xavier
        # otherwise (including the softmax output layer).
        self.weights = []
        self.biases = []
        for i in range(self.n_layers):
            init_method = 'he' if activations[i] == 'relu' else 'xavier'
            W = initialize_weights(layer_sizes[i], layer_sizes[i+1], init_method)
            b = np.zeros((1, layer_sizes[i+1]))
            self.weights.append(W)
            self.biases.append(b)
        self.loss_history = []

    def _forward(self, X):
        """Forward pass through all layers.

        Caches pre-activations (z_values) and post-activations (a_values)
        on self for use by _backward. Returns the output layer activation.
        """
        self.z_values = []  # Pre-activation values
        self.a_values = [X]  # Post-activation values (input is first)
        current = X
        for i in range(self.n_layers):
            z = current @ self.weights[i] + self.biases[i]
            self.z_values.append(z)
            # Softmax is not in ACTIVATIONS (its gradient is handled by the
            # cross-entropy shortcut in _backward), so special-case it here.
            if self.activation_names[i] == 'softmax':
                a = softmax(z)
            else:
                act_fn, _ = ACTIVATIONS[self.activation_names[i]]
                a = act_fn(z)
            self.a_values.append(a)
            current = a
        return current

    def _compute_loss(self, y_pred, y_true):
        """Cross-entropy loss for classification.

        Returns (loss, y_onehot) — the one-hot labels are reused by
        _backward, so they are built once here.
        """
        n_samples = y_true.shape[0]
        # One-hot encode if needed (1-D y_true is taken as integer class ids)
        if y_true.ndim == 1:
            y_onehot = np.zeros_like(y_pred)
            y_onehot[np.arange(n_samples), y_true] = 1
        else:
            y_onehot = y_true
        # Cross-entropy; clip probabilities away from 0 so log() is finite.
        log_probs = np.log(np.clip(y_pred, 1e-15, 1.0))
        loss = -np.mean(np.sum(y_onehot * log_probs, axis=1))
        return loss, y_onehot

    def _backward(self, y_onehot):
        """Backpropagation: compute gradients for all layers.

        Fills self.dW / self.db (ordered first layer -> last). Must be
        called right after _forward, which cached z_values / a_values.
        """
        n_samples = y_onehot.shape[0]
        self.dW = []
        self.db = []
        # Output layer gradient (softmax + cross-entropy shortcut):
        # dL/dz_out = probs - y_onehot.
        delta = self.a_values[-1] - y_onehot  # (n_samples, n_classes)
        # Iterate backward through layers
        for i in range(self.n_layers - 1, -1, -1):
            # Gradients for weights and biases of layer i
            dW = (self.a_values[i].T @ delta) / n_samples
            db = np.mean(delta, axis=0, keepdims=True)
            # insert(0, ...) keeps dW/db in forward layer order.
            self.dW.insert(0, dW)
            self.db.insert(0, db)
            # Propagate gradient to previous layer (if not input)
            if i > 0:
                delta = delta @ self.weights[i].T
                # Multiply by activation derivative (chain rule)
                _, act_deriv = ACTIVATIONS[self.activation_names[i-1]]
                delta = delta * act_deriv(self.z_values[i-1])

    def _update_sgd(self, lr):
        """Stochastic Gradient Descent update: W -= lr * dW."""
        for i in range(self.n_layers):
            self.weights[i] -= lr * self.dW[i]
            self.biases[i] -= lr * self.db[i]

    def fit(self, X, y, epochs=100, lr=0.01, batch_size=32,
            optimizer='sgd', verbose=True):
        """Train the neural network.

        Args:
            X: (n_samples, input_dim) feature matrix
            y: integer class labels (or one-hot, see _compute_loss)
            epochs: full passes over the data
            lr: learning rate
            batch_size: mini-batch size
            optimizer: 'sgd' or 'adam'
            verbose: print loss/accuracy every 10 epochs

        Returns:
            self, with loss_history populated (mean batch loss per epoch).
        """
        n_samples = X.shape[0]
        # Initialize Adam parameters if needed (fresh state on every fit call)
        if optimizer == 'adam':
            self._init_adam()
        self.loss_history = []
        for epoch in range(epochs):
            # Shuffle data so batches differ between epochs
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]
            epoch_loss = 0
            n_batches = 0
            for start in range(0, n_samples, batch_size):
                # min() lets the last batch be smaller than batch_size.
                end = min(start + batch_size, n_samples)
                X_batch = X_shuffled[start:end]
                y_batch = y_shuffled[start:end]
                # Forward pass
                y_pred = self._forward(X_batch)
                # Compute loss
                loss, y_onehot = self._compute_loss(y_pred, y_batch)
                epoch_loss += loss
                n_batches += 1
                # Backward pass
                self._backward(y_onehot)
                # Update weights
                if optimizer == 'sgd':
                    self._update_sgd(lr)
                elif optimizer == 'adam':
                    self._update_adam(lr)
            avg_loss = epoch_loss / n_batches
            self.loss_history.append(avg_loss)
            if verbose and (epoch + 1) % 10 == 0:
                acc = self.accuracy(X, y)
                print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f} - Acc: {acc:.4f}")
        return self

    def _init_adam(self):
        """Initialize Adam optimizer state (first/second moments and step count)."""
        self.m_W = [np.zeros_like(W) for W in self.weights]
        self.v_W = [np.zeros_like(W) for W in self.weights]
        self.m_b = [np.zeros_like(b) for b in self.biases]
        self.v_b = [np.zeros_like(b) for b in self.biases]
        self.adam_t = 0

    def _update_adam(self, lr, beta1=0.9, beta2=0.999, eps=1e-8):
        """Adam optimizer update.

        Maintains exponential moving averages of gradients (m) and squared
        gradients (v); the (1 - beta^t) terms correct the zero-initialization
        bias of those averages, which matters most in early steps.
        """
        self.adam_t += 1
        for i in range(self.n_layers):
            # Update moment estimates for weights
            self.m_W[i] = beta1 * self.m_W[i] + (1 - beta1) * self.dW[i]
            self.v_W[i] = beta2 * self.v_W[i] + (1 - beta2) * self.dW[i]**2
            # Bias correction
            m_hat = self.m_W[i] / (1 - beta1**self.adam_t)
            v_hat = self.v_W[i] / (1 - beta2**self.adam_t)
            self.weights[i] -= lr * m_hat / (np.sqrt(v_hat) + eps)
            # Same for biases
            self.m_b[i] = beta1 * self.m_b[i] + (1 - beta1) * self.db[i]
            self.v_b[i] = beta2 * self.v_b[i] + (1 - beta2) * self.db[i]**2
            m_hat_b = self.m_b[i] / (1 - beta1**self.adam_t)
            v_hat_b = self.v_b[i] / (1 - beta2**self.adam_t)
            self.biases[i] -= lr * m_hat_b / (np.sqrt(v_hat_b) + eps)

    def predict(self, X):
        """Return the predicted class index (argmax of output probabilities) per row."""
        probs = self._forward(X)
        return np.argmax(probs, axis=1)

    def accuracy(self, X, y):
        """Fraction of samples whose predicted class equals y.

        NOTE(review): assumes y holds integer class labels, not one-hot rows —
        confirm at call sites.
        """
        return np.mean(self.predict(X) == y)
# ---- Test: XOR problem (non-linearly separable) ----
np.random.seed(42)

# The four XOR corner points and their labels.
xor_inputs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
xor_labels = np.array([0, 1, 1, 0])

# Training set: 50 jittered copies of each corner (Gaussian noise, sd=0.1).
X_train = np.tile(xor_inputs, (50, 1)) + np.random.randn(200, 2) * 0.1
y_train = np.tile(xor_labels, 50)

nn = NeuralNetwork([2, 16, 8, 2], activations=['relu', 'relu', 'softmax'])
nn.fit(X_train, y_train, epochs=100, lr=0.01, batch_size=32,
       optimizer='adam', verbose=True)

print(f"\nFinal accuracy: {nn.accuracy(X_train, y_train):.4f}")
print(f"XOR predictions: {nn.predict(xor_inputs)}")  # Should be [0, 1, 1, 0]
Interview tip: The key insight of backpropagation is the chain rule. For any layer, the gradient flowing back is: (1) multiply by the transpose of that layer's weights, then (2) element-wise multiply by the activation derivative. If you can explain this clearly, you demonstrate deep understanding. Also note that softmax + cross-entropy has a clean gradient shortcut:
dL/dz = probs - y_onehot.
Understanding Backpropagation
The chain rule applied layer by layer:
# For a 3-layer network: Input -> Hidden1 -> Hidden2 -> Output
#
# Forward:
# z1 = X @ W1 + b1
# a1 = relu(z1)
# z2 = a1 @ W2 + b2
# a2 = relu(z2)
# z3 = a2 @ W3 + b3
# a3 = softmax(z3) # output probabilities
# L = cross_entropy(a3, y) # loss
#
# Backward (chain rule):
# delta3 = a3 - y_onehot # dL/dz3 (softmax+CE shortcut)
# dW3 = a2.T @ delta3 / n # dL/dW3
# db3 = mean(delta3) # dL/db3
#
# delta2 = (delta3 @ W3.T) * relu'(z2) # dL/dz2
# dW2 = a1.T @ delta2 / n # dL/dW2
# db2 = mean(delta2) # dL/db2
#
# delta1 = (delta2 @ W2.T) * relu'(z1) # dL/dz1
# dW1 = X.T @ delta1 / n # dL/dW1
# db1 = mean(delta1) # dL/db1
#
# Each delta is: (gradient from above) @ (W.T) * activation_derivative
# Each dW is: (input to this layer).T @ delta / n_samples
Key Takeaways
- Forward pass: z = a_prev @ W + b, then a = activation(z)
- Backward pass: delta propagates back via delta @ W.T * act_deriv(z)
- Use He initialization for ReLU, Xavier for sigmoid/tanh
- Adam optimizer: maintains running averages of gradients and squared gradients with bias correction
- Softmax + cross-entropy gradient simplifies to probs - y_onehot
- Mini-batch training: shuffle data, split into batches, update after each batch
Lilly Tech Systems