Calculus & Gradients
Gradient computation is the engine of machine learning. Every model learns by computing gradients and updating parameters. This lesson implements six essential gradient-related algorithms from scratch: numerical differentiation, automatic differentiation, the chain rule, Jacobians, Hessians, and gradient verification.
Problem 1: Numerical Gradient
Problem: Compute the gradient of a function f: R^n -> R at a point x using finite differences. This is the simplest way to approximate derivatives and serves as a ground truth for testing analytical gradients.
def numerical_gradient(f, x, h=1e-5):
    """
    Approximate the gradient of f at x with central differences.

    f: scalar-valued function R^n -> R taking a list of n floats
    x: evaluation point (list of n floats)
    h: finite-difference step size (default 1e-5)
    Returns: list of n estimated partial derivatives
    Time: O(n * cost(f)), Space: O(n)
    """
    # The central difference (f(x+h) - f(x-h)) / (2h) has O(h^2) truncation
    # error, versus O(h) for the one-sided forward difference.
    partials = []
    for i, xi in enumerate(x):
        bumped_up = list(x)
        bumped_down = list(x)
        bumped_up[i] = xi + h
        bumped_down[i] = xi - h
        partials.append((f(bumped_up) - f(bumped_down)) / (2 * h))
    return partials
# Test 1: f(x, y) = x^2 + 3xy + y^2, whose analytical gradient is
# [2x + 3y, 3x + 2y].
def f1(p):
    return p[0] * p[0] + 3 * p[0] * p[1] + p[1] * p[1]

grad = numerical_gradient(f1, [2.0, 3.0])
print(f"Numerical: [{grad[0]:.6f}, {grad[1]:.6f}]")
# Expected: [13.0, 12.0] (2*2 + 3*3 = 13, 3*2 + 2*3 = 12)

# Test 2: f(x) = sin(x[0]) * cos(x[1])
import math


def f2(p):
    return math.sin(p[0]) * math.cos(p[1])

grad = numerical_gradient(f2, [math.pi / 4, math.pi / 3])
print(f"Numerical: [{grad[0]:.6f}, {grad[1]:.6f}]")
# cos(pi/4)*cos(pi/3) = 0.3536, sin(pi/4)*(-sin(pi/3)) = -0.6124
Problem 2: Automatic Differentiation (Forward Mode)
Problem: Implement forward-mode automatic differentiation using dual numbers. A dual number a + b*epsilon (where epsilon^2 = 0) carries both a value and its derivative through computations exactly, with no approximation error.
class Dual:
    """
    Dual number for forward-mode automatic differentiation.

    Represents value + derivative * epsilon, where epsilon^2 = 0, so every
    arithmetic operation propagates an exact derivative alongside the value
    (no finite-difference approximation error).
    """

    def __init__(self, value, derivative=0.0):
        # value: the primal number.
        # derivative: its tangent; seed with 1.0 for the variable being
        # differentiated with respect to, 0.0 (the default) for constants.
        self.value = value
        self.derivative = derivative

    @staticmethod
    def _coerce(other):
        # Promote a plain number to a constant Dual (derivative 0).
        return other if isinstance(other, Dual) else Dual(other)

    def __add__(self, other):
        other = Dual._coerce(other)
        return Dual(self.value + other.value,
                    self.derivative + other.derivative)

    def __radd__(self, other):
        return self.__add__(other)

    def __mul__(self, other):
        other = Dual._coerce(other)
        # (a + b*eps) * (c + d*eps) = ac + (ad + bc)*eps  (eps^2 vanishes)
        return Dual(self.value * other.value,
                    self.value * other.derivative +
                    self.derivative * other.value)

    def __rmul__(self, other):
        return self.__mul__(other)

    def __sub__(self, other):
        other = Dual._coerce(other)
        return Dual(self.value - other.value,
                    self.derivative - other.derivative)

    def __rsub__(self, other):
        other = Dual._coerce(other)
        return Dual(other.value - self.value,
                    other.derivative - self.derivative)

    def __neg__(self):
        # BUG FIX: unary negation was missing, so expressions like
        # -dual_sin(x) raised TypeError.
        return Dual(-self.value, -self.derivative)

    def __truediv__(self, other):
        other = Dual._coerce(other)
        # Quotient rule: (b*c - a*d) / c^2 for (a + b*eps) / (c + d*eps).
        return Dual(self.value / other.value,
                    (self.derivative * other.value -
                     self.value * other.derivative) / (other.value ** 2))

    def __rtruediv__(self, other):
        # BUG FIX: right division (e.g. 2 / dual) was missing; delegate to
        # __truediv__ with the operands swapped.
        return Dual._coerce(other).__truediv__(self)

    def __pow__(self, n):
        # Power rule for a constant (non-Dual) exponent: d(x^n) = n*x^(n-1) dx.
        return Dual(self.value ** n,
                    n * self.value ** (n - 1) * self.derivative)

    def __repr__(self):
        return f"Dual({self.value}, {self.derivative})"
def dual_sin(x):
    """sin that propagates Dual derivatives: d sin(u) = cos(u) * du."""
    if not isinstance(x, Dual):
        return math.sin(x)
    return Dual(math.sin(x.value), math.cos(x.value) * x.derivative)
def dual_cos(x):
    """cos that propagates Dual derivatives: d cos(u) = -sin(u) * du."""
    if not isinstance(x, Dual):
        return math.cos(x)
    return Dual(math.cos(x.value), -math.sin(x.value) * x.derivative)
def dual_exp(x):
    """exp that propagates Dual derivatives: d e^u = e^u * du."""
    if not isinstance(x, Dual):
        return math.exp(x)
    ev = math.exp(x.value)  # compute once; it is both value and local slope
    return Dual(ev, ev * x.derivative)
# Test: f(x) = x^3 + 2x^2 - 5x + 1, so f'(x) = 3x^2 + 4x - 5.
def f(t):
    return t ** 3 + 2 * t ** 2 - 5 * t + 1

x = Dual(3.0, 1.0)  # value = 3, derivative seed = 1
result = f(x)
print(f"f(3) = {result.value}")  # 3^3 + 2*9 - 15 + 1 = 31
print(f"f'(3) = {result.derivative}")  # 27 + 12 - 5 = 34

# Test with trig: g(x) = sin(x) * x^2
def g(t):
    return dual_sin(t) * t ** 2

result = g(Dual(math.pi / 4, 1.0))
print(f"g(pi/4) = {result.value:.6f}")
print(f"g'(pi/4) = {result.derivative:.6f}")
Problem 3: Chain Rule Implementation
Problem: Implement the chain rule for computing derivatives of composite functions. This is the mathematical foundation of backpropagation in neural networks.
def chain_rule_demo():
    """
    Demonstrate the chain rule: d/dx f(g(x)) = f'(g(x)) * g'(x).

    Example neural network forward pass:
        z1 = w1*x + b1      (linear)
        a1 = relu(z1)       (activation)
        z2 = w2*a1 + b2     (linear)
        loss = (z2 - y)^2   (MSE loss)

    Backprop computes gradients via the chain rule, right to left, and the
    analytical result is verified against a numerical central difference.
    Prints the forward values, the gradient, and the verification; returns
    None. (A stray unused `import math` was removed.)
    """
    # Forward pass values
    x = 2.0
    w1, b1 = 0.5, 0.1
    w2, b2 = -0.3, 0.2
    y = 1.0  # target

    # Forward
    z1 = w1 * x + b1        # 1.1
    a1 = max(0, z1)         # 1.1 (ReLU)
    z2 = w2 * a1 + b2       # -0.13
    loss = (z2 - y) ** 2    # 1.2769
    print(f"Forward: z1={z1}, a1={a1}, z2={z2}, loss={loss:.4f}")

    # Backward (chain rule, right to left): each factor is a local derivative
    dloss_dz2 = 2 * (z2 - y)            # -2.26
    dz2_da1 = w2                        # -0.3
    da1_dz1 = 1.0 if z1 > 0 else 0.0    # 1.0 (ReLU derivative)
    dz1_dw1 = x                         # 2.0
    # Chain rule: dloss/dw1 = dloss/dz2 * dz2/da1 * da1/dz1 * dz1/dw1
    dloss_dw1 = dloss_dz2 * dz2_da1 * da1_dz1 * dz1_dw1
    print(f"dloss/dw1 = {dloss_dz2} * {dz2_da1} * {da1_dz1} * {dz1_dw1}")
    print(f"dloss/dw1 = {dloss_dw1:.4f}")

    # Verify with a numerical gradient over w1
    def compute_loss(w1_val):
        # Re-run the forward pass with a perturbed w1.
        z1 = w1_val * x + b1
        a1 = max(0, z1)
        z2 = w2 * a1 + b2
        return (z2 - y) ** 2

    h = 1e-7
    numerical = (compute_loss(w1 + h) - compute_loss(w1 - h)) / (2 * h)
    print(f"Numerical verification: {numerical:.4f}")
    print(f"Match: {abs(dloss_dw1 - numerical) < 1e-5}")

chain_rule_demo()
Problem 4: Jacobian Matrix
Problem: Compute the Jacobian matrix of a vector-valued function f: R^n -> R^m. The Jacobian J[i][j] = partial f_i / partial x_j. It generalizes the gradient to functions with multiple outputs.
def jacobian(f, x, h=1e-5):
    """
    Numerically estimate the Jacobian of f: R^n -> R^m at x.

    J[i][j] = partial f_i / partial x_j, via central differences. Each input
    coordinate is perturbed once in each direction, and all m output
    components are differenced from that single pair of evaluations.
    Time: O(n * m * cost(f)), Space: O(m * n)
    """
    out_dim = len(f(x))  # probe once to learn m
    in_dim = len(x)
    J = [[0.0] * in_dim for _ in range(out_dim)]
    for col in range(in_dim):
        forward = list(x)
        backward = list(x)
        forward[col] += h
        backward[col] -= h
        f_fwd = f(forward)
        f_bwd = f(backward)
        for row in range(out_dim):
            J[row][col] = (f_fwd[row] - f_bwd[row]) / (2 * h)
    return J
# Test: f(x, y) = [x^2 * y, x + y^3]
# Analytical Jacobian: [[2xy, x^2], [1, 3y^2]]
def f_vec(p):
    return [p[0] ** 2 * p[1], p[0] + p[1] ** 3]

J = jacobian(f_vec, [2.0, 3.0])
print("Jacobian:")
for row in J:
    print([f"{v:.4f}" for v in row])
# Expected: [[12.0, 4.0], [1.0, 27.0]]
# since 2*2*3 = 12, 2^2 = 4, 1, and 3*3^2 = 27
Problem 5: Hessian Matrix
Problem: Compute the Hessian matrix of a scalar function f: R^n -> R. The Hessian H[i][j] = partial^2 f / (partial x_i * partial x_j). It captures the curvature of the function and is used in second-order optimization methods.
def hessian(f, x, h=1e-4):
    """
    Compute the Hessian matrix of f: R^n -> R at point x.

    H[i][j] = d^2 f / (dx_i dx_j), estimated with central differences: the
    3-point second-derivative stencil on the diagonal and the 4-point cross
    stencil off the diagonal. Only the upper triangle is evaluated; the lower
    triangle is filled by symmetry.

    f: scalar-valued function of a list of n floats
    x: evaluation point
    h: step size (larger than for first derivatives because second
       differences amplify round-off noise)
    Returns: n x n list of lists
    Time: O(n^2 * cost(f)), Space: O(n^2)
    """
    n = len(x)
    H = [[0.0] * n for _ in range(n)]
    # Hoisted out of the loop: the original recomputed f(x) for every
    # diagonal entry even though x never changes.
    f0 = f(x)
    for i in range(n):
        for j in range(i, n):
            if i == j:
                # Diagonal: d^2f/dx_i^2 = (f(x+h) - 2f(x) + f(x-h)) / h^2
                x_plus = x[:]
                x_minus = x[:]
                x_plus[i] += h
                x_minus[i] -= h
                H[i][i] = (f(x_plus) - 2 * f0 + f(x_minus)) / (h ** 2)
            else:
                # Off-diagonal four-point formula:
                # d^2f/(dx_i dx_j) = (f(++) - f(+-) - f(-+) + f(--)) / (4h^2)
                xpp = x[:]; xpp[i] += h; xpp[j] += h
                xpm = x[:]; xpm[i] += h; xpm[j] -= h
                xmp = x[:]; xmp[i] -= h; xmp[j] += h
                xmm = x[:]; xmm[i] -= h; xmm[j] -= h
                H[i][j] = (f(xpp) - f(xpm) - f(xmp) + f(xmm)) / (4 * h ** 2)
                H[j][i] = H[i][j]  # Hessian is symmetric
    return H
# Test: f(x, y) = x^3 + 2*x*y^2 + y^4
# Analytical Hessian: [[6x, 4y], [4y, 4x + 12y^2]]
def f_hess(p):
    return p[0] ** 3 + 2 * p[0] * p[1] ** 2 + p[1] ** 4

H = hessian(f_hess, [1.0, 2.0])
print("Hessian:")
for row in H:
    print([f"{v:.4f}" for v in row])
# Expected: [[6.0, 8.0], [8.0, 52.0]]
# since 6*1 = 6, 4*2 = 8, and 4*1 + 12*4 = 52
Problem 6: Gradient Checking
Problem: Implement gradient checking to verify that an analytical gradient implementation is correct. This is essential for debugging backpropagation code. Compare the analytical gradient against the numerical gradient and compute the relative error.
def gradient_check(f, grad_f, x, h=1e-5, threshold=1e-5):
    """
    Verify an analytical gradient against a numerical one.

    f: scalar function R^n -> R
    grad_f: callable returning the analytical gradient at x
    x: point at which to compare
    h: finite-difference step passed to numerical_gradient
    threshold: maximum acceptable relative error per component
    Prints a per-component report and returns True iff every component
    passes.
    Time: O(n * cost(f)), Space: O(n)
    """
    analytical = grad_f(x)
    numerical = numerical_gradient(f, x, h)
    rows = []
    for idx in range(len(x)):
        a, n = analytical[idx], numerical[idx]
        # Relative error |a - n| / max(|a|, |n|, 1e-8); the 1e-8 floor
        # guards against division by zero when both gradients vanish.
        rel_error = abs(a - n) / max(abs(a), abs(n), 1e-8)
        rows.append({
            'index': idx,
            'analytical': a,
            'numerical': n,
            'rel_error': rel_error,
            'status': "PASS" if rel_error < threshold else "FAIL",
        })
    all_pass = all(r['status'] == "PASS" for r in rows)

    # Report
    print(f"{'Param':<6} {'Analytical':>12} {'Numerical':>12} {'Rel Error':>12} {'Status'}")
    print("-" * 56)
    for r in rows:
        print(f"x[{r['index']}] {r['analytical']:>12.6f} {r['numerical']:>12.6f} "
              f"{r['rel_error']:>12.2e} {r['status']}")
    max_err = max(r['rel_error'] for r in rows)
    print(f"\nMax relative error: {max_err:.2e}")
    print(f"Overall: {'PASS' if all_pass else 'FAIL'}")
    return all_pass
# Test with a correct gradient
def f_test(p):
    return p[0] ** 2 * p[1] + math.sin(p[0]) + p[1] ** 3

def grad_f_correct(p):
    # [df/dx0, df/dx1]
    return [2 * p[0] * p[1] + math.cos(p[0]),
            p[0] ** 2 + 3 * p[1] ** 2]

print("=== Correct gradient ===")
gradient_check(f_test, grad_f_correct, [2.0, 3.0])

# Test with a WRONG gradient (a common bug: the cos term was dropped)
def grad_f_wrong(p):
    return [2 * p[0] * p[1],  # BUG: missing cos(x[0]) term
            p[0] ** 2 + 3 * p[1] ** 2]

print("\n=== Buggy gradient ===")
gradient_check(f_test, grad_f_wrong, [2.0, 3.0])
Key Takeaways
- Numerical gradients (central differences) give O(h^2) accuracy and serve as ground truth for testing analytical gradients.
- Forward-mode automatic differentiation uses dual numbers to compute exact derivatives with no approximation error.
- The chain rule is the foundation of backpropagation: multiply local derivatives from output to input.
- The Jacobian generalizes the gradient to vector-valued functions. It is an m x n matrix of partial derivatives.
- The Hessian captures curvature (second derivatives) and is used in Newton's method and other second-order optimizers.
- Gradient checking is essential for debugging: compare analytical and numerical gradients using relative error.
Lilly Tech Systems