Descriptive Statistics
Six coding problems that implement core descriptive statistics from scratch. No numpy.mean, no statistics.median — you build every computation from the raw formula. These are the exact questions quant firms use to filter candidates who understand statistics versus those who only know APIs.
Problem 1: Mean, Median, Mode from Scratch
Problem: Implement functions to compute the arithmetic mean, median, and mode of a list of numbers. Do not use any library functions. Handle edge cases: empty list, even-length list for median, multiple modes.
import math
# ---- CHALLENGE ----
def mean(data):
    """Compute arithmetic mean without any library calls."""
    pass  # implement: accumulate total and count, return total / count


def median(data):
    """Compute median. For even-length lists, return average of two middle values."""
    pass  # implement: sort, then take the middle element (or average the two middles)


def mode(data):
    """Return the most frequent value. If tie, return the smallest."""
    pass  # implement: hand-rolled frequency map, then max count with smallest-value tie-break
# ---- SOLUTION ----
def mean(data):
    """Arithmetic mean built from first principles.

    Implements mu = (1/n) * sum(x_i) with a single manual pass that
    accumulates both the running total and the element count (no sum()
    or len() shortcuts, per the problem statement).

    Raises:
        ValueError: if data is empty (the mean is undefined).
    """
    if not data:
        raise ValueError("mean requires at least one data point")
    running_total = 0
    n = 0
    for value in data:
        running_total += value
        n += 1
    return running_total / n
def median(data):
    """Median: the value that splits the sorted data in half.

    Odd-length input yields the single central element; even-length input
    yields the average of the two central elements.

    Raises:
        ValueError: if data is empty.
    """
    if not data:
        raise ValueError("median requires at least one data point")
    ordered = sorted(data)
    n = len(ordered)
    middle = n // 2
    if n % 2:
        # Odd count: exact middle element.
        return ordered[middle]
    # Even count: average the two values straddling the center.
    return (ordered[middle - 1] + ordered[middle]) / 2
def mode(data):
    """Most frequent value in data; ties resolve to the smallest value.

    Builds a hand-rolled frequency table (no collections.Counter), then
    scans it twice: first for the highest count, then for the smallest
    value that carries that count.

    Raises:
        ValueError: if data is empty.
    """
    if not data:
        raise ValueError("mode requires at least one data point")
    counts = {}
    for value in data:
        counts[value] = counts.get(value, 0) + 1
    # Pass 1: the highest frequency present anywhere.
    best_count = 0
    for c in counts.values():
        if c > best_count:
            best_count = c
    # Pass 2: the smallest value attaining that frequency (tie-break rule).
    winner = None
    for value, c in counts.items():
        if c == best_count and (winner is None or value < winner):
            winner = value
    return winner
# ---- VERIFICATION ----
for sample, expected in [([1, 2, 3, 4, 5], 3.0), ([10], 10.0)]:
    assert mean(sample) == expected
for sample, expected in [([3, 1, 2], 2), ([4, 1, 3, 2], 2.5), ([1], 1)]:
    assert median(sample) == expected
assert mode([1, 2, 2, 3, 3, 3]) == 3
assert mode([1, 1, 2, 2]) == 1  # tie -> smallest
print("All tests passed!")
Problem 2: Variance and Standard Deviation
Problem: Implement population variance, sample variance, population standard deviation, and sample standard deviation from scratch. Explain why sample variance divides by n-1 (Bessel's correction).
import math
# ---- CHALLENGE ----
def variance(data, population=True):
    """Compute variance. If population=False, use Bessel's correction (n-1)."""
    pass  # implement: mean first, then average of squared deviations


def std_dev(data, population=True):
    """Compute standard deviation."""
    pass  # implement: square root of the corresponding variance
# ---- SOLUTION ----
def variance(data, population=True):
    """Average squared deviation from the mean (two-pass algorithm).

    Population form: sigma^2 = (1/n)     * sum((x_i - mu)^2)
    Sample form:     s^2     = (1/(n-1)) * sum((x_i - x_bar)^2)

    Why n-1 (Bessel's correction)? The sample mean is fitted to the very
    data being measured, so deviations from x_bar are systematically
    smaller than deviations from the true population mean. Dividing by n
    would underestimate sigma^2; dividing by n-1 removes that bias, so
    E[s^2] = sigma^2.

    Raises:
        ValueError: on empty input, or on fewer than two points when
            population=False.
    """
    if not data:
        raise ValueError("variance requires at least one data point")
    n = len(data)
    if not population and n < 2:
        raise ValueError("sample variance requires at least two data points")
    # Pass 1: the mean. Pass 2: squared deviations (stable two-pass form).
    center = sum(data) / n
    squared_devs = sum((x - center) ** 2 for x in data)
    denominator = n if population else n - 1
    return squared_devs / denominator
def std_dev(data, population=True):
    """Standard deviation: the square root of the matching variance."""
    var = variance(data, population)
    return math.sqrt(var)
# ---- VERIFICATION ----
sample = [2, 4, 4, 4, 5, 5, 7, 9]
pop_var = variance(sample, population=True)
sam_var = variance(sample, population=False)
assert abs(pop_var - 4.0) < 1e-10, f"Expected 4.0, got {pop_var}"
assert abs(sam_var - 4.571428571) < 1e-6, f"Expected ~4.571, got {sam_var}"
assert abs(std_dev(sample, True) - 2.0) < 1e-10
print("All tests passed!")
Problem 3: Percentiles from Scratch
Problem: Implement a function that computes the k-th percentile of a dataset using linear interpolation (the method used by numpy's default). Also implement the quartiles (Q1, Q2, Q3) and the interquartile range (IQR).
import math
# ---- CHALLENGE ----
def percentile(data, k):
    """Compute the k-th percentile (0-100) using linear interpolation."""
    pass  # implement: sort, fractional rank (k/100)*(n-1), interpolate


def quartiles(data):
    """Return (Q1, Q2, Q3) and IQR."""
    pass  # implement: percentiles at 25/50/75, IQR = Q3 - Q1
# ---- SOLUTION ----
def percentile(data, k):
    """k-th percentile of data (k in [0, 100]) via linear interpolation.

    Matches numpy's default ('linear') method:
      1. sort the data
      2. fractional rank r = (k/100) * (n-1)
      3. interpolate between the elements at floor(r) and ceil(r)

    Raises:
        ValueError: on empty data or k outside [0, 100].
    """
    if not data:
        raise ValueError("percentile requires at least one data point")
    if not (0 <= k <= 100):
        raise ValueError("k must be between 0 and 100")
    ordered = sorted(data)
    rank = (k / 100) * (len(ordered) - 1)
    below = int(math.floor(rank))
    above = int(math.ceil(rank))
    if below == above:
        # Rank landed exactly on an element -- no interpolation needed.
        return ordered[below]
    weight = rank - below
    return ordered[below] + weight * (ordered[above] - ordered[below])
def quartiles(data):
    """Return a dict with Q1 (25th), Q2 (50th), Q3 (75th), and the IQR."""
    result = {label: percentile(data, pct)
              for label, pct in (("Q1", 25), ("Q2", 50), ("Q3", 75))}
    result["IQR"] = result["Q3"] - result["Q1"]
    return result
# ---- VERIFICATION ----
data = [7, 15, 36, 39, 40, 41]
for pct, want in [(50, 37.5), (25, 20.25), (0, 7), (100, 41)]:
    assert abs(percentile(data, pct) - want) < 1e-10
q = quartiles(data)
assert abs(q["Q2"] - 37.5) < 1e-10
print("All tests passed!")
Problem 4: Histogram from Scratch
Problem: Implement a histogram function that bins continuous data into equal-width bins. Return the bin edges and counts. Also implement a function to choose the number of bins using Sturges' rule and the Freedman-Diaconis rule.
import math
# ---- CHALLENGE ----
def histogram(data, num_bins=None):
    """Bin data into equal-width bins. Return (counts, bin_edges)."""
    pass  # implement: equal-width edges over [min, max], clamp bin indices


def sturges_bins(n):
    """Sturges' rule for number of bins."""
    pass  # implement: ceil(log2(n) + 1)


def freedman_diaconis_bins(data):
    """Freedman-Diaconis rule for number of bins."""
    pass  # implement: bin width 2 * IQR * n^(-1/3), then range / width
# ---- SOLUTION ----
def sturges_bins(n):
    """Sturges' rule for a bin count: k = ceil(log2(n) + 1).

    Simple, widely used, and a good default for roughly normal data.
    Returns 1 for non-positive n (log2 would be undefined).
    """
    if n <= 0:
        return 1
    return int(math.ceil(1 + math.log2(n)))
def freedman_diaconis_bins(data):
    """Freedman-Diaconis rule for a bin count.

    Uses bin_width = 2 * IQR * n^(-1/3): basing the width on the
    interquartile range makes it far more robust to skew and outliers
    than Sturges' rule. Falls back to Sturges' rule when the IQR or the
    data range is degenerate (zero), and to max(1, n) when there are too
    few points to estimate quartiles.
    """
    n = len(data)
    if n < 4:
        return max(1, n)
    ordered = sorted(data)

    def quartile_at(fraction):
        # Linear interpolation at rank fraction * (n - 1), numpy-style.
        rank = fraction * (n - 1)
        lo = int(math.floor(rank))
        hi = int(math.ceil(rank))
        if lo == hi:
            return ordered[lo]
        return ordered[lo] + (rank - lo) * (ordered[hi] - ordered[lo])

    iqr = quartile_at(0.75) - quartile_at(0.25)
    spread = ordered[-1] - ordered[0]
    if iqr == 0 or spread == 0:
        return sturges_bins(n)
    width = 2 * iqr * (n ** (-1.0 / 3.0))
    return max(1, int(math.ceil(spread / width)))
def histogram(data, num_bins=None):
    """Bin continuous data into equal-width bins.

    Args:
        data: list of numbers; may be empty.
        num_bins: positive integer, or None to choose via Sturges' rule.

    Returns:
        (counts, bin_edges): counts is a list of per-bin integers and
        bin_edges has len(counts) + 1 entries. Bins are half-open except
        the last, which is closed on the right so data_max is counted.

    Raises:
        ValueError: if num_bins is given but is less than 1 (previously
            num_bins=0 crashed with ZeroDivisionError and negative values
            produced nonsense).
    """
    if not data:
        return [], []
    if num_bins is None:
        num_bins = sturges_bins(len(data))
    elif num_bins < 1:
        # Guard: bin_width below divides by num_bins.
        raise ValueError("num_bins must be a positive integer")
    data_min = min(data)
    data_max = max(data)
    # Degenerate case: all values identical -> a single bin of unit width.
    if data_min == data_max:
        return [len(data)], [data_min, data_max + 1]
    bin_width = (data_max - data_min) / num_bins
    bin_edges = [data_min + i * bin_width for i in range(num_bins + 1)]
    bin_edges[-1] = data_max  # counter float drift: last edge exactly max
    counts = [0] * num_bins
    for x in data:
        if x == data_max:
            idx = num_bins - 1  # right edge belongs to the last bin
        else:
            idx = int((x - data_min) / bin_width)
            idx = max(0, min(idx, num_bins - 1))  # clamp float round-off
        counts[idx] += 1
    return counts, bin_edges
# ---- VERIFICATION ----
sample = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
counts, edges = histogram(sample, num_bins=5)
assert sum(counts) == 10, "All data points must be counted"
assert len(edges) == 6, "5 bins -> 6 edges"
assert counts == [2, 2, 2, 2, 2], f"Expected uniform, got {counts}"
assert sturges_bins(100) == 8  # ceil(log2(100) + 1) = ceil(7.64) = 8
print("All tests passed!")
Problem 5: Pearson Correlation from Scratch
Problem: Implement the Pearson correlation coefficient between two variables. Also implement a function that computes the full correlation matrix for a dataset with multiple variables. Handle edge cases where standard deviation is zero.
import math
# ---- CHALLENGE ----
def pearson_correlation(x, y):
    """Compute Pearson correlation coefficient between two lists."""
    pass  # implement: cov(X, Y) / (std(X) * std(Y)), handle zero variance


def correlation_matrix(data):
    """Compute correlation matrix for data (list of lists, each inner list is a variable)."""
    pass  # implement: symmetric p x p matrix with 1.0 on the diagonal
# ---- SOLUTION ----
def pearson_correlation(x, y):
    """Pearson correlation coefficient between two equal-length lists.

    Definition: r = cov(X, Y) / (std(X) * std(Y)).

    Implemented as a two-pass centered computation:
        r = sum((xi - mx) * (yi - my)) /
            sqrt(sum((xi - mx)^2) * sum((yi - my)^2))
    Centering before multiplying is numerically stable. The expanded
    single-pass form [n*sum(xy) - sum(x)*sum(y)] / sqrt(...) -- which a
    previous revision mislabeled "numerically stable" -- suffers
    catastrophic cancellation when values are large relative to their
    spread (the same trap as the naive E[X^2] - E[X]^2 variance formula).

    Returns:
        The correlation in [-1, 1], or 0.0 when either variable has zero
        variance (correlation is undefined there; 0.0 is this module's
        convention).

    Raises:
        ValueError: if the lengths differ or fewer than two points given.
    """
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")
    n = len(x)
    if n < 2:
        raise ValueError("correlation requires at least two data points")
    mean_x = sum(x) / n
    mean_y = sum(y) / n
    cov = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
    var_x = sum((xi - mean_x) ** 2 for xi in x)
    var_y = sum((yi - mean_y) ** 2 for yi in y)
    denominator = math.sqrt(var_x * var_y)
    if denominator == 0:
        return 0.0  # undefined: one or both variables have zero variance
    return cov / denominator
def correlation_matrix(variables):
    """Pairwise Pearson correlation matrix.

    Args:
        variables: list of equal-length lists; each inner list holds one
            variable's observations, e.g. [[x1, x2, ...], [y1, y2, ...]].

    Returns:
        A p x p list-of-lists where entry [i][j] is the correlation of
        variable i with variable j: 1.0 on the diagonal, symmetric off it.
    """
    p = len(variables)
    # Seed with the identity: each variable correlates perfectly with itself.
    result = [[1.0 if row == col else 0.0 for col in range(p)] for row in range(p)]
    for row in range(p):
        for col in range(row + 1, p):
            value = pearson_correlation(variables[row], variables[col])
            result[row][col] = value
            result[col][row] = value  # symmetry: fill both triangles at once
    return result
# ---- VERIFICATION ----
x = [1, 2, 3, 4, 5]
y = [2, 4, 6, 8, 10]  # perfectly correlated
y_neg = [10, 8, 6, 4, 2]  # perfectly negatively correlated
y_zero = [5, 5, 5, 5, 5]  # zero variance
assert abs(pearson_correlation(x, y) - 1.0) < 1e-10
assert abs(pearson_correlation(x, y_neg) + 1.0) < 1e-10
assert pearson_correlation(x, y_zero) == 0.0
z = [1, 3, 2, 5, 4]
corr = correlation_matrix([x, y, z])
for i, j, want in [(0, 1, 1.0), (1, 0, 1.0), (0, 0, 1.0)]:
    assert abs(corr[i][j] - want) < 1e-10
print("All tests passed!")
Problem 6: Running (Online) Statistics
Problem: Implement Welford's online algorithm for computing running mean and variance in a single pass. The algorithm must be numerically stable and use O(1) space. This is the #1 most-asked statistics coding question at quant firms.
import math
# ---- CHALLENGE ----
class RunningStats:
    """Compute mean, variance, std in a single pass using O(1) space."""

    def __init__(self):
        pass  # implement: initialize count, running mean, and M2 accumulator

    def push(self, x):
        """Add a new data point."""
        pass  # implement: Welford update (delta, mean, delta2, M2)

    def mean(self):
        pass  # implement: return running mean (error if no data yet)

    def variance(self, population=True):
        pass  # implement: M2 / n (population) or M2 / (n - 1) (sample)

    def std(self, population=True):
        pass  # implement: square root of the running variance
# ---- SOLUTION ----
class RunningStats:
    """Single-pass (online) mean and variance via Welford's algorithm.

    Holds just three scalars -- the count, the running mean, and M2 (the
    accumulated sum of squared deviations) -- so memory stays O(1) no
    matter how many points are pushed.

    Recurrence for each incoming value x:
        n     += 1
        d1     = x - mean          # deviation from the OLD mean
        mean  += d1 / n
        d2     = x - mean          # deviation from the NEW mean
        M2    += d1 * d2

    The product d1 * d2 is exactly the increment of the sum of squared
    deviations, and no step subtracts two large nearly-equal quantities,
    so Welford's method avoids the catastrophic cancellation that breaks
    the naive E[X^2] - (E[X])^2 formula.
    """

    def __init__(self):
        self.n = 0
        self._mean = 0.0
        self._M2 = 0.0  # accumulated sum of squared deviations

    def push(self, x):
        """Fold one new observation into the running statistics. O(1)."""
        self.n += 1
        d1 = x - self._mean       # against the old mean
        self._mean += d1 / self.n
        d2 = x - self._mean       # against the freshly-updated mean
        self._M2 += d1 * d2

    def mean(self):
        """Running mean of everything pushed so far."""
        if self.n == 0:
            raise ValueError("no data points")
        return self._mean

    def variance(self, population=True):
        """Running variance: M2/n (population) or M2/(n-1) (sample)."""
        if self.n == 0:
            raise ValueError("no data points")
        if not population and self.n < 2:
            raise ValueError("sample variance requires n >= 2")
        divisor = self.n if population else self.n - 1
        return self._M2 / divisor

    def std(self, population=True):
        """Running standard deviation (sqrt of the running variance)."""
        return math.sqrt(self.variance(population))

    def count(self):
        """Number of observations pushed so far."""
        return self.n
# ---- VERIFICATION ----
# Compare Welford's against two-pass computation
data = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
stats = RunningStats()
for value in data:
    stats.push(value)
ref_mean = sum(data) / len(data)
ref_var = sum((v - ref_mean) ** 2 for v in data) / len(data)
assert abs(stats.mean() - ref_mean) < 1e-10, f"mean: {stats.mean()} != {ref_mean}"
assert abs(stats.variance(population=True) - ref_var) < 1e-10
assert stats.count() == 10
# Numerical-stability check: large values with tiny relative deviations is
# exactly where the naive E[X^2] - E[X]^2 formula collapses.
big_data = [1e9 + i for i in range(100)]
stats2 = RunningStats()
for value in big_data:
    stats2.push(value)
ref_mean2 = sum(big_data) / len(big_data)
ref_var2 = sum((v - ref_mean2) ** 2 for v in big_data) / len(big_data)
assert abs(stats2.mean() - ref_mean2) < 1e-4
assert abs(stats2.variance() - ref_var2) < 1e-2  # Welford stays stable
print("All tests passed!")
The naive formula E[X^2] - E[X]^2 is a famous trap that causes catastrophic cancellation with large values.

Key Takeaways
- Always implement from the mathematical formula — write it in a comment first
- Know Bessel's correction: sample variance divides by n-1, not n
- Welford's algorithm is the gold standard for online statistics — memorize it
- The naive variance formula E[X^2] - E[X]^2 is numerically unstable for large values
- Percentile interpolation and histogram binning are common follow-up questions
- Always handle edge cases: empty input, n=1, zero variance
Lilly Tech Systems