Hypothesis Testing in Code
Five problems that implement the most important hypothesis tests from scratch. These are the exact tests data scientists run daily at Google, Netflix, and Airbnb for A/B testing, and the exact problems quant firms ask to test your statistical reasoning. Every solution computes the test statistic and p-value without scipy.
Problem 1: Two-Sample t-Test (Welch's)
Problem: Implement Welch's two-sample t-test from scratch. Given two samples, compute the t-statistic, degrees of freedom (Welch-Satterthwaite), and approximate p-value. Do not use scipy.stats.
import math
# ---- CHALLENGE ----
# Exercise stub — the reference implementation appears in the SOLUTION section below.
def welch_ttest(a, b):
    """Two-sample t-test (Welch's, unequal variances).
    Return t_statistic, degrees_of_freedom, p_value.
    """
    pass
# ---- SOLUTION ----
def _erf_approx(x):
"""Abramowitz & Stegun approximation for error function."""
sign = 1 if x >= 0 else -1
x = abs(x)
a1, a2, a3, a4, a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
p = 0.3275911
t = 1.0 / (1.0 + p * x)
y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-x * x)
return sign * y
def _normal_cdf(x):
    """Standard normal CDF: Phi(x) = (1 + erf(x / sqrt(2))) / 2."""
    scaled = x / math.sqrt(2)
    return 0.5 * (1 + _erf_approx(scaled))
def _t_to_p_approx(t_stat, df):
    """Approximate two-tailed p-value for a t statistic with `df` degrees of freedom.

    For df >= 30 the t distribution is close enough to N(0, 1) that the
    normal tail is used directly.  For smaller df, the statistic is first
    mapped to an approximately standard-normal z via
        z = t * (1 - 1/(4*df)) / sqrt(1 + t^2/(2*df))
    (a Cornish-Fisher style correction).
    """
    magnitude = abs(t_stat)
    if df < 30:
        # Small-sample correction: shrink and rescale toward N(0, 1).
        magnitude = magnitude * (1 - 1 / (4 * df)) / math.sqrt(1 + magnitude ** 2 / (2 * df))
    # Two tails of the standard normal.
    return 2 * (1 - _normal_cdf(magnitude))
def welch_ttest(a, b):
    """Welch's two-sample t-test for unequal variances.

    Null hypothesis: mu_a = mu_b (the two population means are equal).

    Parameters:
        a, b: sequences of numeric observations, each of length >= 2.

    Returns:
        dict with "t_statistic", "df" (Welch-Satterthwaite degrees of
        freedom), and "p_value" (approximate, two-tailed).

    Raises:
        ValueError: if either sample has fewer than 2 observations.

    Steps:
        1. Compute sample means and variances
        2. t = (mean_a - mean_b) / sqrt(var_a/n_a + var_b/n_b)
        3. df via Welch-Satterthwaite equation
        4. Convert t-statistic to p-value
    """
    n_a, n_b = len(a), len(b)
    if n_a < 2 or n_b < 2:
        raise ValueError("Each sample needs at least 2 observations")
    # Sample means
    mean_a = sum(a) / n_a
    mean_b = sum(b) / n_b
    # Sample variances (Bessel's correction)
    var_a = sum((x - mean_a) ** 2 for x in a) / (n_a - 1)
    var_b = sum((x - mean_b) ** 2 for x in b) / (n_b - 1)
    # Standard error (squared) of the difference
    se_sq = var_a / n_a + var_b / n_b
    # FIX: both samples constant => se_sq == 0, which previously raised
    # ZeroDivisionError (in both the t-statistic and the df denominator).
    # Report a degenerate but well-defined result instead.
    if se_sq == 0:
        equal = mean_a == mean_b
        return {
            "t_statistic": 0.0 if equal else math.copysign(math.inf, mean_a - mean_b),
            "df": float(n_a + n_b - 2),
            "p_value": 1.0 if equal else 0.0,
        }
    se = math.sqrt(se_sq)
    # t-statistic
    t_stat = (mean_a - mean_b) / se
    # Welch-Satterthwaite degrees of freedom
    num = se_sq ** 2
    den = (var_a / n_a) ** 2 / (n_a - 1) + (var_b / n_b) ** 2 / (n_b - 1)
    df = num / den
    # Approximate p-value
    p_value = _t_to_p_approx(t_stat, df)
    return {"t_statistic": t_stat, "df": df, "p_value": p_value}
# ---- VERIFICATION ----
# Two clearly different groups (means ~5.0 vs ~6.0): expect a strongly
# negative t and a tiny p-value.
group_a = [5.1, 4.9, 5.0, 5.2, 4.8, 5.1, 5.0, 4.9, 5.3, 5.0]
group_b = [6.0, 5.8, 6.1, 5.9, 6.2, 6.0, 5.7, 6.1, 5.8, 6.0]
result = welch_ttest(group_a, group_b)
assert result["t_statistic"] < -5, f"t should be strongly negative: {result['t_statistic']}"
assert result["p_value"] < 0.001, f"p should be very small: {result['p_value']}"
assert result["df"] > 10, "df should be reasonable"
# Two identical groups: nearly identical samples, so the p-value should be
# far from significance.
same_a = [5.0, 5.1, 4.9, 5.0, 5.1]
same_b = [5.0, 4.9, 5.1, 5.0, 5.0]
result2 = welch_ttest(same_a, same_b)
assert result2["p_value"] > 0.5, "Same groups should have large p-value"
print("All tests passed!")
Problem 2: Chi-Squared Test of Independence
Problem: Implement the chi-squared test of independence for a contingency table. Compute the chi-squared statistic, degrees of freedom, and approximate p-value from scratch.
import math
# ---- CHALLENGE ----
# Exercise stub — the reference implementation appears in the SOLUTION section below.
def chi_squared_test(observed):
    """Chi-squared test of independence for a contingency table.
    observed: list of lists (2D matrix of counts)
    Return chi2_statistic, df, p_value.
    """
    pass
# ---- SOLUTION ----
def _chi2_to_p_approx(chi2, df):
    """Approximate upper-tail p-value for a chi-squared statistic.

    Uses the Wilson-Hilferty cube-root normal approximation:
        Z = ((chi2/df)^(1/3) - (1 - 2/(9*df))) / sqrt(2/(9*df))
    so that P(chi2 > x) ~ P(Z > z) = 1 - Phi(z).
    """
    # Degenerate inputs carry no evidence against the null.
    if df <= 0 or chi2 <= 0:
        return 1.0
    # Wilson-Hilferty transform.
    scale = 2.0 / (9 * df)
    z = ((chi2 / df) ** (1.0 / 3.0) - (1 - scale)) / math.sqrt(scale)
    # Survival function of the standard normal.
    tail = 1.0 - 0.5 * (1 + _erf_approx(z / math.sqrt(2)))
    return tail if tail > 0.0 else 0.0
def _erf_approx(x):
"""Abramowitz & Stegun error function approximation."""
sign = 1 if x >= 0 else -1
x = abs(x)
a1, a2, a3, a4, a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
p = 0.3275911
t = 1.0 / (1.0 + p * x)
y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-x * x)
return sign * y
def chi_squared_test(observed):
    """Chi-squared test of independence.

    Null hypothesis: the two categorical variables are independent.

    Parameters:
        observed: 2D list of non-negative counts (rows x columns).

    Returns:
        dict with "chi2_statistic", "df", "p_value" (Wilson-Hilferty
        approximation), and the "expected" frequency matrix.

    Raises:
        ValueError: if the table is empty, ragged, or has no observations.

    Steps:
        1. Compute row totals, column totals, grand total
        2. Expected frequencies: E[i][j] = row_total[i] * col_total[j] / grand_total
        3. Chi-squared = sum((O - E)^2 / E) over all cells
        4. df = (num_rows - 1) * (num_cols - 1)
        5. Convert to p-value
    """
    # FIX: validate the table shape up front.  Previously an empty table
    # raised IndexError on observed[0], and a ragged table was silently
    # mis-summed in the column totals.
    if not observed or not observed[0]:
        raise ValueError("Table must have at least one row and one column")
    rows = len(observed)
    cols = len(observed[0])
    if any(len(row) != cols for row in observed):
        raise ValueError("All rows must have the same number of columns")
    # Row and column totals
    row_totals = [sum(row) for row in observed]
    col_totals = [sum(observed[r][c] for r in range(rows)) for c in range(cols)]
    grand_total = sum(row_totals)
    if grand_total == 0:
        raise ValueError("Table has no observations")
    # Expected frequencies under independence
    expected = []
    for i in range(rows):
        row = []
        for j in range(cols):
            row.append(row_totals[i] * col_totals[j] / grand_total)
        expected.append(row)
    # Chi-squared statistic (cells with expected count 0 are skipped to
    # avoid division by zero; they contribute nothing under the null)
    chi2 = 0.0
    for i in range(rows):
        for j in range(cols):
            if expected[i][j] > 0:
                chi2 += (observed[i][j] - expected[i][j]) ** 2 / expected[i][j]
    # Degrees of freedom
    df = (rows - 1) * (cols - 1)
    # Approximate p-value
    p_value = _chi2_to_p_approx(chi2, df)
    return {"chi2_statistic": chi2, "df": df, "p_value": p_value, "expected": expected}
# ---- VERIFICATION ----
# Classic example: is gender independent of product preference?
#          Product A   Product B   Product C
# Male        20          30          50
# Female      30          40          30
observed = [
    [20, 30, 50],
    [30, 40, 30]
]
result = chi_squared_test(observed)
assert result["df"] == 2, "2x3 table -> df = (2-1)*(3-1) = 2"
assert result["chi2_statistic"] > 0
# Perfectly independent table -> chi2 should be 0
# (each row is proportional to the column totals, so O == E in every cell).
independent = [
    [10, 20],
    [10, 20]
]
result2 = chi_squared_test(independent)
assert abs(result2["chi2_statistic"]) < 1e-10, "Independent table -> chi2 = 0"
print("All tests passed!")
Problem 3: A/B Test Significance
Problem: Implement a complete A/B test analysis for conversion rates. Given control and treatment groups with binary outcomes (converted / not converted), compute the conversion rates, the z-test statistic, p-value, and confidence interval for the difference.
import math
# ---- CHALLENGE ----
# Exercise stub — the reference implementation appears in the SOLUTION section below.
def ab_test(control_conversions, control_total, treatment_conversions, treatment_total, alpha=0.05):
    """Complete A/B test analysis for conversion rates."""
    pass
# ---- SOLUTION ----
def _normal_cdf(x):
"""Standard normal CDF using error function approximation."""
sign = 1 if x >= 0 else -1
ax = abs(x)
a1, a2, a3, a4, a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
p = 0.3275911
t = 1.0 / (1.0 + p * ax)
erf = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-ax * ax)
return 0.5 * (1 + sign * erf)
def _normal_inv_approx(p):
"""Approximate inverse normal CDF (quantile function).
Rational approximation, accurate to ~4.5e-4.
"""
if p <= 0 or p >= 1:
raise ValueError("p must be in (0, 1)")
if p < 0.5:
return -_normal_inv_approx(1 - p)
# Rational approximation for p in [0.5, 1)
t = math.sqrt(-2 * math.log(1 - p))
c0, c1, c2 = 2.515517, 0.802853, 0.010328
d1, d2, d3 = 1.432788, 0.189269, 0.001308
return t - (c0 + c1 * t + c2 * t ** 2) / (1 + d1 * t + d2 * t ** 2 + d3 * t ** 3)
def ab_test(control_conv, control_total, treat_conv, treat_total, alpha=0.05):
    """Complete A/B test analysis for a conversion-rate difference.

    Uses a two-proportion z-test: the z statistic uses the pooled standard
    error (valid under the null p_c == p_t), while the confidence interval
    uses the unpooled standard error.

    Parameters:
        control_conv, control_total: conversions and sample size, control arm.
        treat_conv, treat_total: conversions and sample size, treatment arm.
        alpha: significance level (default 0.05 -> 95% CI).

    Returns:
        dict with conversion rates, difference, relative lift, z statistic,
        two-tailed p-value, significance flag, and CI bounds.

    Raises:
        ValueError: if either group size is not positive.
    """
    # FIX: previously zero group sizes raised ZeroDivisionError.
    if control_total <= 0 or treat_total <= 0:
        raise ValueError("Group sizes must be positive")
    # Conversion rates
    p_c = control_conv / control_total
    p_t = treat_conv / treat_total
    diff = p_t - p_c
    # FIX: report 0.0 (not inf) lift when there is literally no difference,
    # even if the control rate is 0.
    if p_c > 0:
        relative_lift = diff / p_c
    else:
        relative_lift = 0.0 if diff == 0 else float('inf')
    # Pooled proportion (under null: p_c = p_t)
    p_pool = (control_conv + treat_conv) / (control_total + treat_total)
    # Standard error under null
    se = math.sqrt(p_pool * (1 - p_pool) * (1.0 / control_total + 1.0 / treat_total))
    if se == 0:
        # Degenerate case: everyone (or no one) converted in both groups,
        # so both rates are identical and no evidence exists either way.
        # FIX: include "relative_lift" so the result schema matches the
        # normal return path (it was missing here before).
        return {"p_control": p_c, "p_treatment": p_t, "difference": diff,
                "relative_lift": relative_lift,
                "z_statistic": 0, "p_value": 1.0, "significant": False,
                "ci_lower": 0, "ci_upper": 0}
    # z-statistic
    z = diff / se
    # Two-tailed p-value
    p_value = 2 * (1 - _normal_cdf(abs(z)))
    # Confidence interval for the difference (using unpooled SE)
    se_diff = math.sqrt(p_c * (1 - p_c) / control_total + p_t * (1 - p_t) / treat_total)
    z_crit = _normal_inv_approx(1 - alpha / 2)
    ci_lower = diff - z_crit * se_diff
    ci_upper = diff + z_crit * se_diff
    return {
        "p_control": p_c,
        "p_treatment": p_t,
        "difference": diff,
        "relative_lift": relative_lift,
        "z_statistic": z,
        "p_value": p_value,
        "significant": p_value < alpha,
        "ci_lower": ci_lower,
        "ci_upper": ci_upper
    }
# ---- VERIFICATION ----
# Clear winner: treatment has higher conversion (3-point absolute lift on
# n=1000 per arm is comfortably detectable).
result = ab_test(
    control_conv=100, control_total=1000,  # 10% conversion
    treat_conv=130, treat_total=1000       # 13% conversion
)
assert abs(result["p_control"] - 0.10) < 1e-10
assert abs(result["p_treatment"] - 0.13) < 1e-10
assert result["z_statistic"] > 0, "Treatment is better"
assert result["p_value"] < 0.05, "Should be significant"
assert result["significant"] is True
# No difference: a 0.2-point lift is well within sampling noise at n=1000.
result2 = ab_test(
    control_conv=100, control_total=1000,
    treat_conv=102, treat_total=1000
)
assert result2["significant"] is False, "2/1000 difference is not significant"
print("All tests passed!")
Problem 4: Permutation Test
Problem: Implement a permutation test for comparing two groups. The permutation test makes no distributional assumptions — it directly computes the p-value by shuffling group labels and counting how often the shuffled difference exceeds the observed difference.
import random
# ---- CHALLENGE ----
# Exercise stub — the reference implementation appears in the SOLUTION section below.
def permutation_test(a, b, num_permutations=10000, stat_func=None):
    """Permutation test for difference in means.
    Return observed_stat, p_value.
    """
    pass
# ---- SOLUTION ----
def permutation_test(a, b, num_permutations=10000, stat_func=None):
    """Non-parametric permutation test for two independent groups.

    Parameters:
        a, b: sequences of observations.
        num_permutations: number of random label shuffles.
        stat_func: callable(group_a, group_b) -> float test statistic;
            defaults to the absolute difference in means.

    Returns:
        dict with "observed_statistic", "p_value", "num_permutations".

    Algorithm:
        1. Compute the observed test statistic
        2. Pool all data together
        3. For each permutation: shuffle the pool, split into two groups of
           the original sizes, recompute the statistic
        4. p-value = fraction of permutations at least as extreme as observed

    Why this works:
        - Under the null hypothesis (no difference), group labels are arbitrary
        - By shuffling labels, we build the null distribution empirically
        - No assumptions about normality, equal variance, etc.

    FIX: the p-value now uses the add-one correction
    (count + 1) / (num_permutations + 1).  A Monte Carlo permutation
    p-value of exactly 0 is impossible in principle (the identity
    permutation is always at least as extreme as itself) and should never
    be reported; the correction guarantees p in (0, 1].
    """
    if stat_func is None:
        stat_func = lambda x, y: abs(sum(x) / len(x) - sum(y) / len(y))
    # Observed test statistic
    observed_stat = stat_func(a, b)
    # Pool all data
    pooled = list(a) + list(b)
    n_a = len(a)
    # Count permutations whose statistic is at least as extreme as observed
    count_extreme = 0
    for _ in range(num_permutations):
        random.shuffle(pooled)
        perm_a = pooled[:n_a]
        perm_b = pooled[n_a:]
        if stat_func(perm_a, perm_b) >= observed_stat:
            count_extreme += 1
    # Add-one correction keeps the estimate strictly positive
    p_value = (count_extreme + 1) / (num_permutations + 1)
    return {"observed_statistic": observed_stat, "p_value": p_value,
            "num_permutations": num_permutations}
# ---- VERIFICATION ----
random.seed(42)  # deterministic shuffles so the assertions are repeatable
# Clearly different groups (means ~5.0 vs ~6.0)
group_a = [5.1, 4.9, 5.0, 5.2, 4.8, 5.1, 5.0, 4.9, 5.3, 5.0]
group_b = [6.0, 5.8, 6.1, 5.9, 6.2, 6.0, 5.7, 6.1, 5.8, 6.0]
result = permutation_test(group_a, group_b, num_permutations=5000)
assert result["p_value"] < 0.01, f"Should be significant: p={result['p_value']}"
# Same distribution: shuffled labels should look like the observed split
same_a = [5.0, 5.1, 4.9, 5.2, 4.8]
same_b = [5.0, 4.9, 5.1, 4.8, 5.2]
result2 = permutation_test(same_a, same_b, num_permutations=5000)
assert result2["p_value"] > 0.3, f"Should not be significant: p={result2['p_value']}"
print("All tests passed!")
Problem 5: Bootstrap Confidence Intervals
Problem: Implement the bootstrap method for computing confidence intervals of any statistic (mean, median, correlation, etc.). Implement both the percentile method and the BCa (bias-corrected and accelerated) method.
import math
import random
# ---- CHALLENGE ----
# Exercise stub — the reference implementation appears in the SOLUTION section below.
def bootstrap_ci(data, stat_func, num_bootstrap=10000, confidence=0.95):
    """Compute bootstrap confidence interval using percentile method."""
    pass
# Exercise stub — the reference implementation appears in the SOLUTION section below.
def bootstrap_bca_ci(data, stat_func, num_bootstrap=10000, confidence=0.95):
    """Compute BCa bootstrap confidence interval."""
    pass
# ---- SOLUTION ----
def bootstrap_ci(data, stat_func, num_bootstrap=10000, confidence=0.95):
    """Percentile-method bootstrap confidence interval.

    Resamples `data` with replacement `num_bootstrap` times, evaluates
    `stat_func` on each resample, and reports the empirical
    [alpha/2, 1 - alpha/2] percentiles of those statistics.  This works
    because the bootstrap distribution approximates the sampling
    distribution of the statistic.
    """
    n = len(data)
    alpha = 1 - confidence
    stats = []
    for _ in range(num_bootstrap):
        # Draw n values with replacement.
        resample = [data[random.randint(0, n - 1)] for _ in range(n)]
        stats.append(stat_func(resample))
    stats.sort()
    # Percentile positions, clamped into the valid index range.
    lo = int(math.floor(alpha / 2 * num_bootstrap))
    hi = int(math.ceil((1 - alpha / 2) * num_bootstrap)) - 1
    lo = min(max(lo, 0), num_bootstrap - 1)
    hi = min(max(hi, 0), num_bootstrap - 1)
    return {
        "point_estimate": stat_func(data),
        "ci_lower": stats[lo],
        "ci_upper": stats[hi],
        "confidence": confidence,
        "num_bootstrap": num_bootstrap,
        "se": _std_dev(stats),
    }
def _std_dev(data):
"""Helper: sample standard deviation."""
n = len(data)
if n < 2:
return 0.0
mean = sum(data) / n
return math.sqrt(sum((x - mean) ** 2 for x in data) / (n - 1))
def _normal_cdf_simple(x):
"""Standard normal CDF."""
sign = 1 if x >= 0 else -1
ax = abs(x)
a1, a2, a3, a4, a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
p = 0.3275911
t = 1.0 / (1.0 + p * ax)
erf = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-ax * ax)
return 0.5 * (1 + sign * erf)
def _normal_inv(p):
"""Approximate inverse normal CDF."""
if p <= 0 or p >= 1:
raise ValueError("p must be in (0, 1)")
if p < 0.5:
return -_normal_inv(1 - p)
t = math.sqrt(-2 * math.log(1 - p))
c0, c1, c2 = 2.515517, 0.802853, 0.010328
d1, d2, d3 = 1.432788, 0.189269, 0.001308
return t - (c0 + c1 * t + c2 * t ** 2) / (1 + d1 * t + d2 * t ** 2 + d3 * t ** 3)
def bootstrap_bca_ci(data, stat_func, num_bootstrap=10000, confidence=0.95):
    """BCa (bias-corrected and accelerated) bootstrap CI.

    BCa corrects for:
    1. Bias: the bootstrap distribution may not be centered on the true parameter
    2. Skewness: the bootstrap distribution may be asymmetric

    Steps:
    1. Compute bootstrap distribution (same as percentile method)
    2. Bias correction: z0 = Phi^(-1)(proportion of bootstrap stats < original)
    3. Acceleration: a = sum(d_i^3) / (6 * (sum(d_i^2))^(3/2))
       where d_i = (mean of jackknife values) - (jackknife value leaving out i)
    4. Adjust percentile endpoints using z0 and a

    NOTE(review): `data` must be a list (the jackknife below slices and
    concatenates it); other sequence types would need list(data) first.
    """
    n = len(data)
    alpha = 1 - confidence
    original_stat = stat_func(data)
    # Step 1: Bootstrap distribution (n draws with replacement per iteration)
    bootstrap_stats = []
    for _ in range(num_bootstrap):
        sample = [data[random.randint(0, n - 1)] for _ in range(n)]
        bootstrap_stats.append(stat_func(sample))
    bootstrap_stats.sort()
    # Step 2: Bias correction (z0) from the fraction of bootstrap statistics
    # strictly below the original estimate
    count_below = sum(1 for s in bootstrap_stats if s < original_stat)
    prop_below = count_below / num_bootstrap
    # Clamp so the normal quantile stays finite even when all (or no)
    # bootstrap statistics fall below the original estimate
    prop_below = max(0.001, min(0.999, prop_below))  # clamp to avoid inf
    z0 = _normal_inv(prop_below)
    # Step 3: Acceleration (a) via leave-one-out jackknife
    jackknife_stats = []
    for i in range(n):
        jack_sample = data[:i] + data[i+1:]
        jackknife_stats.append(stat_func(jack_sample))
    jack_mean = sum(jackknife_stats) / n
    d = [jack_mean - js for js in jackknife_stats]
    d3_sum = sum(di ** 3 for di in d)
    d2_sum = sum(di ** 2 for di in d)
    # Zero jackknife spread (constant statistic) -> no acceleration
    a = d3_sum / (6 * d2_sum ** 1.5) if d2_sum > 0 else 0.0
    # Step 4: Adjusted percentiles: shift the nominal normal quantiles by z0,
    # rescale by the acceleration, then map back through the normal CDF to
    # get percentile positions in the sorted bootstrap distribution
    z_alpha_lo = _normal_inv(alpha / 2)
    z_alpha_hi = _normal_inv(1 - alpha / 2)
    # BCa adjusted quantiles
    adj_lo = _normal_cdf_simple(z0 + (z0 + z_alpha_lo) / (1 - a * (z0 + z_alpha_lo)))
    adj_hi = _normal_cdf_simple(z0 + (z0 + z_alpha_hi) / (1 - a * (z0 + z_alpha_hi)))
    # Clamp indices into [0, num_bootstrap - 1]
    lo_idx = max(0, min(int(adj_lo * num_bootstrap), num_bootstrap - 1))
    hi_idx = max(0, min(int(adj_hi * num_bootstrap), num_bootstrap - 1))
    return {
        "point_estimate": original_stat,
        "ci_lower": bootstrap_stats[lo_idx],
        "ci_upper": bootstrap_stats[hi_idx],
        "confidence": confidence,
        "bias_correction_z0": z0,
        "acceleration_a": a
    }
# ---- VERIFICATION ----
random.seed(42)  # make the bootstrap resamples reproducible
# Bootstrap CI for mean
data = [2.3, 4.1, 3.7, 5.2, 4.8, 3.5, 6.1, 4.4, 3.9, 5.0]
mean_func = lambda d: sum(d) / len(d)
result = bootstrap_ci(data, mean_func, num_bootstrap=5000)
true_mean = mean_func(data)
assert result["ci_lower"] < true_mean < result["ci_upper"]
assert result["ci_lower"] > 2.0  # reasonable bounds
assert result["ci_upper"] < 7.0
# Bootstrap CI for median (works for ANY statistic!)
median_func = lambda d: sorted(d)[len(d) // 2]
result_med = bootstrap_ci(data, median_func, num_bootstrap=5000)
assert result_med["ci_lower"] < result_med["ci_upper"]
# BCa bootstrap: should also cover the sample mean
result_bca = bootstrap_bca_ci(data, mean_func, num_bootstrap=5000)
assert result_bca["ci_lower"] < true_mean < result_bca["ci_upper"]
print("All tests passed!")
Key Takeaways
- Welch's t-test is preferred over Student's t-test because it does not assume equal variances
- Chi-squared tests require an expected count of at least 5 in each cell (the common rule of thumb) for the chi-squared approximation to be valid
- A/B test analysis is the most common data science interview problem — know the z-test for proportions
- Permutation tests make no distributional assumptions and work with any test statistic
- Bootstrap is the most versatile method: it works for any statistic (mean, median, correlation, etc.)
- BCa bootstrap corrects for bias and skewness in the bootstrap distribution
Lilly Tech Systems