Intermediate

Hypothesis Testing in Code

Five problems that implement the most important hypothesis tests from scratch. These are the exact tests data scientists run daily at Google, Netflix, and Airbnb for A/B testing, and the exact problems quant firms ask to test your statistical reasoning. Every solution computes the test statistic and p-value without scipy.

Problem 1: Two-Sample t-Test (Welch's)

Problem: Implement Welch's two-sample t-test from scratch. Given two samples, compute the t-statistic, degrees of freedom (Welch-Satterthwaite), and approximate p-value. Do not use scipy.stats.

import math

# ---- CHALLENGE ----
def welch_ttest(a, b):
    """Two-sample t-test (Welch's, unequal variances).

    Given two numeric samples `a` and `b`, compute the t-statistic, the
    Welch-Satterthwaite degrees of freedom, and an approximate two-tailed
    p-value — all without scipy.

    Return t_statistic, degrees_of_freedom, p_value.
    """
    pass

# ---- SOLUTION ----
def _erf_approx(x):
    """Abramowitz & Stegun approximation for error function."""
    sign = 1 if x >= 0 else -1
    x = abs(x)
    a1, a2, a3, a4, a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
    p = 0.3275911
    t = 1.0 / (1.0 + p * x)
    y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-x * x)
    return sign * y

def _normal_cdf(x):
    """P(Z <= x) for Z ~ N(0, 1), built on the erf approximation."""
    erf_value = _erf_approx(x / math.sqrt(2))
    return 0.5 * (1 + erf_value)

def _t_to_p_approx(t_stat, df):
    """Approximate two-tailed p-value for the t-distribution.

    Maps the t-statistic to an approximately standard-normal deviate via
        z = t * (1 - 1/(4*df)) / sqrt(1 + t^2/(2*df))
    and returns 2 * (1 - Phi(z)).

    Parameters:
        t_stat: observed t-statistic (any sign).
        df: degrees of freedom (> 0; may be fractional, as from Welch).

    Returns:
        Two-tailed p-value, clamped to [0, 1].

    Fix vs. the previous version: the z mapping is applied for ALL df
    instead of only df < 30.  Using z = |t| directly for df >= 30 is less
    accurate in the tails (e.g. t = 2, df = 30: true two-sided p ~ 0.055
    vs 0.046 from the plain normal) and created a discontinuity at the
    df = 30 boundary.  The correction converges to z = t as df -> inf,
    so large df loses nothing.
    """
    t_abs = abs(t_stat)
    # Normal approximation to the t quantile; exact in the df -> inf limit.
    z = t_abs * (1 - 1 / (4 * df)) / math.sqrt(1 + t_abs ** 2 / (2 * df))
    p = 2 * (1 - _normal_cdf(z))
    # Guard against tiny negative residue from the erf approximation.
    return min(1.0, max(0.0, p))

def welch_ttest(a, b):
    """Welch's two-sample t-test for unequal variances.

    Null hypothesis: mu_a = mu_b (the two population means are equal).

    Parameters:
        a, b: numeric samples with at least 2 observations each.

    Returns:
        dict with "t_statistic", "df" (Welch-Satterthwaite degrees of
        freedom) and "p_value" (approximate, two-tailed).

    Raises:
        ValueError: if either sample has fewer than 2 observations.

    Fix vs. the previous version: two constant samples (zero variance in
    both groups) previously raised ZeroDivisionError when dividing by the
    standard error; that degenerate case is now handled explicitly.
    """
    n_a, n_b = len(a), len(b)
    if n_a < 2 or n_b < 2:
        raise ValueError("Each sample needs at least 2 observations")

    # Sample means.
    mean_a = sum(a) / n_a
    mean_b = sum(b) / n_b

    # Sample variances with Bessel's correction (n - 1 denominator).
    var_a = sum((x - mean_a) ** 2 for x in a) / (n_a - 1)
    var_b = sum((x - mean_b) ** 2 for x in b) / (n_b - 1)

    # Squared standard error of the mean difference.
    se_sq = var_a / n_a + var_b / n_b

    if se_sq == 0:
        # Both samples are constant: no sampling variability at all.
        # Equal means -> no evidence against H0; unequal means -> the
        # difference is infinitely many standard errors wide.
        if mean_a == mean_b:
            t_stat, p_value = 0.0, 1.0
        else:
            t_stat = math.copysign(float("inf"), mean_a - mean_b)
            p_value = 0.0
        return {"t_statistic": t_stat, "df": float(n_a + n_b - 2), "p_value": p_value}

    # t-statistic.
    t_stat = (mean_a - mean_b) / math.sqrt(se_sq)

    # Welch-Satterthwaite degrees of freedom.
    den = (var_a / n_a) ** 2 / (n_a - 1) + (var_b / n_b) ** 2 / (n_b - 1)
    df = se_sq ** 2 / den

    # Approximate two-tailed p-value.
    p_value = _t_to_p_approx(t_stat, df)

    return {"t_statistic": t_stat, "df": df, "p_value": p_value}

# ---- VERIFICATION ----
# Two clearly different groups: values cluster near 5.0 vs near 6.0,
# so the test should strongly reject the null hypothesis.
group_a = [5.1, 4.9, 5.0, 5.2, 4.8, 5.1, 5.0, 4.9, 5.3, 5.0]
group_b = [6.0, 5.8, 6.1, 5.9, 6.2, 6.0, 5.7, 6.1, 5.8, 6.0]

result = welch_ttest(group_a, group_b)
# mean(a) < mean(b), so the t-statistic must be negative.
assert result["t_statistic"] < -5, f"t should be strongly negative: {result['t_statistic']}"
assert result["p_value"] < 0.001, f"p should be very small: {result['p_value']}"
assert result["df"] > 10, "df should be reasonable"

# Two near-identical groups drawn around the same mean: expect a
# large (non-significant) p-value.
same_a = [5.0, 5.1, 4.9, 5.0, 5.1]
same_b = [5.0, 4.9, 5.1, 5.0, 5.0]
result2 = welch_ttest(same_a, same_b)
assert result2["p_value"] > 0.5, "Same groups should have large p-value"
print("All tests passed!")

Problem 2: Chi-Squared Test of Independence

Problem: Implement the chi-squared test of independence for a contingency table. Compute the chi-squared statistic, degrees of freedom, and approximate p-value from scratch.

import math

# ---- CHALLENGE ----
def chi_squared_test(observed):
    """Chi-squared test of independence for a contingency table.

    observed: list of lists (2D matrix of counts), rows x columns.
    Null hypothesis: the row and column variables are independent.

    Return chi2_statistic, df, p_value.
    """
    pass

# ---- SOLUTION ----
def _chi2_to_p_approx(chi2, df):
    """Upper-tail p-value for a chi-squared variate (Wilson-Hilferty).

    The cube root of chi2/df is approximately normal with mean
    1 - 2/(9*df) and variance 2/(9*df); standardising it gives a z-score
    whose upper tail approximates P(ChiSq_df > chi2).
    """
    if df <= 0 or chi2 <= 0:
        # Degenerate input: no evidence against independence.
        return 1.0
    mean = 1 - 2.0 / (9 * df)
    sd = math.sqrt(2.0 / (9 * df))
    z = ((chi2 / df) ** (1.0 / 3.0) - mean) / sd
    # P(Z > z) = 1 - Phi(z), with Phi built on the erf approximation.
    tail = 1.0 - 0.5 * (1 + _erf_approx(z / math.sqrt(2)))
    return max(0.0, tail)

def _erf_approx(x):
    """Abramowitz & Stegun error function approximation."""
    sign = 1 if x >= 0 else -1
    x = abs(x)
    a1, a2, a3, a4, a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
    p = 0.3275911
    t = 1.0 / (1.0 + p * x)
    y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-x * x)
    return sign * y

def chi_squared_test(observed):
    """Chi-squared test of independence on a 2-D contingency table.

    Null hypothesis: the row and column variables are independent.

    Procedure:
      * expected count E[i][j] = row_total[i] * col_total[j] / N
      * chi2 = sum over all cells of (O - E)^2 / E
      * df = (rows - 1) * (cols - 1)
      * p-value from the Wilson-Hilferty normal approximation

    Returns a dict with chi2_statistic, df, p_value, and the expected
    frequency table.
    """
    # Marginal totals: rows directly, columns by transposing with zip.
    row_totals = [sum(row) for row in observed]
    col_totals = [sum(col) for col in zip(*observed)]
    grand_total = sum(row_totals)

    if grand_total == 0:
        raise ValueError("Table has no observations")

    # Expected cell counts under independence.
    expected = [
        [rt * ct / grand_total for ct in col_totals]
        for rt in row_totals
    ]

    # Accumulate (O - E)^2 / E over the cells, skipping empty expectations.
    chi2 = 0.0
    for obs_row, exp_row in zip(observed, expected):
        for o, e in zip(obs_row, exp_row):
            if e > 0:
                chi2 += (o - e) ** 2 / e

    df = (len(observed) - 1) * (len(observed[0]) - 1)
    p_value = _chi2_to_p_approx(chi2, df)

    return {"chi2_statistic": chi2, "df": df, "p_value": p_value, "expected": expected}

# ---- VERIFICATION ----
# Classic example: is gender independent of product preference?
#              Product A  Product B  Product C
# Male            20          30         50
# Female          30          40         30
observed = [
    [20, 30, 50],
    [30, 40, 30]
]
result = chi_squared_test(observed)
assert result["df"] == 2, "2x3 table -> df = (2-1)*(3-1) = 2"
assert result["chi2_statistic"] > 0

# The two rows are identical, so each expected count equals the observed
# count (E[i][j] = row_total * col_total / N reproduces the table) and
# the chi-squared statistic collapses to zero.
independent = [
    [10, 20],
    [10, 20]
]
result2 = chi_squared_test(independent)
assert abs(result2["chi2_statistic"]) < 1e-10, "Independent table -> chi2 = 0"
print("All tests passed!")

Problem 3: A/B Test Significance

Problem: Implement a complete A/B test analysis for conversion rates. Given control and treatment groups with binary outcomes (converted / not converted), compute the conversion rates, the z-test statistic, p-value, and confidence interval for the difference.

import math

# ---- CHALLENGE ----
def ab_test(control_conversions, control_total, treatment_conversions, treatment_total, alpha=0.05):
    """Complete A/B test analysis for conversion rates.

    Compare control vs. treatment conversion rates with a two-proportion
    z-test: report both rates, the z-statistic, the two-tailed p-value,
    and a (1 - alpha) confidence interval for the difference.
    """
    pass

# ---- SOLUTION ----
def _normal_cdf(x):
    """Standard normal CDF using error function approximation."""
    sign = 1 if x >= 0 else -1
    ax = abs(x)
    a1, a2, a3, a4, a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
    p = 0.3275911
    t = 1.0 / (1.0 + p * ax)
    erf = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-ax * ax)
    return 0.5 * (1 + sign * erf)

def _normal_inv_approx(p):
    """Approximate inverse normal CDF (quantile function).
    Rational approximation, accurate to ~4.5e-4.
    """
    if p <= 0 or p >= 1:
        raise ValueError("p must be in (0, 1)")
    if p < 0.5:
        return -_normal_inv_approx(1 - p)

    # Rational approximation for p in [0.5, 1)
    t = math.sqrt(-2 * math.log(1 - p))
    c0, c1, c2 = 2.515517, 0.802853, 0.010328
    d1, d2, d3 = 1.432788, 0.189269, 0.001308
    return t - (c0 + c1 * t + c2 * t ** 2) / (1 + d1 * t + d2 * t ** 2 + d3 * t ** 3)

def ab_test(control_conv, control_total, treat_conv, treat_total, alpha=0.05):
    """Two-proportion z-test for an A/B experiment on conversion rates.

    Parameters:
        control_conv: conversions in the control group.
        control_total: users in the control group (> 0).
        treat_conv: conversions in the treatment group.
        treat_total: users in the treatment group (> 0).
        alpha: significance level for the two-tailed test and CI.

    Returns:
        dict with conversion rates, difference, relative lift,
        z-statistic, two-tailed p-value, significance flag, and a
        (1 - alpha) confidence interval for the difference.

    Raises:
        ValueError: if a group size is not positive, or a conversion
            count is negative or exceeds its group size (previously zero
            totals surfaced as a bare ZeroDivisionError).

    Fixes vs. the previous version: input validation added, and the
    degenerate se == 0 branch now includes the "relative_lift" key so
    both return paths share the same schema.
    """
    if control_total <= 0 or treat_total <= 0:
        raise ValueError("Group sizes must be positive")
    if not (0 <= control_conv <= control_total) or not (0 <= treat_conv <= treat_total):
        raise ValueError("Conversions must be between 0 and the group size")

    # Observed conversion rates and their difference.
    p_c = control_conv / control_total
    p_t = treat_conv / treat_total
    diff = p_t - p_c

    # Pooled proportion: the common rate under H0 (p_c == p_t).
    p_pool = (control_conv + treat_conv) / (control_total + treat_total)

    # Standard error of the difference under the null hypothesis.
    se = math.sqrt(p_pool * (1 - p_pool) * (1.0 / control_total + 1.0 / treat_total))

    if se == 0:
        # Degenerate: 0% or 100% conversion in both groups combined, which
        # forces p_c == p_t, so diff == 0 and the lift is 0 by definition.
        return {"p_control": p_c, "p_treatment": p_t, "difference": diff,
                "relative_lift": 0.0,
                "z_statistic": 0, "p_value": 1.0, "significant": False,
                "ci_lower": 0, "ci_upper": 0}

    # z-statistic and two-tailed p-value.
    z = diff / se
    p_value = 2 * (1 - _normal_cdf(abs(z)))

    # CI for the difference uses the unpooled SE (no H0 assumption).
    se_diff = math.sqrt(p_c * (1 - p_c) / control_total + p_t * (1 - p_t) / treat_total)
    z_crit = _normal_inv_approx(1 - alpha / 2)

    return {
        "p_control": p_c,
        "p_treatment": p_t,
        "difference": diff,
        "relative_lift": diff / p_c if p_c > 0 else float('inf'),
        "z_statistic": z,
        "p_value": p_value,
        "significant": p_value < alpha,
        "ci_lower": diff - z_crit * se_diff,
        "ci_upper": diff + z_crit * se_diff
    }

# ---- VERIFICATION ----
# Clear winner: treatment converts at 13% vs control's 10% with n=1000
# per arm, which should be comfortably significant at alpha = 0.05.
result = ab_test(
    control_conv=100, control_total=1000,   # 10% conversion
    treat_conv=130, treat_total=1000         # 13% conversion
)
assert abs(result["p_control"] - 0.10) < 1e-10
assert abs(result["p_treatment"] - 0.13) < 1e-10
assert result["z_statistic"] > 0, "Treatment is better"
assert result["p_value"] < 0.05, "Should be significant"
assert result["significant"] is True

# A 0.2-point difference (10.0% vs 10.2%) at this sample size is well
# within sampling noise, so the test must NOT declare significance.
result2 = ab_test(
    control_conv=100, control_total=1000,
    treat_conv=102, treat_total=1000
)
assert result2["significant"] is False, "2/1000 difference is not significant"
print("All tests passed!")

Problem 4: Permutation Test

Problem: Implement a permutation test for comparing two groups. The permutation test makes no distributional assumptions — it directly computes the p-value by shuffling group labels and counting how often the shuffled difference exceeds the observed difference.

import random

# ---- CHALLENGE ----
def permutation_test(a, b, num_permutations=10000, stat_func=None):
    """Permutation test for difference in means.

    Build the null distribution empirically by shuffling group labels and
    re-computing the statistic; no distributional assumptions needed.

    Return observed_stat, p_value.
    """
    pass

# ---- SOLUTION ----
def permutation_test(a, b, num_permutations=10000, stat_func=None):
    """Non-parametric permutation test for a difference between two groups.

    Parameters:
        a, b: the two samples (non-empty sequences of numbers).
        num_permutations: number of random label shuffles (>= 1).
        stat_func: optional statistic f(x, y); defaults to the absolute
            difference in means (a two-sided test).

    Returns:
        dict with "observed_statistic", "p_value", "num_permutations".

    Raises:
        ValueError: if either sample is empty or num_permutations < 1.

    Why this works: under the null hypothesis the group labels are
    exchangeable, so shuffling labels samples directly from the null
    distribution of the statistic — no normality or equal-variance
    assumptions required.

    Fix vs. the previous version: the p-value uses the add-one
    correction (count + 1) / (B + 1) (Phipson & Smyth), so a Monte Carlo
    test can never report an impossible p-value of exactly 0; empty
    samples are rejected instead of raising ZeroDivisionError.
    """
    if not a or not b:
        raise ValueError("Both samples must be non-empty")
    if num_permutations < 1:
        raise ValueError("num_permutations must be at least 1")

    if stat_func is None:
        stat_func = lambda x, y: abs(sum(x) / len(x) - sum(y) / len(y))

    # Observed test statistic on the original labelling.
    observed_stat = stat_func(a, b)

    # Pool the data; only the split point matters after shuffling.
    pooled = list(a) + list(b)
    n_a = len(a)

    # Count permutations at least as extreme as the observed statistic.
    count_extreme = 0
    for _ in range(num_permutations):
        random.shuffle(pooled)
        if stat_func(pooled[:n_a], pooled[n_a:]) >= observed_stat:
            count_extreme += 1

    # Add-one correction: the observed labelling itself counts as one
    # valid permutation, keeping the p-value strictly positive.
    p_value = (count_extreme + 1) / (num_permutations + 1)
    return {"observed_statistic": observed_stat, "p_value": p_value,
            "num_permutations": num_permutations}

# ---- VERIFICATION ----
random.seed(42)  # deterministic shuffles so the assertions are stable

# Clearly different groups: the observed mean gap (~0.9) should almost
# never be matched by a random relabelling.
group_a = [5.1, 4.9, 5.0, 5.2, 4.8, 5.1, 5.0, 4.9, 5.3, 5.0]
group_b = [6.0, 5.8, 6.1, 5.9, 6.2, 6.0, 5.7, 6.1, 5.8, 6.0]

result = permutation_test(group_a, group_b, num_permutations=5000)
assert result["p_value"] < 0.01, f"Should be significant: p={result['p_value']}"

# Both groups contain the same values (identical means), so shuffled
# statistics are at least as extreme essentially every time -> large p.
same_a = [5.0, 5.1, 4.9, 5.2, 4.8]
same_b = [5.0, 4.9, 5.1, 4.8, 5.2]
result2 = permutation_test(same_a, same_b, num_permutations=5000)
assert result2["p_value"] > 0.3, f"Should not be significant: p={result2['p_value']}"
print("All tests passed!")
💡
When to use permutation tests: Permutation tests are the go-to when (1) your data is not normally distributed, (2) sample sizes are small, (3) you have a custom test statistic, or (4) the interviewer asks for a "non-parametric" approach. Netflix and Airbnb use permutation tests extensively for A/B testing with non-standard metrics.

Problem 5: Bootstrap Confidence Intervals

Problem: Implement the bootstrap method for computing confidence intervals of any statistic (mean, median, correlation, etc.). Implement both the percentile method and the BCa (bias-corrected and accelerated) method.

import math
import random

# ---- CHALLENGE ----
def bootstrap_ci(data, stat_func, num_bootstrap=10000, confidence=0.95):
    """Compute bootstrap confidence interval using percentile method.

    Resample `data` with replacement, evaluate `stat_func` on each
    resample, and take empirical percentiles as the interval endpoints.
    """
    pass

def bootstrap_bca_ci(data, stat_func, num_bootstrap=10000, confidence=0.95):
    """Compute BCa bootstrap confidence interval.

    Like the percentile method, but adjusts the endpoints for bias (z0)
    and skewness (acceleration constant a, from the jackknife).
    """
    pass

# ---- SOLUTION ----
def bootstrap_ci(data, stat_func, num_bootstrap=10000, confidence=0.95):
    """Percentile-method bootstrap confidence interval.

    Resamples the data with replacement `num_bootstrap` times, evaluates
    `stat_func` on each resample, and reports the empirical
    [alpha/2, 1 - alpha/2] percentiles of those statistics as the CI.
    The bootstrap distribution stands in for the (unknown) sampling
    distribution of the statistic.
    """
    n = len(data)
    alpha = 1 - confidence

    # Bootstrap replicates of the statistic.
    stats = []
    for _ in range(num_bootstrap):
        resample = [data[random.randint(0, n - 1)] for _ in range(n)]
        stats.append(stat_func(resample))
    stats.sort()

    # Percentile endpoints, clamped into the valid index range.
    lo = int(math.floor(alpha / 2 * num_bootstrap))
    hi = int(math.ceil((1 - alpha / 2) * num_bootstrap)) - 1
    lo = min(max(lo, 0), num_bootstrap - 1)
    hi = min(max(hi, 0), num_bootstrap - 1)

    return {
        "point_estimate": stat_func(data),
        "ci_lower": stats[lo],
        "ci_upper": stats[hi],
        "confidence": confidence,
        "num_bootstrap": num_bootstrap,
        "se": _std_dev(stats),
    }

def _std_dev(data):
    """Helper: sample standard deviation."""
    n = len(data)
    if n < 2:
        return 0.0
    mean = sum(data) / n
    return math.sqrt(sum((x - mean) ** 2 for x in data) / (n - 1))

def _normal_cdf_simple(x):
    """Standard normal CDF."""
    sign = 1 if x >= 0 else -1
    ax = abs(x)
    a1, a2, a3, a4, a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429
    p = 0.3275911
    t = 1.0 / (1.0 + p * ax)
    erf = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * math.exp(-ax * ax)
    return 0.5 * (1 + sign * erf)

def _normal_inv(p):
    """Approximate inverse normal CDF."""
    if p <= 0 or p >= 1:
        raise ValueError("p must be in (0, 1)")
    if p < 0.5:
        return -_normal_inv(1 - p)
    t = math.sqrt(-2 * math.log(1 - p))
    c0, c1, c2 = 2.515517, 0.802853, 0.010328
    d1, d2, d3 = 1.432788, 0.189269, 0.001308
    return t - (c0 + c1 * t + c2 * t ** 2) / (1 + d1 * t + d2 * t ** 2 + d3 * t ** 3)

def bootstrap_bca_ci(data, stat_func, num_bootstrap=10000, confidence=0.95):
    """Bias-corrected and accelerated (BCa) bootstrap confidence interval.

    Improves on the percentile method by adjusting the percentile
    endpoints for (1) median bias of the bootstrap distribution, via z0,
    and (2) skewness, via the jackknife acceleration constant a.
    """
    n = len(data)
    alpha = 1 - confidence
    theta_hat = stat_func(data)

    # Bootstrap replicates of the statistic.
    boot = []
    for _ in range(num_bootstrap):
        resample = [data[random.randint(0, n - 1)] for _ in range(n)]
        boot.append(stat_func(resample))
    boot.sort()

    # Bias correction z0: where the original estimate falls within the
    # bootstrap distribution, mapped onto the normal scale.
    below = sum(1 for s in boot if s < theta_hat)
    frac = below / num_bootstrap
    frac = max(0.001, min(0.999, frac))  # clamp to avoid inf
    z0 = _normal_inv(frac)

    # Acceleration a from jackknife (leave-one-out) influence values.
    jack = [stat_func(data[:i] + data[i + 1:]) for i in range(n)]
    jbar = sum(jack) / n
    devs = [jbar - j for j in jack]
    sum_cubes = sum(v ** 3 for v in devs)
    sum_squares = sum(v ** 2 for v in devs)
    a = sum_cubes / (6 * sum_squares ** 1.5) if sum_squares > 0 else 0.0

    # Shift the nominal normal quantiles by z0, rescale by a, then map
    # back through Phi to percentiles of the bootstrap distribution.
    z_lo = _normal_inv(alpha / 2)
    z_hi = _normal_inv(1 - alpha / 2)
    q_lo = _normal_cdf_simple(z0 + (z0 + z_lo) / (1 - a * (z0 + z_lo)))
    q_hi = _normal_cdf_simple(z0 + (z0 + z_hi) / (1 - a * (z0 + z_hi)))

    idx_lo = max(0, min(int(q_lo * num_bootstrap), num_bootstrap - 1))
    idx_hi = max(0, min(int(q_hi * num_bootstrap), num_bootstrap - 1))

    return {
        "point_estimate": theta_hat,
        "ci_lower": boot[idx_lo],
        "ci_upper": boot[idx_hi],
        "confidence": confidence,
        "bias_correction_z0": z0,
        "acceleration_a": a
    }

# ---- VERIFICATION ----
random.seed(42)  # fix the resampling stream so the checks are reproducible

# Bootstrap CI for the mean of a small sample.
data = [2.3, 4.1, 3.7, 5.2, 4.8, 3.5, 6.1, 4.4, 3.9, 5.0]
mean_func = lambda d: sum(d) / len(d)

result = bootstrap_ci(data, mean_func, num_bootstrap=5000)
true_mean = mean_func(data)
# The percentile interval should bracket the point estimate and stay
# within the plausible range for these values.
assert result["ci_lower"] < true_mean < result["ci_upper"]
assert result["ci_lower"] > 2.0  # reasonable bounds
assert result["ci_upper"] < 7.0

# Bootstrap CI for median (works for ANY statistic!)
# NOTE(review): for even-length samples this picks the upper-middle
# element rather than averaging the two middles — acceptable for a demo.
median_func = lambda d: sorted(d)[len(d) // 2]
result_med = bootstrap_ci(data, median_func, num_bootstrap=5000)
assert result_med["ci_lower"] < result_med["ci_upper"]

# BCa bootstrap: the adjusted interval should still cover the estimate.
result_bca = bootstrap_bca_ci(data, mean_func, num_bootstrap=5000)
assert result_bca["ci_lower"] < true_mean < result_bca["ci_upper"]
print("All tests passed!")

Key Takeaways

💡
  • Welch's t-test is preferred over Student's t-test because it does not assume equal variances
  • Chi-squared tests require expected counts of at least 5 in each cell (a common rule of thumb) for the chi-squared approximation to be reliable
  • A/B test analysis is the most common data science interview problem — know the z-test for proportions
  • Permutation tests make no distributional assumptions and work with any test statistic
  • Bootstrap is the most versatile method: it works for any statistic (mean, median, correlation, etc.)
  • BCa bootstrap corrects for bias and skewness in the bootstrap distribution