Array Operations
Six coding challenges covering the fundamental NumPy operations that appear in every ML coding interview: reshape and transpose, broadcasting, fancy indexing, boolean masking, array stacking, and dataset splitting.
Challenge 1: Reshape and Transpose
Problem: Given a 1D array of pixel values with length H * W * C, reshape it into an image tensor of shape (H, W, C) and then convert it to channel-first format (C, H, W) used by PyTorch. Also, given a batch of images (N, H, W, C), convert to (N, C, H, W).
import numpy as np
# ---- CHALLENGE ----
def reshape_image(pixels, H, W, C):
    """Reshape flat pixels to (H, W, C), then convert to (C, H, W).

    pixels: 1-D array of length H * W * C.
    Returns: array of shape (C, H, W) (channel-first, PyTorch convention).
    """
    pass
def batch_channel_first(images):
    """Convert batch from (N, H, W, C) to (N, C, H, W).

    images: 4-D array, channel-last.
    Returns: 4-D array, channel-first, batch axis untouched.
    """
    pass
# ---- SOLUTION ----
def reshape_image(pixels, H, W, C):
    """Reshape flat pixels to (H, W, C), then convert to (C, H, W).

    pixels: 1-D array of length H * W * C.
    Returns: array of shape (C, H, W).
    """
    # Build the HWC image first, then rotate the channel axis to the front:
    # axis order (2, 0, 1) puts old axis 2 (C) first, followed by H and W.
    hwc = np.reshape(pixels, (H, W, C))
    return np.transpose(hwc, (2, 0, 1))
def batch_channel_first(images):
    """Convert batch from (N, H, W, C) to (N, C, H, W)."""
    # Move the trailing channel axis to position 1; the batch axis stays put.
    return np.moveaxis(images, 3, 1)
# ---- VERIFICATION ----
# Single image: 18 flat values -> (2, 3, 3) -> channel-first (3, 2, 3).
pixels = np.arange(2 * 3 * 3)  # 18 pixels, RGB image 2x3
result = reshape_image(pixels, 2, 3, 3)
assert result.shape == (3, 2, 3), f"Expected (3, 2, 3), got {result.shape}"
# Batch case: (N, H, W, C) -> (N, C, H, W).
batch = np.random.randn(8, 32, 32, 3)  # 8 images, 32x32, RGB
result = batch_channel_first(batch)
assert result.shape == (8, 3, 32, 32), f"Expected (8, 3, 32, 32), got {result.shape}"
print("Challenge 1 passed!")
Challenge 2: Broadcasting Arithmetic
Problem: Given a feature matrix X of shape (N, D) where N is samples and D is features, normalize each feature to zero mean and unit variance without any loops. Then, add a bias vector b of shape (D,) to every sample using broadcasting.
import numpy as np
# ---- CHALLENGE ----
def normalize_features(X):
    """Normalize X to zero mean, unit variance per feature. Return (X_norm, mean, std).

    X: (N, D) feature matrix. Statistics are computed per column.
    """
    pass
def add_bias(X, b):
    """Add bias vector b of shape (D,) to each row of X (N, D).

    Returns a new (N, D) array; relies on broadcasting, no loops.
    """
    pass
# ---- SOLUTION ----
def normalize_features(X):
    """Normalize X to zero mean, unit variance per feature.

    X: (N, D) feature matrix.
    Returns: (X_norm, mean, std) where mean and std have shape (D,).
    """
    # Reduce over the sample axis so each feature gets its own statistic.
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # A constant feature has zero spread; dividing by 1 leaves it at 0
    # instead of producing NaN.
    sigma = np.where(sigma == 0, 1.0, sigma)
    # (N, D) - (D,) and / (D,) broadcast row-wise automatically.
    return (X - mu) / sigma, mu, sigma
def add_bias(X, b):
    """Add bias vector b of shape (D,) to each row of X (N, D)."""
    # Addition is commutative; (D,) + (N, D) broadcasts to (N, D).
    return b + X
# ---- VERIFICATION ----
# Features deliberately span very different scales (units vs thousands).
X = np.array([[1.0, 200.0, 3000.0],
              [2.0, 300.0, 4000.0],
              [3.0, 400.0, 5000.0],
              [4.0, 500.0, 6000.0]])
X_norm, mean, std = normalize_features(X)
# After normalization, each column should have mean ~0 and std ~1
assert np.allclose(X_norm.mean(axis=0), 0, atol=1e-10), "Mean should be ~0"
assert np.allclose(X_norm.std(axis=0), 1, atol=1e-10), "Std should be ~1"
b = np.array([0.1, 0.2, 0.3])
result = add_bias(X_norm, b)
assert result.shape == X_norm.shape, "Shape should be preserved"
print("Challenge 2 passed!")
Challenge 3: Fancy Indexing
Problem: Given a batch of model predictions probs of shape (N, C) where N is batch size and C is number of classes, and a label array labels of shape (N,) containing the true class index for each sample, extract the predicted probability of the correct class for each sample without loops. This is needed to compute cross-entropy loss.
import numpy as np
# ---- CHALLENGE ----
def extract_class_probs(probs, labels):
    """Extract probs[i, labels[i]] for each sample i. Return shape (N,).

    probs: (N, C) class probabilities; labels: (N,) integer class indices.
    """
    pass
def one_hot_encode(labels, num_classes):
    """Convert integer labels to one-hot encoding. Return shape (N, C).

    labels: (N,) integers in [0, num_classes).
    """
    pass
# ---- SOLUTION ----
def extract_class_probs(probs, labels):
    """Extract the probability of the true class for each sample.

    probs: (N, C), labels: (N,). Returns shape (N,).
    """
    # Pair each row index with its label's column index; fancy indexing
    # gathers one element per sample.
    rows = np.arange(len(probs))
    return probs[rows, labels]
def one_hot_encode(labels, num_classes):
    """Convert integer labels to one-hot encoding. Return shape (N, C).

    labels: (N,) integers in [0, num_classes).
    """
    # Row k of the identity matrix is exactly the one-hot vector for
    # class k, so indexing the identity by the labels builds the result.
    return np.eye(num_classes)[labels]
# ---- VERIFICATION ----
probs = np.array([[0.1, 0.7, 0.2],   # sample 0: class 1 has highest prob
                  [0.8, 0.1, 0.1],   # sample 1: class 0 has highest prob
                  [0.2, 0.3, 0.5]])  # sample 2: class 2 has highest prob
labels = np.array([1, 0, 2])
correct_probs = extract_class_probs(probs, labels)
assert np.allclose(correct_probs, [0.7, 0.8, 0.5]), f"Expected [0.7, 0.8, 0.5], got {correct_probs}"
one_hot = one_hot_encode(labels, 3)
expected = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]], dtype=float)
assert np.allclose(one_hot, expected), f"One-hot encoding incorrect"
print("Challenge 3 passed!")
Challenge 4: Boolean Masking
Problem: Given a dataset with features X of shape (N, D) and targets y of shape (N,): (a) filter out samples where any feature is NaN, (b) select samples where the target is above a threshold, (c) clip outlier feature values to the 1st and 99th percentiles.
import numpy as np
# ---- CHALLENGE ----
def remove_nan_samples(X, y):
    """Remove samples (rows) that contain any NaN in X.

    Returns the filtered (X, y) pair; y rows are dropped in lockstep with X.
    """
    pass
def filter_by_target(X, y, threshold):
    """Select only samples where y > threshold (strict inequality)."""
    pass
def clip_outliers(X, lower_pct=1, upper_pct=99):
    """Clip each feature to its [lower_pct, upper_pct] percentile range.

    Percentiles are computed per column; X is not modified in place.
    """
    pass
# ---- SOLUTION ----
def remove_nan_samples(X, y):
    """Remove samples (rows) that contain any NaN in X.

    Returns the filtered (X, y); rows are dropped from both in lockstep.
    """
    # A row survives only if every one of its features is non-NaN.
    keep = np.all(~np.isnan(X), axis=1)
    return X[keep], y[keep]
def filter_by_target(X, y, threshold):
    """Select only samples where y > threshold (strict inequality)."""
    # Integer index form of boolean masking: positions where y exceeds
    # the threshold.
    selected = np.flatnonzero(y > threshold)
    return X[selected], y[selected]
def clip_outliers(X, lower_pct=1, upper_pct=99):
    """Clip each feature to its [lower_pct, upper_pct] percentile range.

    Bounds are computed per column; returns a new array.
    """
    # One call computes both bounds: result has shape (2, D).
    lo, hi = np.percentile(X, [lower_pct, upper_pct], axis=0)
    # Clamp element-wise; the (D,) bounds broadcast across rows.
    return np.minimum(np.maximum(X, lo), hi)
# ---- VERIFICATION ----
# Rows 1 and 3 each contain a NaN and must be dropped.
X = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, 5.0], [6.0, np.nan]])
y = np.array([10, 20, 30, 40])
X_clean, y_clean = remove_nan_samples(X, y)
assert X_clean.shape[0] == 2, f"Expected 2 valid samples, got {X_clean.shape[0]}"
assert np.array_equal(y_clean, [10, 30]), "Wrong samples retained"
X2 = np.random.randn(100, 5)
y2 = np.random.randn(100)
X_filt, y_filt = filter_by_target(X2, y2, 0.0)
assert np.all(y_filt > 0.0), "All filtered targets should be > 0"
# Wide-spread data so the percentile clipping actually bites.
X3 = np.random.randn(1000, 3) * 100
X_clipped = clip_outliers(X3)
assert X_clipped.max() <= np.percentile(X3, 99, axis=0).max() + 1e-10
print("Challenge 4 passed!")
Challenge 5: Stacking Arrays
Problem: You have embedding vectors from three different models, each of shape (N, D_i) where D_i may differ. (a) Concatenate them horizontally to create a combined feature vector. (b) Given a list of equal-shaped arrays, stack them to create a new batch dimension. (c) Build an augmented matrix [X | 1] by adding a column of ones (bias term) to a feature matrix.
import numpy as np
# ---- CHALLENGE ----
def concat_embeddings(emb_list):
    """Concatenate list of (N, D_i) arrays horizontally -> (N, sum(D_i)).

    All arrays must share the same first dimension N.
    """
    pass
def stack_batches(batch_list):
    """Stack list of (H, W) arrays into (B, H, W).

    All arrays must have identical shapes; a new leading axis is created.
    """
    pass
def add_bias_column(X):
    """Add column of ones to X: (N, D) -> (N, D+1).

    The ones column is appended on the right (bias term for linear models).
    """
    pass
# ---- SOLUTION ----
def concat_embeddings(emb_list):
    """Concatenate list of (N, D_i) arrays horizontally -> (N, sum(D_i))."""
    # For 2-D inputs, hstack is concatenation along the feature axis.
    return np.hstack(emb_list)
def stack_batches(batch_list):
    """Stack list of (H, W) arrays into (B, H, W)."""
    # Unlike concatenate, stack inserts a brand-new leading axis.
    return np.stack(batch_list)
def add_bias_column(X):
    """Add column of ones to the right of X: (N, D) -> (N, D+1)."""
    # column_stack accepts a 1-D array and appends it as a column.
    # Equivalent forms: np.hstack([X, ones[:, None]]) or
    # np.concatenate([X, ones[:, None]], axis=1).
    return np.column_stack([X, np.ones(X.shape[0])])
# ---- VERIFICATION ----
# Three embedding sources with different widths: 64 + 128 + 32 = 224.
emb1 = np.random.randn(100, 64)
emb2 = np.random.randn(100, 128)
emb3 = np.random.randn(100, 32)
combined = concat_embeddings([emb1, emb2, emb3])
assert combined.shape == (100, 224), f"Expected (100, 224), got {combined.shape}"
imgs = [np.random.randn(28, 28) for _ in range(16)]
batch = stack_batches(imgs)
assert batch.shape == (16, 28, 28), f"Expected (16, 28, 28), got {batch.shape}"
X = np.random.randn(50, 10)
X_aug = add_bias_column(X)
assert X_aug.shape == (50, 11), f"Expected (50, 11), got {X_aug.shape}"
assert np.all(X_aug[:, -1] == 1.0), "Last column should be all ones"
print("Challenge 5 passed!")
Challenge 6: Splitting Datasets
Problem: Given features X of shape (N, D) and labels y of shape (N,), split them into train/validation/test sets with ratios 70/15/15. The split must be random (shuffled) and reproducible (seeded). Also implement stratified splitting that preserves class distribution.
import numpy as np
# ---- CHALLENGE ----
def train_val_test_split(X, y, train_ratio=0.7, val_ratio=0.15, seed=42):
    """Split X, y into train/val/test sets with given ratios.

    The split is shuffled and reproducible via the seed. Test gets the
    remainder after the train and val fractions are taken.
    Returns: (X_tr, y_tr, X_val, y_val, X_te, y_te).
    """
    pass
def stratified_split(X, y, train_ratio=0.8, seed=42):
    """Split preserving class distribution in train and test.

    Returns: (X_train, y_train, X_test, y_test).
    """
    pass
# ---- SOLUTION ----
def train_val_test_split(X, y, train_ratio=0.7, val_ratio=0.15, seed=42):
    """Random split into train/val/test.

    Shuffled and reproducible: the same seed always yields the same split.
    Returns: (X_tr, y_tr, X_val, y_val, X_te, y_te).
    """
    rng = np.random.default_rng(seed)
    # A random permutation of row indices, cut into three consecutive runs.
    order = rng.permutation(X.shape[0])
    n_train = int(X.shape[0] * train_ratio)
    n_val = int(X.shape[0] * val_ratio)
    tr = order[:n_train]
    va = order[n_train:n_train + n_val]
    te = order[n_train + n_val:]  # test takes whatever remains
    return X[tr], y[tr], X[va], y[va], X[te], y[te]
def stratified_split(X, y, train_ratio=0.8, seed=42):
    """Split preserving class distribution.

    For each class, that class's sample indices are shuffled and the first
    train_ratio fraction goes to train, the rest to test, so both splits
    keep (approximately) the original class proportions.

    Parameters:
        X: (N, D) features; y: (N,) class labels.
        train_ratio: fraction of each class sent to the train split.
        seed: seed for np.random.default_rng, makes the split reproducible.
    Returns: (X_train, y_train, X_test, y_test).
    """
    rng = np.random.default_rng(seed)
    classes = np.unique(y)
    train_indices = []
    test_indices = []
    for cls in classes:
        cls_indices = np.where(y == cls)[0]
        rng.shuffle(cls_indices)
        split = int(len(cls_indices) * train_ratio)
        train_indices.extend(cls_indices[:split])
        test_indices.extend(cls_indices[split:])
    # dtype=int is required: np.array([]) defaults to float64, and a float
    # array cannot be used as an index — without it, empty input (or an
    # empty split) raised IndexError instead of returning empty arrays.
    train_indices = np.array(train_indices, dtype=int)
    test_indices = np.array(test_indices, dtype=int)
    # Re-shuffle so samples are not grouped by class in the output.
    rng.shuffle(train_indices)
    rng.shuffle(test_indices)
    return (X[train_indices], y[train_indices],
            X[test_indices], y[test_indices])
# ---- VERIFICATION ----
# 1000 samples, 3 roughly balanced classes.
X = np.random.randn(1000, 10)
y = np.random.randint(0, 3, 1000)
X_tr, y_tr, X_val, y_val, X_te, y_te = train_val_test_split(X, y)
assert X_tr.shape[0] == 700, f"Expected 700 train, got {X_tr.shape[0]}"
assert X_val.shape[0] == 150, f"Expected 150 val, got {X_val.shape[0]}"
assert X_te.shape[0] == 150, f"Expected 150 test, got {X_te.shape[0]}"
X_tr2, y_tr2, X_te2, y_te2 = stratified_split(X, y, train_ratio=0.8)
# Check class distribution is preserved (within 5 percentage points)
for cls in [0, 1, 2]:
    orig_ratio = (y == cls).mean()
    train_ratio_cls = (y_tr2 == cls).mean()
    assert abs(orig_ratio - train_ratio_cls) < 0.05, \
        f"Class {cls} ratio differs too much: {orig_ratio:.3f} vs {train_ratio_cls:.3f}"
print("Challenge 6 passed!")
Key Takeaways
- `reshape` and `transpose` usually return views, not copies (`reshape` falls back to copying when the data is not contiguous)
- Broadcasting eliminates the need for explicit loops: `(N, D) - (D,)` subtracts from every row
- Fancy indexing with `[np.arange(N), labels]` is the standard pattern for extracting per-sample class probabilities
- Boolean masking with `X[mask]` is the NumPy equivalent of a SQL WHERE clause
- `np.concatenate` joins along existing axes, `np.stack` creates a new axis
- Always use `np.random.default_rng(seed)` for reproducible random operations
Lilly Tech Systems