Array Operations
Six coding challenges covering the fundamental NumPy operations that appear in every ML coding interview: reshape and transpose, broadcasting, fancy indexing, boolean masking, array stacking, and dataset splitting.
Challenge 1: Reshape and Transpose
Problem: Given a 1D array of pixel values with length H * W * C, reshape it into an image tensor of shape (H, W, C) and then convert it to channel-first format (C, H, W) used by PyTorch. Also, given a batch of images (N, H, W, C), convert to (N, C, H, W).
import numpy as np
# ---- CHALLENGE ----
def reshape_image(pixels, H, W, C):
    """Reshape flat pixels to (H, W, C), then convert to (C, H, W).

    pixels: 1-D array of length H * W * C.
    Returns: array of shape (C, H, W) (channel-first, PyTorch convention).
    """
    pass
def batch_channel_first(images):
    """Convert batch from (N, H, W, C) to (N, C, H, W).

    images: 4-D array, channel-last.
    Returns: 4-D array, channel-first, batch axis untouched.
    """
    pass
# ---- SOLUTION ----
def reshape_image(pixels, H, W, C):
    """Reshape flat pixels to (H, W, C), then convert to (C, H, W).

    pixels: 1-D array of length H * W * C.
    Returns: array of shape (C, H, W).
    """
    # Build the HWC image first, then rotate the channel axis to the front:
    # axis order (2, 0, 1) puts old axis 2 (C) first, followed by H and W.
    hwc = np.reshape(pixels, (H, W, C))
    return np.transpose(hwc, (2, 0, 1))
def batch_channel_first(images):
    """Convert batch from (N, H, W, C) to (N, C, H, W)."""
    # Move the trailing channel axis to position 1; the batch axis stays put.
    return np.moveaxis(images, 3, 1)
# ---- VERIFICATION ----
# Single image: 18 flat values -> (2, 3, 3) -> channel-first (3, 2, 3).
pixels = np.arange(2 * 3 * 3)  # 18 pixels, RGB image 2x3
result = reshape_image(pixels, 2, 3, 3)
assert result.shape == (3, 2, 3), f"Expected (3, 2, 3), got {result.shape}"
# Batch case: (N, H, W, C) -> (N, C, H, W).
batch = np.random.randn(8, 32, 32, 3)  # 8 images, 32x32, RGB
result = batch_channel_first(batch)
assert result.shape == (8, 3, 32, 32), f"Expected (8, 3, 32, 32), got {result.shape}"
print("Challenge 1 passed!")
Challenge 2: Broadcasting Arithmetic
Problem: Given a feature matrix X of shape (N, D) where N is samples and D is features, normalize each feature to zero mean and unit variance without any loops. Then, add a bias vector b of shape (D,) to every sample using broadcasting.
import numpy as np
# ---- CHALLENGE ----
def normalize_features(X):
    """Normalize X to zero mean, unit variance per feature. Return (X_norm, mean, std).

    X: (N, D) feature matrix. Statistics are computed per column.
    """
    pass
def add_bias(X, b):
    """Add bias vector b of shape (D,) to each row of X (N, D).

    Returns a new (N, D) array; relies on broadcasting, no loops.
    """
    pass
# ---- SOLUTION ----
def normalize_features(X):
    """Normalize X to zero mean, unit variance per feature.

    X: (N, D) feature matrix.
    Returns: (X_norm, mean, std) where mean and std have shape (D,).
    """
    # Reduce over the sample axis so each feature gets its own statistic.
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # A constant feature has zero spread; dividing by 1 leaves it at 0
    # instead of producing NaN.
    sigma = np.where(sigma == 0, 1.0, sigma)
    # (N, D) - (D,) and / (D,) broadcast row-wise automatically.
    return (X - mu) / sigma, mu, sigma
def add_bias(X, b):
    """Add bias vector b of shape (D,) to each row of X (N, D)."""
    # Addition is commutative; (D,) + (N, D) broadcasts to (N, D).
    return b + X
# ---- VERIFICATION ----
# Features deliberately span very different scales (units vs thousands).
X = np.array([[1.0, 200.0, 3000.0],
              [2.0, 300.0, 4000.0],
              [3.0, 400.0, 5000.0],
              [4.0, 500.0, 6000.0]])
X_norm, mean, std = normalize_features(X)
# After normalization, each column should have mean ~0 and std ~1
assert np.allclose(X_norm.mean(axis=0), 0, atol=1e-10), "Mean should be ~0"
assert np.allclose(X_norm.std(axis=0), 1, atol=1e-10), "Std should be ~1"
b = np.array([0.1, 0.2, 0.3])
result = add_bias(X_norm, b)
assert result.shape == X_norm.shape, "Shape should be preserved"
print("Challenge 2 passed!")
Challenge 3: Fancy Indexing
Problem: Given a batch of model predictions probs of shape (N, C) where N is batch size and C is number of classes, and a label array labels of shape (N,) containing the true class index for each sample, extract the predicted probability of the correct class for each sample without loops. This is needed to compute cross-entropy loss.
import numpy as np
# ---- CHALLENGE ----
def extract_class_probs(probs, labels):
    """Extract probs[i, labels[i]] for each sample i. Return shape (N,).

    probs: (N, C) class probabilities; labels: (N,) integer class indices.
    """
    pass
def one_hot_encode(labels, num_classes):
    """Convert integer labels to one-hot encoding. Return shape (N, C).

    labels: (N,) integers in [0, num_classes).
    """
    pass
# ---- SOLUTION ----
def extract_class_probs(probs, labels):
    """Extract the probability of the true class for each sample.

    probs: (N, C), labels: (N,). Returns shape (N,).
    """
    # Pair each row index with its label's column index; fancy indexing
    # gathers one element per sample.
    rows = np.arange(len(probs))
    return probs[rows, labels]
def one_hot_encode(labels, num_classes):
    """Convert integer labels to one-hot encoding. Return shape (N, C).

    labels: (N,) integers in [0, num_classes).
    """
    # Row k of the identity matrix is exactly the one-hot vector for
    # class k, so indexing the identity by the labels builds the result.
    return np.eye(num_classes)[labels]
# ---- VERIFICATION ----
probs = np.array([[0.1, 0.7, 0.2],   # sample 0: class 1 has highest prob
                  [0.8, 0.1, 0.1],   # sample 1: class 0 has highest prob
                  [0.2, 0.3, 0.5]])  # sample 2: class 2 has highest prob
labels = np.array([1, 0, 2])
correct_probs = extract_class_probs(probs, labels)
assert np.allclose(correct_probs, [0.7, 0.8, 0.5]), f"Expected [0.7, 0.8, 0.5], got {correct_probs}"
one_hot = one_hot_encode(labels, 3)
expected = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]], dtype=float)
assert np.allclose(one_hot, expected), f"One-hot encoding incorrect"
print("Challenge 3 passed!")
Challenge 4: Boolean Masking
Problem: Given a dataset with features X of shape (N, D) and targets y of shape (N,): (a) filter out samples where any feature is NaN, (b) select samples where the target is above a threshold, (c) clip outlier feature values to the 1st and 99th percentiles.
import numpy as np
# ---- CHALLENGE ----
def remove_nan_samples(X, y):
    """Remove samples (rows) that contain any NaN in X.

    Returns the filtered (X, y) pair; y rows are dropped in lockstep with X.
    """
    pass
def filter_by_target(X, y, threshold):
    """Select only samples where y > threshold (strict inequality)."""
    pass
def clip_outliers(X, lower_pct=1, upper_pct=99):
    """Clip each feature to its [lower_pct, upper_pct] percentile range.

    Percentiles are computed per column; X is not modified in place.
    """
    pass
# ---- SOLUTION ----
def remove_nan_samples(X, y):
    """Remove samples (rows) that contain any NaN in X.

    Returns the filtered (X, y); rows are dropped from both in lockstep.
    """
    # A row survives only if every one of its features is non-NaN.
    keep = np.all(~np.isnan(X), axis=1)
    return X[keep], y[keep]
def filter_by_target(X, y, threshold):
    """Select only samples where y > threshold (strict inequality)."""
    # Integer index form of boolean masking: positions where y exceeds
    # the threshold.
    selected = np.flatnonzero(y > threshold)
    return X[selected], y[selected]
def clip_outliers(X, lower_pct=1, upper_pct=99):
    """Clip each feature to its [lower_pct, upper_pct] percentile range.

    Bounds are computed per column; returns a new array.
    """
    # One call computes both bounds: result has shape (2, D).
    lo, hi = np.percentile(X, [lower_pct, upper_pct], axis=0)
    # Clamp element-wise; the (D,) bounds broadcast across rows.
    return np.minimum(np.maximum(X, lo), hi)
# ---- VERIFICATION ----
# Rows 1 and 3 each contain a NaN and must be dropped.
X = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, 5.0], [6.0, np.nan]])
y = np.array([10, 20, 30, 40])
X_clean, y_clean = remove_nan_samples(X, y)
assert X_clean.shape[0] == 2, f"Expected 2 valid samples, got {X_clean.shape[0]}"
assert np.array_equal(y_clean, [10, 30]), "Wrong samples retained"
X2 = np.random.randn(100, 5)
y2 = np.random.randn(100)
X_filt, y_filt = filter_by_target(X2, y2, 0.0)
assert np.all(y_filt > 0.0), "All filtered targets should be > 0"
# Wide-spread data so the percentile clipping actually bites.
X3 = np.random.randn(1000, 3) * 100
X_clipped = clip_outliers(X3)
assert X_clipped.max() <= np.percentile(X3, 99, axis=0).max() + 1e-10
print("Challenge 4 passed!")
Challenge 5: Stacking Arrays
Problem: You have embedding vectors from three different models, each of shape (N, D_i) where D_i may differ. (a) Concatenate them horizontally to create a combined feature vector. (b) Given a list of equal-shaped arrays, stack them to create a new batch dimension. (c) Build an augmented matrix [X | 1] by adding a column of ones (bias term) to a feature matrix.
import numpy as np
# ---- CHALLENGE ----
def concat_embeddings(emb_list):
    """Concatenate list of (N, D_i) arrays horizontally -> (N, sum(D_i)).

    All arrays must share the same first dimension N.
    """
    pass
def stack_batches(batch_list):
    """Stack list of (H, W) arrays into (B, H, W).

    All arrays must have identical shapes; a new leading axis is created.
    """
    pass
def add_bias_column(X):
    """Add column of ones to X: (N, D) -> (N, D+1).

    The ones column is appended on the right (bias term for linear models).
    """
    pass
# ---- SOLUTION ----
def concat_embeddings(emb_list):
    """Concatenate list of (N, D_i) arrays horizontally -> (N, sum(D_i))."""
    # For 2-D inputs, hstack is concatenation along the feature axis.
    return np.hstack(emb_list)
def stack_batches(batch_list):
    """Stack list of (H, W) arrays into (B, H, W)."""
    # Unlike concatenate, stack inserts a brand-new leading axis.
    return np.stack(batch_list)
def add_bias_column(X):
    """Add column of ones to the right of X: (N, D) -> (N, D+1)."""
    # column_stack accepts a 1-D array and appends it as a column.
    # Equivalent forms: np.hstack([X, ones[:, None]]) or
    # np.concatenate([X, ones[:, None]], axis=1).
    return np.column_stack([X, np.ones(X.shape[0])])
# ---- VERIFICATION ----
# Three embedding sources with different widths: 64 + 128 + 32 = 224.
emb1 = np.random.randn(100, 64)
emb2 = np.random.randn(100, 128)
emb3 = np.random.randn(100, 32)
combined = concat_embeddings([emb1, emb2, emb3])
assert combined.shape == (100, 224), f"Expected (100, 224), got {combined.shape}"
imgs = [np.random.randn(28, 28) for _ in range(16)]
batch = stack_batches(imgs)
assert batch.shape == (16, 28, 28), f"Expected (16, 28, 28), got {batch.shape}"
X = np.random.randn(50, 10)
X_aug = add_bias_column(X)
assert X_aug.shape == (50, 11), f"Expected (50, 11), got {X_aug.shape}"
assert np.all(X_aug[:, -1] == 1.0), "Last column should be all ones"
print("Challenge 5 passed!")
Challenge 6: Splitting Datasets
Problem: Given features X of shape (N, D) and labels y of shape (N,), split them into train/validation/test sets with ratios 70/15/15. The split must be random (shuffled) and reproducible (seeded). Also implement stratified splitting that preserves class distribution.
import numpy as np
# ---- CHALLENGE ----
def train_val_test_split(X, y, train_ratio=0.7, val_ratio=0.15, seed=42):
    """Split X, y into train/val/test sets with given ratios.

    The split is shuffled and reproducible via the seed. Test gets the
    remainder after the train and val fractions are taken.
    Returns: (X_tr, y_tr, X_val, y_val, X_te, y_te).
    """
    pass
def stratified_split(X, y, train_ratio=0.8, seed=42):
    """Split preserving class distribution in train and test.

    Returns: (X_train, y_train, X_test, y_test).
    """
    pass
# ---- SOLUTION ----
def train_val_test_split(X, y, train_ratio=0.7, val_ratio=0.15, seed=42):
    """Random split into train/val/test.

    Shuffled and reproducible: the same seed always yields the same split.
    Returns: (X_tr, y_tr, X_val, y_val, X_te, y_te).
    """
    rng = np.random.default_rng(seed)
    # A random permutation of row indices, cut into three consecutive runs.
    order = rng.permutation(X.shape[0])
    n_train = int(X.shape[0] * train_ratio)
    n_val = int(X.shape[0] * val_ratio)
    tr = order[:n_train]
    va = order[n_train:n_train + n_val]
    te = order[n_train + n_val:]  # test takes whatever remains
    return X[tr], y[tr], X[va], y[va], X[te], y[te]
def stratified_split(X, y, train_ratio=0.8, seed=42):
    """Split preserving class distribution.

    For each class, that class's sample indices are shuffled and the first
    train_ratio fraction goes to train, the rest to test, so both splits
    keep (approximately) the original class proportions.

    Parameters:
        X: (N, D) features; y: (N,) class labels.
        train_ratio: fraction of each class sent to the train split.
        seed: seed for np.random.default_rng, makes the split reproducible.
    Returns: (X_train, y_train, X_test, y_test).
    """
    rng = np.random.default_rng(seed)
    classes = np.unique(y)
    train_indices = []
    test_indices = []
    for cls in classes:
        cls_indices = np.where(y == cls)[0]
        rng.shuffle(cls_indices)
        split = int(len(cls_indices) * train_ratio)
        train_indices.extend(cls_indices[:split])
        test_indices.extend(cls_indices[split:])
    # dtype=int is required: np.array([]) defaults to float64, and a float
    # array cannot be used as an index — without it, empty input (or an
    # empty split) raised IndexError instead of returning empty arrays.
    train_indices = np.array(train_indices, dtype=int)
    test_indices = np.array(test_indices, dtype=int)
    # Re-shuffle so samples are not grouped by class in the output.
    rng.shuffle(train_indices)
    rng.shuffle(test_indices)
    return (X[train_indices], y[train_indices],
            X[test_indices], y[test_indices])
# ---- VERIFICATION ----
# 1000 samples, 3 roughly balanced classes.
X = np.random.randn(1000, 10)
y = np.random.randint(0, 3, 1000)
X_tr, y_tr, X_val, y_val, X_te, y_te = train_val_test_split(X, y)
assert X_tr.shape[0] == 700, f"Expected 700 train, got {X_tr.shape[0]}"
assert X_val.shape[0] == 150, f"Expected 150 val, got {X_val.shape[0]}"
assert X_te.shape[0] == 150, f"Expected 150 test, got {X_te.shape[0]}"
X_tr2, y_tr2, X_te2, y_te2 = stratified_split(X, y, train_ratio=0.8)
# Check class distribution is preserved (within 5 percentage points)
for cls in [0, 1, 2]:
    orig_ratio = (y == cls).mean()
    train_ratio_cls = (y_tr2 == cls).mean()
    assert abs(orig_ratio - train_ratio_cls) < 0.05, \
        f"Class {cls} ratio differs too much: {orig_ratio:.3f} vs {train_ratio_cls:.3f}"
print("Challenge 6 passed!")
Key Takeaways
- `reshape` and `transpose` usually return views, not copies (`reshape` falls back to copying when the data is not contiguous)
- Broadcasting eliminates the need for explicit loops: `(N, D) - (D,)` subtracts from every row
- Fancy indexing with `[np.arange(N), labels]` is the standard pattern for extracting per-sample class probabilities
- Boolean masking with `X[mask]` is the NumPy equivalent of a SQL WHERE clause
- `np.concatenate` joins along existing axes, `np.stack` creates a new axis
- Always use `np.random.default_rng(seed)` for reproducible random operations
Lilly Tech Systems