Custom Sorting & Comparators
Real-world sorting rarely uses simple numeric ordering. ML systems sort by relevance scores with tiebreakers, merge overlapping time windows, schedule GPU resources, and rearrange features for pipeline optimization. This lesson covers 5 problems that test your ability to define custom ordering logic.
Problem 1: Largest Number (Custom Comparator)
Problem: Given a list of non-negative integers, arrange them to form the largest possible number. Return as a string.
ML Application: Custom ranking where the comparison function is non-trivial (e.g., ranking search results by a combination of relevance and freshness).
from functools import cmp_to_key
def largest_number(nums: list) -> str:
"""Arrange numbers to form the largest number. LeetCode 179.
Time: O(n log n) for sorting
Space: O(n) for string conversion
Key insight: Compare "ab" vs "ba" as strings.
If "ab" > "ba", then a should come before b.
"""
# Convert to strings for concatenation comparison
strs = [str(n) for n in nums]
# Custom comparator: compare concatenations
def compare(a, b):
if a + b > b + a:
return -1 # a should come first
elif a + b < b + a:
return 1 # b should come first
return 0
strs.sort(key=cmp_to_key(compare))
# Handle edge case: all zeros
result = "".join(strs)
return "0" if result[0] == "0" else result
# Example
print(largest_number([10, 2])) # "210"
print(largest_number([3, 30, 34, 5, 9])) # "9534330"
# ML application: sort model versions by composite score
models = [
{"name": "v1.2", "accuracy": 0.92, "latency": 45},
{"name": "v2.0", "accuracy": 0.95, "latency": 120},
{"name": "v1.8", "accuracy": 0.93, "latency": 50},
{"name": "v2.1", "accuracy": 0.94, "latency": 60},
]
def model_comparator(a, b):
"""Compare models: higher accuracy first, then lower latency."""
if a["accuracy"] != b["accuracy"]:
return -1 if a["accuracy"] > b["accuracy"] else 1
return -1 if a["latency"] < b["latency"] else 1
ranked = sorted(models, key=cmp_to_key(model_comparator))
for m in ranked:
print(f" {m['name']}: acc={m['accuracy']}, lat={m['latency']}ms")
Problem 2: Sort by Multiple Keys
Problem: Sort a list of records by multiple criteria with different orderings (ascending/descending).
ML Application: Sorting training examples by difficulty (curriculum learning), sorting features by importance then correlation.
def sort_multi_key(records: list, keys: list) -> list:
"""Sort records by multiple keys with ascending/descending control.
Time: O(n log n * k) where k = number of keys
Space: O(n)
Uses Python's stable sort: sort by LAST key first, then second-to-last, etc.
Since sort is stable, earlier sorts preserve order for equal elements.
"""
# Method 1: Tuple key (simpler, preferred)
# Negate numeric values for descending order
pass
# Practical example: sort ML experiment results
experiments = [
{"name": "exp_1", "val_acc": 0.92, "train_time": 120, "params": 1_000_000},
{"name": "exp_2", "val_acc": 0.95, "train_time": 300, "params": 5_000_000},
{"name": "exp_3", "val_acc": 0.95, "train_time": 150, "params": 2_000_000},
{"name": "exp_4", "val_acc": 0.91, "train_time": 80, "params": 500_000},
{"name": "exp_5", "val_acc": 0.95, "train_time": 200, "params": 3_000_000},
]
# Sort by: val_acc DESC, then train_time ASC, then params ASC
sorted_exps = sorted(
experiments,
key=lambda x: (-x["val_acc"], x["train_time"], x["params"])
)
print("Experiments ranked (best first):")
for exp in sorted_exps:
print(f" {exp['name']}: acc={exp['val_acc']}, "
f"time={exp['train_time']}s, params={exp['params']:,}")
# Output:
# exp_3: acc=0.95, time=150s, params=2,000,000
# exp_5: acc=0.95, time=200s, params=3,000,000
# exp_2: acc=0.95, time=300s, params=5,000,000
# exp_1: acc=0.92, time=120s, params=1,000,000
# exp_4: acc=0.91, time=80s, params=500,000
# Advanced: curriculum learning - sort examples by difficulty
training_data = [
{"text": "The cat sat.", "loss": 0.1, "length": 3},
{"text": "Complex technical document...", "loss": 2.5, "length": 50},
{"text": "Simple sentence here.", "loss": 0.3, "length": 3},
{"text": "Medium complexity text.", "loss": 0.8, "length": 4},
]
# Curriculum: easy examples first (low loss), then hard
curriculum = sorted(training_data, key=lambda x: (x["loss"], x["length"]))
print("\nCurriculum order (easy to hard):")
for item in curriculum:
print(f" loss={item['loss']:.1f}, len={item['length']}: {item['text'][:30]}")
Problem 3: Merge Intervals
Problem: Given a collection of intervals, merge all overlapping intervals.
ML Application: Merging overlapping time windows in event data, combining adjacent attention spans, merging prediction confidence intervals.
def merge_intervals(intervals: list) -> list:
"""Merge overlapping intervals. LeetCode 56.
Time: O(n log n) for sorting
Space: O(n) for output
Sort by start time, then iterate and merge overlapping intervals.
Two intervals overlap if current.start <= previous.end.
"""
if not intervals:
return []
# Sort by start time
intervals.sort(key=lambda x: x[0])
merged = [intervals[0]]
for start, end in intervals[1:]:
# If current interval overlaps with previous
if start <= merged[-1][1]:
# Extend the previous interval
merged[-1][1] = max(merged[-1][1], end)
else:
# No overlap, add new interval
merged.append([start, end])
return merged
# Example
intervals = [[1, 3], [2, 6], [8, 10], [15, 18]]
print(merge_intervals(intervals)) # [[1, 6], [8, 10], [15, 18]]
# ML application: merge overlapping attention windows
attention_spans = [
[0, 50], # tokens 0-50 attend to each other
[30, 80], # tokens 30-80 attend to each other
[100, 150], # separate attention window
[140, 200], # overlaps with previous
]
merged_attention = merge_intervals(attention_spans)
print(f"Merged attention windows: {merged_attention}")
# [[0, 80], [100, 200]]
# ML application: merge active user sessions
sessions = [[0, 30], [5, 10], [25, 50], [60, 90], [85, 100]]
merged_sessions = merge_intervals(sessions)
print(f"User active periods: {merged_sessions}")
# [[0, 50], [60, 100]]
Problem 4: Meeting Rooms II (Minimum Platforms)
Problem: Given meeting time intervals, find the minimum number of conference rooms required.
ML Application: GPU scheduling — finding the minimum number of GPUs needed to run all training jobs. Batch scheduling in inference servers.
import heapq
def min_meeting_rooms(intervals: list) -> int:
"""Minimum meeting rooms / GPUs needed. LeetCode 253.
Time: O(n log n)
Space: O(n)
Sort by start time. Use a min-heap to track end times of
ongoing meetings. If new meeting starts after earliest end,
reuse that room. Otherwise, allocate a new room.
"""
if not intervals:
return 0
intervals.sort(key=lambda x: x[0])
# Min-heap of end times (rooms in use)
rooms = [intervals[0][1]] # First meeting's end time
for start, end in intervals[1:]:
# If earliest room is free (its meeting ended)
if start >= rooms[0]:
heapq.heapreplace(rooms, end) # Reuse room
else:
heapq.heappush(rooms, end) # Need new room
return len(rooms)
# Example
meetings = [[0, 30], [5, 10], [15, 20]]
print(f"Rooms needed: {min_meeting_rooms(meetings)}") # 2
# ML application: GPU scheduling for training jobs
training_jobs = [
[0, 120], # Job A: 0-120 min
[30, 90], # Job B: 30-90 min
[60, 180], # Job C: 60-180 min
[100, 150], # Job D: 100-150 min
[150, 240], # Job E: 150-240 min
]
gpus_needed = min_meeting_rooms(training_jobs)
print(f"Minimum GPUs needed: {gpus_needed}") # 3
# Alternative: sweep line approach (also O(n log n))
def min_rooms_sweep(intervals: list) -> int:
"""Sweep line approach: track start/end events."""
events = []
for start, end in intervals:
events.append((start, 1)) # +1 room at start
events.append((end, -1)) # -1 room at end
events.sort() # Sort by time, -1 before +1 at same time
max_rooms = current = 0
for _, delta in events:
current += delta
max_rooms = max(max_rooms, current)
return max_rooms
print(f"Sweep line result: {min_rooms_sweep(training_jobs)}") # 3
Problem 5: Sort Array by Rearranging
Problem: Rearrange an array such that positive and negative numbers alternate, or sort colors (Dutch National Flag), or wiggle sort.
ML Application: Balanced batch construction for training (alternating easy/hard examples), stratified sampling, diversity-aware ranking.
def sort_colors(nums: list) -> None:
"""Dutch National Flag: sort 0s, 1s, 2s in-place. LeetCode 75.
Time: O(n) single pass
Space: O(1)
Three pointers: lo (0s boundary), hi (2s boundary), mid (scanner)
"""
lo, mid, hi = 0, 0, len(nums) - 1
while mid <= hi:
if nums[mid] == 0:
nums[lo], nums[mid] = nums[mid], nums[lo]
lo += 1
mid += 1
elif nums[mid] == 1:
mid += 1
else: # nums[mid] == 2
nums[mid], nums[hi] = nums[hi], nums[mid]
hi -= 1
# Don't increment mid: swapped element needs checking
# Example
colors = [2, 0, 2, 1, 1, 0]
sort_colors(colors)
print(colors) # [0, 0, 1, 1, 2, 2]
def rearrange_alternating(arr: list) -> list:
"""Rearrange so positive and negative alternate.
ML use case: Create balanced training batches with alternating
positive and negative examples for contrastive learning.
Time: O(n log n)
Space: O(n)
"""
positives = sorted([x for x in arr if x >= 0])
negatives = sorted([x for x in arr if x < 0])
result = []
i = j = 0
while i < len(positives) and j < len(negatives):
result.append(positives[i])
result.append(negatives[j])
i += 1
j += 1
# Append remaining
result.extend(positives[i:])
result.extend(negatives[j:])
return result
# Example
arr = [3, -2, 1, -5, 4, -1, 2, -3]
print(rearrange_alternating(arr))
# [1, -5, 2, -3, 3, -2, 4, -1]
def create_balanced_batch(positives: list, negatives: list, batch_size: int) -> list:
"""Create balanced training batches for contrastive learning.
Alternates positive and negative examples within each batch.
Used in triplet loss, contrastive loss, and InfoNCE training.
Time: O(n)
Space: O(batch_size)
"""
batches = []
pos_idx = neg_idx = 0
half = batch_size // 2
while pos_idx + half <= len(positives) and neg_idx + half <= len(negatives):
batch = []
for i in range(half):
batch.append(("pos", positives[pos_idx + i]))
batch.append(("neg", negatives[neg_idx + i]))
batches.append(batch)
pos_idx += half
neg_idx += half
return batches
# Example
pos_examples = ["good review 1", "good review 2", "good review 3", "good review 4"]
neg_examples = ["bad review 1", "bad review 2", "bad review 3", "bad review 4"]
batches = create_balanced_batch(pos_examples, neg_examples, batch_size=4)
for i, batch in enumerate(batches):
print(f"Batch {i}: {batch}")
Key Takeaways
functools.cmp_to_keyconverts a comparator function to a key function for Python'ssorted()- Multi-key sorting uses tuples as keys — negate numeric values for descending order within the tuple
- Merge intervals is a pattern that appears in session analysis, attention windowing, and time-series segmentation
- Meeting rooms / GPU scheduling uses a min-heap of end times to greedily assign resources
- Dutch National Flag (3-way partition) is the basis of 3-way QuickSort, which handles duplicate keys efficiently
- Balanced batch construction through rearrangement is essential for contrastive and metric learning
Lilly Tech Systems