Advanced

Custom Sorting & Comparators

Real-world sorting rarely uses simple numeric ordering. ML systems sort by relevance scores with tiebreakers, merge overlapping time windows, schedule GPU resources, and rearrange features for pipeline optimization. This lesson covers 5 problems that test your ability to define custom ordering logic.

Problem 1: Largest Number (Custom Comparator)

Problem: Given a list of non-negative integers, arrange them to form the largest possible number. Return as a string.

ML Application: Custom ranking where the comparison function is non-trivial (e.g., ranking search results by a combination of relevance and freshness).

from functools import cmp_to_key

def largest_number(nums: list) -> str:
    """Arrange numbers to form the largest number. LeetCode 179.

    Time:  O(n log n) for sorting
    Space: O(n) for string conversion

    Key insight: Compare "ab" vs "ba" as strings.
    If "ab" > "ba", then a should come before b.
    """
    # Convert to strings for concatenation comparison
    strs = [str(n) for n in nums]

    # Custom comparator: compare concatenations
    def compare(a, b):
        if a + b > b + a:
            return -1  # a should come first
        elif a + b < b + a:
            return 1   # b should come first
        return 0

    strs.sort(key=cmp_to_key(compare))

    # Handle edge case: all zeros
    result = "".join(strs)
    return "0" if result[0] == "0" else result

# Example
print(largest_number([10, 2]))      # "210"
print(largest_number([3, 30, 34, 5, 9]))  # "9534330"

# ML application: sort model versions by composite score
models = [
    {"name": "v1.2", "accuracy": 0.92, "latency": 45},
    {"name": "v2.0", "accuracy": 0.95, "latency": 120},
    {"name": "v1.8", "accuracy": 0.93, "latency": 50},
    {"name": "v2.1", "accuracy": 0.94, "latency": 60},
]

def model_comparator(a, b):
    """Compare models: higher accuracy first, then lower latency."""
    if a["accuracy"] != b["accuracy"]:
        return -1 if a["accuracy"] > b["accuracy"] else 1
    return -1 if a["latency"] < b["latency"] else 1

ranked = sorted(models, key=cmp_to_key(model_comparator))
for m in ranked:
    print(f"  {m['name']}: acc={m['accuracy']}, lat={m['latency']}ms")

Problem 2: Sort by Multiple Keys

Problem: Sort a list of records by multiple criteria with different orderings (ascending/descending).

ML Application: Sorting training examples by difficulty (curriculum learning), sorting features by importance then correlation.

def sort_multi_key(records: list, keys: list) -> list:
    """Sort records by multiple keys with ascending/descending control.

    Time:  O(n log n * k) where k = number of keys
    Space: O(n)

    Uses Python's stable sort: sort by LAST key first, then second-to-last, etc.
    Since sort is stable, earlier sorts preserve order for equal elements.
    """
    # Method 1: Tuple key (simpler, preferred)
    # Negate numeric values for descending order
    pass

# Practical example: sort ML experiment results
experiments = [
    {"name": "exp_1", "val_acc": 0.92, "train_time": 120, "params": 1_000_000},
    {"name": "exp_2", "val_acc": 0.95, "train_time": 300, "params": 5_000_000},
    {"name": "exp_3", "val_acc": 0.95, "train_time": 150, "params": 2_000_000},
    {"name": "exp_4", "val_acc": 0.91, "train_time": 80,  "params": 500_000},
    {"name": "exp_5", "val_acc": 0.95, "train_time": 200, "params": 3_000_000},
]

# Sort by: val_acc DESC, then train_time ASC, then params ASC
sorted_exps = sorted(
    experiments,
    key=lambda x: (-x["val_acc"], x["train_time"], x["params"])
)

print("Experiments ranked (best first):")
for exp in sorted_exps:
    print(f"  {exp['name']}: acc={exp['val_acc']}, "
          f"time={exp['train_time']}s, params={exp['params']:,}")
# Output:
#   exp_3: acc=0.95, time=150s, params=2,000,000
#   exp_5: acc=0.95, time=200s, params=3,000,000
#   exp_2: acc=0.95, time=300s, params=5,000,000
#   exp_1: acc=0.92, time=120s, params=1,000,000
#   exp_4: acc=0.91, time=80s,  params=500,000


# Advanced: curriculum learning - sort examples by difficulty
training_data = [
    {"text": "The cat sat.", "loss": 0.1, "length": 3},
    {"text": "Complex technical document...", "loss": 2.5, "length": 50},
    {"text": "Simple sentence here.", "loss": 0.3, "length": 3},
    {"text": "Medium complexity text.", "loss": 0.8, "length": 4},
]

# Curriculum: easy examples first (low loss), then hard
curriculum = sorted(training_data, key=lambda x: (x["loss"], x["length"]))
print("\nCurriculum order (easy to hard):")
for item in curriculum:
    print(f"  loss={item['loss']:.1f}, len={item['length']}: {item['text'][:30]}")

Problem 3: Merge Intervals

Problem: Given a collection of intervals, merge all overlapping intervals.

ML Application: Merging overlapping time windows in event data, combining adjacent attention spans, merging prediction confidence intervals.

def merge_intervals(intervals: list) -> list:
    """Merge overlapping intervals. LeetCode 56.

    Time:  O(n log n) for sorting
    Space: O(n) for output

    Sort by start time, then iterate and merge overlapping intervals.
    Two intervals overlap if current.start <= previous.end.
    """
    if not intervals:
        return []

    # Sort by start time
    intervals.sort(key=lambda x: x[0])

    merged = [intervals[0]]

    for start, end in intervals[1:]:
        # If current interval overlaps with previous
        if start <= merged[-1][1]:
            # Extend the previous interval
            merged[-1][1] = max(merged[-1][1], end)
        else:
            # No overlap, add new interval
            merged.append([start, end])

    return merged

# Example
intervals = [[1, 3], [2, 6], [8, 10], [15, 18]]
print(merge_intervals(intervals))  # [[1, 6], [8, 10], [15, 18]]

# ML application: merge overlapping attention windows
attention_spans = [
    [0, 50],    # tokens 0-50 attend to each other
    [30, 80],   # tokens 30-80 attend to each other
    [100, 150], # separate attention window
    [140, 200], # overlaps with previous
]
merged_attention = merge_intervals(attention_spans)
print(f"Merged attention windows: {merged_attention}")
# [[0, 80], [100, 200]]

# ML application: merge active user sessions
sessions = [[0, 30], [5, 10], [25, 50], [60, 90], [85, 100]]
merged_sessions = merge_intervals(sessions)
print(f"User active periods: {merged_sessions}")
# [[0, 50], [60, 100]]

Problem 4: Meeting Rooms II (Minimum Platforms)

Problem: Given meeting time intervals, find the minimum number of conference rooms required.

ML Application: GPU scheduling — finding the minimum number of GPUs needed to run all training jobs. Batch scheduling in inference servers.

import heapq

def min_meeting_rooms(intervals: list) -> int:
    """Minimum meeting rooms / GPUs needed. LeetCode 253.

    Time:  O(n log n)
    Space: O(n)

    Sort by start time. Use a min-heap to track end times of
    ongoing meetings. If new meeting starts after earliest end,
    reuse that room. Otherwise, allocate a new room.
    """
    if not intervals:
        return 0

    intervals.sort(key=lambda x: x[0])

    # Min-heap of end times (rooms in use)
    rooms = [intervals[0][1]]  # First meeting's end time

    for start, end in intervals[1:]:
        # If earliest room is free (its meeting ended)
        if start >= rooms[0]:
            heapq.heapreplace(rooms, end)  # Reuse room
        else:
            heapq.heappush(rooms, end)     # Need new room

    return len(rooms)

# Example
meetings = [[0, 30], [5, 10], [15, 20]]
print(f"Rooms needed: {min_meeting_rooms(meetings)}")  # 2

# ML application: GPU scheduling for training jobs
training_jobs = [
    [0, 120],    # Job A: 0-120 min
    [30, 90],    # Job B: 30-90 min
    [60, 180],   # Job C: 60-180 min
    [100, 150],  # Job D: 100-150 min
    [150, 240],  # Job E: 150-240 min
]
gpus_needed = min_meeting_rooms(training_jobs)
print(f"Minimum GPUs needed: {gpus_needed}")  # 3

# Alternative: sweep line approach (also O(n log n))
def min_rooms_sweep(intervals: list) -> int:
    """Sweep line approach: track start/end events."""
    events = []
    for start, end in intervals:
        events.append((start, 1))   # +1 room at start
        events.append((end, -1))    # -1 room at end

    events.sort()  # Sort by time, -1 before +1 at same time

    max_rooms = current = 0
    for _, delta in events:
        current += delta
        max_rooms = max(max_rooms, current)

    return max_rooms

print(f"Sweep line result: {min_rooms_sweep(training_jobs)}")  # 3

Problem 5: Sort Array by Rearranging

Problem: Rearrange an array such that positive and negative numbers alternate, or sort colors (Dutch National Flag), or wiggle sort.

ML Application: Balanced batch construction for training (alternating easy/hard examples), stratified sampling, diversity-aware ranking.

def sort_colors(nums: list) -> None:
    """Dutch National Flag: sort 0s, 1s, 2s in-place. LeetCode 75.

    Time:  O(n) single pass
    Space: O(1)

    Three pointers: lo (0s boundary), hi (2s boundary), mid (scanner)
    """
    lo, mid, hi = 0, 0, len(nums) - 1

    while mid <= hi:
        if nums[mid] == 0:
            nums[lo], nums[mid] = nums[mid], nums[lo]
            lo += 1
            mid += 1
        elif nums[mid] == 1:
            mid += 1
        else:  # nums[mid] == 2
            nums[mid], nums[hi] = nums[hi], nums[mid]
            hi -= 1
            # Don't increment mid: swapped element needs checking

# Example
colors = [2, 0, 2, 1, 1, 0]
sort_colors(colors)
print(colors)  # [0, 0, 1, 1, 2, 2]


def rearrange_alternating(arr: list) -> list:
    """Rearrange so positive and negative alternate.

    ML use case: Create balanced training batches with alternating
    positive and negative examples for contrastive learning.

    Time:  O(n log n)
    Space: O(n)
    """
    positives = sorted([x for x in arr if x >= 0])
    negatives = sorted([x for x in arr if x < 0])

    result = []
    i = j = 0

    while i < len(positives) and j < len(negatives):
        result.append(positives[i])
        result.append(negatives[j])
        i += 1
        j += 1

    # Append remaining
    result.extend(positives[i:])
    result.extend(negatives[j:])

    return result

# Example
arr = [3, -2, 1, -5, 4, -1, 2, -3]
print(rearrange_alternating(arr))
# [1, -5, 2, -3, 3, -2, 4, -1]


def create_balanced_batch(positives: list, negatives: list, batch_size: int) -> list:
    """Create balanced training batches for contrastive learning.

    Alternates positive and negative examples within each batch.
    Used in triplet loss, contrastive loss, and InfoNCE training.

    Time:  O(n)
    Space: O(batch_size)
    """
    batches = []
    pos_idx = neg_idx = 0
    half = batch_size // 2

    while pos_idx + half <= len(positives) and neg_idx + half <= len(negatives):
        batch = []
        for i in range(half):
            batch.append(("pos", positives[pos_idx + i]))
            batch.append(("neg", negatives[neg_idx + i]))
        batches.append(batch)
        pos_idx += half
        neg_idx += half

    return batches

# Example
pos_examples = ["good review 1", "good review 2", "good review 3", "good review 4"]
neg_examples = ["bad review 1", "bad review 2", "bad review 3", "bad review 4"]
batches = create_balanced_batch(pos_examples, neg_examples, batch_size=4)
for i, batch in enumerate(batches):
    print(f"Batch {i}: {batch}")

Key Takeaways

💡

functools.cmp_to_key converts a comparator function to a key function for Python's sorted()
Multi-key sorting uses tuples as keys — negate numeric values for descending order within the tuple
Merge intervals is a pattern that appears in session analysis, attention windowing, and time-series segmentation
Meeting rooms / GPU scheduling uses a min-heap of end times to greedily assign resources
Dutch National Flag (3-way partition) is the basis of 3-way QuickSort, which handles duplicate keys efficiently
Balanced batch construction through rearrangement is essential for contrastive and metric learning

← Previous Top-K Problems Next → Patterns & Tips