Advanced
Tree Construction & Serialization
Building trees from traversal data and converting trees to/from strings are essential skills for model serialization (saving/loading decision trees), parsing expressions in code-generating LLMs, and implementing efficient text search with tries. These problems test your understanding of tree structure at a deeper level than traversal alone.
Problem 1: Build Tree from Inorder and Preorder
Problem: Given inorder and preorder traversal arrays of a binary tree, construct and return the tree.
ML Context: When you export a decision tree from scikit-learn as a series of rules (preorder) and sorted thresholds (inorder), you need this algorithm to reconstruct the tree structure for visualization or transfer to another framework.
ML Context: When you export a decision tree from scikit-learn as a series of rules (preorder) and sorted thresholds (inorder), you need this algorithm to reconstruct the tree structure for visualization or transfer to another framework.
class TreeNode:
def __init__(self, val=0, left=None, right=None):
self.val = val
self.left = left
self.right = right
def build_tree(preorder: list, inorder: list) -> TreeNode:
"""Construct binary tree from preorder and inorder traversals.
Key insight: preorder[0] is always the root.
Root's position in inorder splits left/right subtrees.
Time: O(n), Space: O(n) with hash map.
"""
if not preorder or not inorder:
return None
# Map values to indices in inorder for O(1) lookup
inorder_map = {val: idx for idx, val in enumerate(inorder)}
pre_idx = [0] # Use list to maintain state across recursion
def build(in_left, in_right):
if in_left > in_right:
return None
root_val = preorder[pre_idx[0]]
pre_idx[0] += 1
root = TreeNode(root_val)
# Split inorder into left and right subtrees
in_root = inorder_map[root_val]
root.left = build(in_left, in_root - 1)
root.right = build(in_root + 1, in_right)
return root
return build(0, len(inorder) - 1)
# Verify with inorder traversal
def verify_inorder(root):
if not root:
return []
return verify_inorder(root.left) + [root.val] + verify_inorder(root.right)
# Test
preorder = [3, 9, 20, 15, 7]
inorder = [9, 3, 15, 20, 7]
root = build_tree(preorder, inorder)
print(verify_inorder(root)) # [9, 3, 15, 20, 7]
# Tree: 3
# / \
# 9 20
# / \
# 15 7
Problem 2: Serialize and Deserialize Binary Tree
Problem: Design an algorithm to serialize a binary tree to a string and deserialize it back to the original tree structure.
ML Context: This is exactly how model serialization works. scikit-learn's
ML Context: This is exactly how model serialization works. scikit-learn's
joblib.dump() serializes decision tree structures. ONNX format serializes computation graphs. Understanding serialization is essential for model deployment, versioning, and cross-framework compatibility.from collections import deque
class Codec:
"""Serialize/deserialize using preorder with null markers.
Time: O(n) for both operations, Space: O(n).
"""
def serialize(self, root: TreeNode) -> str:
"""Preorder traversal with 'null' for missing children."""
result = []
def preorder(node):
if not node:
result.append("null")
return
result.append(str(node.val))
preorder(node.left)
preorder(node.right)
preorder(root)
return ",".join(result)
def deserialize(self, data: str) -> TreeNode:
"""Rebuild tree from preorder serialization."""
values = iter(data.split(","))
def build():
val = next(values)
if val == "null":
return None
node = TreeNode(int(val))
node.left = build()
node.right = build()
return node
return build()
class CodecBFS:
"""Alternative: BFS (level-order) serialization.
This is the format used by LeetCode's tree visualization.
"""
def serialize(self, root: TreeNode) -> str:
if not root:
return ""
result = []
queue = deque([root])
while queue:
node = queue.popleft()
if node:
result.append(str(node.val))
queue.append(node.left)
queue.append(node.right)
else:
result.append("null")
# Remove trailing nulls
while result and result[-1] == "null":
result.pop()
return ",".join(result)
def deserialize(self, data: str) -> TreeNode:
if not data:
return None
values = data.split(",")
root = TreeNode(int(values[0]))
queue = deque([root])
i = 1
while queue and i < len(values):
node = queue.popleft()
if i < len(values) and values[i] != "null":
node.left = TreeNode(int(values[i]))
queue.append(node.left)
i += 1
if i < len(values) and values[i] != "null":
node.right = TreeNode(int(values[i]))
queue.append(node.right)
i += 1
return root
# Test
root = TreeNode(1, TreeNode(2), TreeNode(3, TreeNode(4), TreeNode(5)))
codec = Codec()
serialized = codec.serialize(root)
print(serialized) # "1,2,null,null,3,4,null,null,5,null,null"
restored = codec.deserialize(serialized)
print(verify_inorder(restored)) # [2, 1, 4, 3, 5]
Problem 3: Trie (Prefix Tree) Implementation
Problem: Implement a trie with insert, search, and startsWith methods.
ML Context: Tries power autocomplete systems, token prefix matching in LLM tokenizers, and efficient vocabulary lookup. BPE (Byte Pair Encoding) tokenization in GPT models uses trie-like structures to find the longest matching token prefix in O(L) time where L is the token length.
ML Context: Tries power autocomplete systems, token prefix matching in LLM tokenizers, and efficient vocabulary lookup. BPE (Byte Pair Encoding) tokenization in GPT models uses trie-like structures to find the longest matching token prefix in O(L) time where L is the token length.
class TrieNode:
def __init__(self):
self.children = {} # char -> TrieNode
self.is_end = False
class Trie:
"""Prefix tree for efficient string operations.
Insert: O(L), Search: O(L), StartsWith: O(L)
where L = length of the word/prefix.
Space: O(N * L) where N = number of words.
"""
def __init__(self):
self.root = TrieNode()
def insert(self, word: str) -> None:
node = self.root
for char in word:
if char not in node.children:
node.children[char] = TrieNode()
node = node.children[char]
node.is_end = True
def search(self, word: str) -> bool:
"""Returns True if the exact word exists in the trie."""
node = self._find_node(word)
return node is not None and node.is_end
def starts_with(self, prefix: str) -> bool:
"""Returns True if any word in the trie starts with prefix."""
return self._find_node(prefix) is not None
def _find_node(self, prefix: str) -> TrieNode:
node = self.root
for char in prefix:
if char not in node.children:
return None
node = node.children[char]
return node
def autocomplete(self, prefix: str, limit: int = 5) -> list:
"""Return up to 'limit' words that start with prefix.
ML Application: autocomplete suggestions.
"""
node = self._find_node(prefix)
if not node:
return []
results = []
def dfs(current, path):
if len(results) >= limit:
return
if current.is_end:
results.append(prefix + "".join(path))
for char in sorted(current.children):
path.append(char)
dfs(current.children[char], path)
path.pop()
dfs(node, [])
return results
# Test
trie = Trie()
words = ["apple", "app", "application", "apply", "apt", "banana"]
for w in words:
trie.insert(w)
print(trie.search("app")) # True
print(trie.search("ap")) # False
print(trie.starts_with("ap")) # True
print(trie.autocomplete("app")) # ['app', 'apple', 'application', 'apply']
print(trie.autocomplete("b")) # ['banana']
Problem 4: Suffix Tree Basics
Problem: Build a simplified suffix trie for a string and use it to check if a pattern is a substring.
ML Context: Suffix trees/arrays enable O(m) substring search (where m = pattern length), making them essential for genomic sequence analysis (finding DNA motifs), plagiarism detection in NLP, and efficient pattern matching in log analysis for ML system monitoring.
ML Context: Suffix trees/arrays enable O(m) substring search (where m = pattern length), making them essential for genomic sequence analysis (finding DNA motifs), plagiarism detection in NLP, and efficient pattern matching in log analysis for ML system monitoring.
class SuffixTrie:
"""Simplified suffix trie for substring search.
Build: O(n^2), Search: O(m) where n = text length, m = pattern length.
Note: Production systems use Ukkonen's algorithm for O(n) construction.
"""
def __init__(self, text: str):
self.root = TrieNode()
self.text = text
self._build(text)
def _build(self, text: str):
"""Insert all suffixes of text into the trie."""
for i in range(len(text)):
node = self.root
for char in text[i:]:
if char not in node.children:
node.children[char] = TrieNode()
node = node.children[char]
node.is_end = True
def contains(self, pattern: str) -> bool:
"""Check if pattern is a substring of the original text."""
node = self.root
for char in pattern:
if char not in node.children:
return False
node = node.children[char]
return True
def count_occurrences(self, pattern: str) -> int:
"""Count how many times pattern appears as substring."""
node = self.root
for char in pattern:
if char not in node.children:
return 0
node = node.children[char]
# Count leaf nodes reachable from this point
count = 0
def count_leaves(n):
nonlocal count
if n.is_end:
count += 1
for child in n.children.values():
count_leaves(child)
count_leaves(node)
return count
# Test
st = SuffixTrie("banana$")
print(st.contains("ban")) # True
print(st.contains("ana")) # True
print(st.contains("xyz")) # False
print(st.count_occurrences("an")) # 2 ("ban_an_a" has "an" at positions 1 and 3)
Problem 5: Expression Tree Evaluation
Problem: Given a binary expression tree where leaves are operands and internal nodes are operators (+, -, *, /), evaluate the expression.
ML Context: Neural network computation graphs are expression trees. Each operation node (matmul, add, relu) takes inputs from child nodes and produces outputs. Forward pass = postorder evaluation. Understanding expression trees is fundamental to how PyTorch autograd and TensorFlow eager mode work.
ML Context: Neural network computation graphs are expression trees. Each operation node (matmul, add, relu) takes inputs from child nodes and produces outputs. Forward pass = postorder evaluation. Understanding expression trees is fundamental to how PyTorch autograd and TensorFlow eager mode work.
def evaluate_expression_tree(root: TreeNode) -> float:
"""Evaluate a binary expression tree using postorder traversal.
Leaves contain operands (numbers), internal nodes contain operators.
Time: O(n), Space: O(h).
"""
if not root:
return 0
# Leaf node = operand
if not root.left and not root.right:
return float(root.val)
left_val = evaluate_expression_tree(root.left)
right_val = evaluate_expression_tree(root.right)
if root.val == '+':
return left_val + right_val
elif root.val == '-':
return left_val - right_val
elif root.val == '*':
return left_val * right_val
elif root.val == '/':
return left_val / right_val
return 0
def build_expression_tree(postfix: list) -> TreeNode:
"""Build expression tree from postfix (RPN) expression.
Time: O(n), Space: O(n).
"""
operators = {'+', '-', '*', '/'}
stack = []
for token in postfix:
node = TreeNode(token)
if token in operators:
node.right = stack.pop()
node.left = stack.pop()
stack.append(node)
return stack[0]
def tree_to_infix(root: TreeNode) -> str:
"""Convert expression tree to infix notation with parentheses.
Useful for debugging computation graphs.
"""
if not root:
return ""
if not root.left and not root.right:
return str(root.val)
left = tree_to_infix(root.left)
right = tree_to_infix(root.right)
return f"({left} {root.val} {right})"
# Test: Expression: (3 + 4) * 2 = 14
# Postfix: 3 4 + 2 *
postfix = ['3', '4', '+', '2', '*']
expr_tree = build_expression_tree(postfix)
print(evaluate_expression_tree(expr_tree)) # 14.0
print(tree_to_infix(expr_tree)) # ((3 + 4) * 2)
# ML Application: Simple computation graph
# forward pass: y = relu(W * x + b)
# This is a tree: relu -> add -> [matmul -> [W, x], b]
print("\n# Computation graph evaluation order (postorder):")
print("# 1. Compute W * x (matmul)")
print("# 2. Compute (W*x) + b (add)")
print("# 3. Compute relu(W*x + b) (activation)")
print("# This is exactly postorder traversal of the expression tree")
Pattern recognition: Tree construction problems always require identifying the root and splitting into subtrees. The root comes from one traversal (preorder gives root first, postorder gives root last), and the split point comes from another (inorder). Serialization always needs a way to mark null children — either explicit markers or BFS level-order with trailing nulls removed.
Lilly Tech Systems