Step 2: Collaborative Filtering (Intermediate)
Collaborative filtering (CF) is the workhorse of recommendation systems. The core idea is simple: users who agreed in the past will agree in the future. This lesson implements both user-based and item-based CF from scratch, covering the key similarity metrics and prediction formulas.
Similarity Metrics
CF relies on measuring how similar two users (or items) are. We implement two standard metrics, cosine similarity and Pearson correlation:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity


def compute_cosine_similarity(matrix):
    """Compute pairwise cosine similarity between the rows of ``matrix``.

    cosine_sim(a, b) = dot(a, b) / (||a|| * ||b||)

    Args:
        matrix: scipy sparse matrix (n_items x n_features) or
            (n_users x n_features).

    Returns:
        similarity: dense numpy array (n x n) whose diagonal is set to 0.
    """
    sim = cosine_similarity(matrix)
    # Zero out self-similarity so a row never counts as its own neighbor.
    np.fill_diagonal(sim, 0)
    return sim


def compute_pearson_similarity(matrix):
    """Compute pairwise Pearson-style correlation between users.

    Each user's ratings are mean-centered (the mean is taken over rated
    items only, where a value of 0 means "not rated"), which compensates
    for rating scale differences (e.g., one user rates everything 4-5,
    another 1-3). Cosine similarity is then taken on the centered rows.

    Unrated entries stay at 0 after centering, so the correlation is
    computed over all items rather than only the items two users have
    both rated.

    Args:
        matrix: dense numpy array (n_users x n_items), or any object with
            a ``toarray()`` method (e.g. a scipy sparse matrix). The input
            is not modified.

    Returns:
        similarity: dense numpy array (n_users x n_users) whose diagonal
        is set to 0.
    """
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()

    # Work in floating point: centering an integer rating matrix in place
    # would silently truncate the fractional part of every centered value.
    ratings = np.asarray(matrix, dtype=float)
    rated_mask = ratings != 0
    n_rated = rated_mask.sum(axis=1)

    # `where=` without `out=` leaves the skipped slots uninitialized, so
    # supply a zero-filled output for users who have rated nothing.
    user_means = np.divide(
        ratings.sum(axis=1),
        n_rated,
        out=np.zeros(ratings.shape[0], dtype=float),
        where=n_rated > 0,
    )

    # Mean-center rated entries only; unrated entries remain exactly 0.
    centered = np.where(rated_mask, ratings - user_means[:, np.newaxis], 0.0)

    # Cosine similarity on the centered rows gives the correlation.
    sim = cosine_similarity(csr_matrix(centered))
    np.fill_diagonal(sim, 0)
    return sim
User-Based Collaborative Filtering
Find users most similar to the target user, then predict ratings based on what those similar users rated.
class UserBasedCF:
    """User-Based Collaborative Filtering recommender.

    Predicts a rating as the target user's mean rating plus the
    similarity-weighted average of how far the most similar users'
    ratings of that item deviate from their own means. A matrix value
    of 0 means "not rated".

    Args:
        k_neighbors: maximum number of neighbors used per prediction.
        similarity: "pearson" for mean-centered correlation; any other
            value selects plain cosine similarity.

    Raises:
        ValueError: if ``k_neighbors`` is smaller than 1.
    """

    def __init__(self, k_neighbors=20, similarity="pearson"):
        # A k of 0 would make the `[-k:]` slice below select *every*
        # neighbor, and a negative k would drop the best ones.
        if k_neighbors < 1:
            raise ValueError("k_neighbors must be a positive integer")
        self.k = k_neighbors
        self.similarity_type = similarity
        self.user_sim = None
        self.matrix = None
        self.user_means = None

    def fit(self, user_item_matrix):
        """Compute per-user mean ratings and the user-user similarity matrix.

        Args:
            user_item_matrix: (n_users x n_items) ratings, dense or any
                object with ``toarray()``. Stored densely in ``self.matrix``.
        """
        self.matrix = user_item_matrix
        if hasattr(self.matrix, "toarray"):
            self.matrix = self.matrix.toarray()
        self.matrix = np.asarray(self.matrix)

        # Per-user mean rating over rated items only.
        rated_mask = self.matrix != 0
        n_rated = rated_mask.sum(axis=1)
        total_rated = n_rated.sum()
        # Users without any rating fall back to the global mean of all
        # observed ratings, so cold-start predictions stay on the rating
        # scale instead of collapsing to 0.
        global_mean = self.matrix.sum() / total_rated if total_rated else 0.0
        # Masked division avoids the divide-by-zero warning for empty rows.
        self.user_means = np.divide(
            self.matrix.sum(axis=1),
            n_rated,
            out=np.full(self.matrix.shape[0], global_mean, dtype=float),
            where=n_rated > 0,
        )

        if self.similarity_type == "pearson":
            self.user_sim = compute_pearson_similarity(self.matrix)
        else:
            self.user_sim = compute_cosine_similarity(csr_matrix(self.matrix))

        print(f"UserBasedCF fitted: {self.matrix.shape[0]} users, "
              f"{self.matrix.shape[1]} items, k={self.k}")

    def predict(self, user_idx, item_idx):
        """Predict a single user-item rating.

        Args:
            user_idx: row index of the target user.
            item_idx: column index of the target item.

        Returns:
            The predicted rating clipped to [1, 5], or the user's mean
            rating when nobody rated the item or all neighbor
            similarities are zero.
        """
        # Only users who actually rated this item can serve as neighbors.
        item_raters = np.flatnonzero(self.matrix[:, item_idx])
        if item_raters.size == 0:
            return float(self.user_means[user_idx])

        # Keep the k raters with the highest similarity to the target user.
        sims = self.user_sim[user_idx, item_raters]
        top_k_idx = np.argsort(sims)[-self.k:]
        top_k_users = item_raters[top_k_idx]
        top_k_sims = sims[top_k_idx]

        # Absolute values in the denominator keep negative correlations
        # from cancelling the normalization.
        sim_sum = np.abs(top_k_sims).sum()
        if sim_sum == 0:
            return float(self.user_means[user_idx])

        # Weighted average of the neighbors' mean-centered ratings.
        deviations = (self.matrix[top_k_users, item_idx]
                      - self.user_means[top_k_users])
        pred = self.user_means[user_idx] + np.sum(top_k_sims * deviations) / sim_sum
        return float(np.clip(pred, 1, 5))

    def recommend(self, user_idx, n=10):
        """Generate top-N recommendations for a user.

        Args:
            user_idx: row index of the target user.
            n: maximum number of recommendations to return.

        Returns:
            List of ``(item_idx, predicted_score)`` tuples for items the
            user has not rated, best first. Ties keep ascending item
            index order.
        """
        # Unrated items in ascending index order; a stable sort then
        # makes the ordering of tied scores deterministic.
        candidates = np.flatnonzero(self.matrix[user_idx] == 0)
        scores = [
            (int(item_idx), self.predict(user_idx, item_idx))
            for item_idx in candidates
        ]

        # Return top N by predicted score
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:n]


if __name__ == "__main__":
    # Usage. NOTE(review): train_matrix, reverse_item_map and movies are not
    # defined in this section; presumably they come from an earlier step.
    user_cf = UserBasedCF(k_neighbors=20, similarity="pearson")
    user_cf.fit(train_matrix)

    # Get recommendations for user 0
    recs = user_cf.recommend(user_idx=0, n=10)
    for item_idx, score in recs:
        item_id = reverse_item_map[item_idx]
        title = movies[movies["item_id"] == item_id]["title"].values[0]
        print(f" {title}: {score:.2f}")
Item-Based Collaborative Filtering
Instead of finding similar users, find similar items. Item-based CF is often more stable because item-item similarities change less frequently than user-user similarities.
class ItemBasedCF:
    """Item-Based Collaborative Filtering recommender.

    Predicts a user's rating of an item as the similarity-weighted
    average of that user's own ratings of the most similar items. A
    matrix value of 0 means "not rated".

    Args:
        k_neighbors: maximum number of similar items used per prediction.

    Raises:
        ValueError: if ``k_neighbors`` is smaller than 1.
    """

    # Fallback rating used until fit() has seen at least one rating.
    _DEFAULT_RATING = 3.0

    def __init__(self, k_neighbors=20):
        # A k of 0 would make the `[-k:]` slice below select *every*
        # neighbor, and a negative k would drop the best ones.
        if k_neighbors < 1:
            raise ValueError("k_neighbors must be a positive integer")
        self.k = k_neighbors
        self.item_sim = None
        self.matrix = None
        self.global_mean = self._DEFAULT_RATING

    def fit(self, user_item_matrix):
        """Compute the item-item similarity matrix and the global mean rating.

        Args:
            user_item_matrix: (n_users x n_items) ratings, dense or any
                object with ``toarray()``. Stored as given in ``self.matrix``.
        """
        self.matrix = user_item_matrix
        if hasattr(self.matrix, "toarray"):
            dense = self.matrix.toarray()
        else:
            dense = self.matrix

        # Mean over observed (non-zero) ratings, used as the fallback
        # prediction when no similarity information is available.
        n_ratings = np.count_nonzero(dense)
        if n_ratings:
            self.global_mean = float(np.sum(dense) / n_ratings)
        else:
            self.global_mean = self._DEFAULT_RATING

        # Item-item cosine similarity (transpose: items as rows)
        self.item_sim = compute_cosine_similarity(csr_matrix(dense.T))
        print(f"ItemBasedCF fitted: {dense.shape[1]} items, k={self.k}")

    def _user_ratings(self, user_idx):
        """Return one user's ratings as a dense 1-D array."""
        if hasattr(self.matrix, "toarray"):
            # Densify only the requested row; converting the whole matrix
            # on every prediction is needlessly expensive.
            row = self.matrix.tocsr()[user_idx].toarray()
        else:
            row = self.matrix[user_idx]
        return np.asarray(row).ravel()

    def _predict_from_ratings(self, user_ratings, item_idx):
        """Predict the rating of ``item_idx`` from a dense ratings row."""
        # Items this user has rated
        rated_items = np.flatnonzero(user_ratings)
        if rated_items.size == 0:
            return self.global_mean

        # Keep the k rated items most similar to the target item.
        sims = self.item_sim[item_idx, rated_items]
        top_k_idx = np.argsort(sims)[-self.k:]
        top_k_items = rated_items[top_k_idx]
        top_k_sims = sims[top_k_idx]

        sim_sum = np.abs(top_k_sims).sum()
        if sim_sum == 0:
            return self.global_mean

        # Similarity-weighted average of the user's own ratings.
        pred = np.sum(top_k_sims * user_ratings[top_k_items]) / sim_sum
        return float(np.clip(pred, 1, 5))

    def predict(self, user_idx, item_idx):
        """Predict rating for a user-item pair.

        Args:
            user_idx: row index of the target user.
            item_idx: column index of the target item.

        Returns:
            The predicted rating clipped to [1, 5], or ``self.global_mean``
            when the user has no ratings or every similarity is zero.
        """
        return self._predict_from_ratings(self._user_ratings(user_idx), item_idx)

    def recommend(self, user_idx, n=10):
        """Generate top-N recommendations for a user.

        Args:
            user_idx: row index of the target user.
            n: maximum number of recommendations to return.

        Returns:
            List of ``(item_idx, predicted_score)`` tuples for items the
            user has not rated, best first. Ties keep ascending item
            index order.
        """
        # Extract the user's row once and reuse it for every candidate.
        user_ratings = self._user_ratings(user_idx)
        candidates = np.flatnonzero(user_ratings == 0)
        scores = [
            (int(item_idx), self._predict_from_ratings(user_ratings, item_idx))
            for item_idx in candidates
        ]
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:n]


if __name__ == "__main__":
    # Usage. NOTE(review): train_matrix, reverse_item_map and movies are not
    # defined in this section; presumably they come from an earlier step.
    item_cf = ItemBasedCF(k_neighbors=20)
    item_cf.fit(train_matrix)

    recs = item_cf.recommend(user_idx=0, n=10)
    for item_idx, score in recs:
        item_id = reverse_item_map[item_idx]
        title = movies[movies["item_id"] == item_id]["title"].values[0]
        print(f" {title}: {score:.2f}")
User-Based vs Item-Based: When to Use Which
| Aspect | User-Based CF | Item-Based CF |
|---|---|---|
| Scalability | Struggles with many users (O(n_users^2) similarity) | Better when items << users |
| Stability | User preferences shift over time | Item similarities are more stable |
| Cold start (new user) | Cannot find similar users | Can still recommend based on the few rated items |
| Cold start (new item) | Can recommend if similar users rated it | Cannot compute similarity without ratings |
| Serendipity | Higher — discovers items through diverse users | Lower — tends to recommend similar items |
Next: Content-Based Filtering
Add a second recommendation approach that uses movie metadata (genres, descriptions) instead of relying solely on rating patterns.
Step 3: Content-Based Filtering →
Lilly Tech Systems