Step 1: Data Preparation (Intermediate)
Before building any models, we need clean, well-structured data. This lesson walks through loading the MovieLens 100K dataset, performing exploratory data analysis to understand rating patterns, constructing the user-item interaction matrix, and splitting data for training and evaluation.
Load the MovieLens Dataset
import pandas as pd
import numpy as np

# Load ratings. u.data is tab-separated with no header row, so column names
# are supplied explicitly. The default C parser handles a single-character
# separator, so the slower engine="python" is not needed.
ratings = pd.read_csv(
    "data/ml-100k/u.data",
    sep="\t",
    names=["user_id", "item_id", "rating", "timestamp"],
)

# Load movie metadata. u.item is pipe-separated and latin-1 encoded; the
# columns after "imdb_url" are one flag column per genre.
movies = pd.read_csv(
    "data/ml-100k/u.item",
    sep="|",
    encoding="latin-1",
    names=[
        "item_id", "title", "release_date", "video_release", "imdb_url",
        "unknown", "Action", "Adventure", "Animation", "Children",
        "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
        "Film-Noir", "Horror", "Musical", "Mystery", "Romance",
        "Sci-Fi", "Thriller", "War", "Western",
    ],
)

# Derive the matrix dimensions from the data instead of hard-coding
# 943 x 1682, so the sparsity figure stays correct for any ratings file.
n_users = ratings["user_id"].nunique()
n_items = ratings["item_id"].nunique()
sparsity = 1 - len(ratings) / (n_users * n_items)

# The ":," format adds thousands separators so the printed values match the
# documented output below.
print(f"Ratings: {len(ratings):,}")
print(f"Users: {n_users:,}")
print(f"Movies: {n_items:,}")
print(f"Sparsity: {sparsity:.2%}")

# Output:
# Ratings: 100,000
# Users: 943
# Movies: 1,682
# Sparsity: 93.70%
Exploratory Data Analysis
import matplotlib.pyplot as plt
import seaborn as sns

# Activity counts: how many ratings each user gave and each movie received.
user_counts = ratings.groupby("user_id").size()
item_counts = ratings.groupby("item_id").size()

# One row of three panels: rating values, per-user activity, per-movie activity.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# Panel 1: how often each rating value occurs, ordered by rating value.
rating_frequencies = ratings["rating"].value_counts().sort_index()
rating_frequencies.plot(kind="bar", ax=axes[0], color="#6366f1")
axes[0].set_title("Rating Distribution")
axes[0].set_xlabel("Rating")

# Panels 2 and 3: histograms that differ only in data, colour and title.
histogram_panels = (
    (axes[1], user_counts, "#10b981", "Ratings per User"),
    (axes[2], item_counts, "#f59e0b", "Ratings per Movie"),
)
for ax, counts, color, title in histogram_panels:
    ax.hist(counts, bins=50, color=color)
    ax.set_title(title)
    ax.set_xlabel("Number of Ratings")

plt.tight_layout()
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig("data/eda_plots.png", dpi=150)
plt.show()

# Key statistics
mean_rating = ratings["rating"].mean()
print(f"Mean rating: {mean_rating:.2f}")
print(f"Median ratings per user: {user_counts.median():.0f}")
print(f"Median ratings per movie: {item_counts.median():.0f}")
Build the User-Item Matrix
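The user-item matrix is the foundational data structure for collaborative filtering. Each row represents a user, each column a movie, and each cell holds that user's rating for that movie. Because the matrix is stored in sparse format, unrated cells are not stored at all and simply read back as 0.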
from scipy.sparse import csr_matrix


def build_user_item_matrix(ratings_df, user_map=None, item_map=None):
    """Build a sparse user-item rating matrix.

    Args:
        ratings_df: DataFrame with "user_id", "item_id" and "rating" columns.
        user_map: optional dict mapping user_id -> matrix row index. When
            omitted, a mapping is built from the sorted unique user IDs in
            ratings_df. Pass an existing mapping to build several matrices
            (e.g. train and full) that share one index space.
        item_map: optional dict mapping item_id -> matrix column index, with
            the same semantics as user_map.

    Returns:
        matrix: scipy CSR matrix (n_users x n_items) of float32 ratings
        user_map: dict mapping user_id -> matrix row index
        item_map: dict mapping item_id -> matrix column index
        reverse_item_map: dict mapping matrix column index -> item_id

    Raises:
        ValueError: if a supplied mapping is missing an ID that appears in
            ratings_df.
    """
    import warnings

    # Create mappings from IDs to matrix indices unless the caller supplied them.
    if user_map is None:
        user_ids = sorted(ratings_df["user_id"].unique())
        user_map = {uid: idx for idx, uid in enumerate(user_ids)}
    if item_map is None:
        item_ids = sorted(ratings_df["item_id"].unique())
        item_map = {iid: idx for idx, iid in enumerate(item_ids)}
    reverse_item_map = {idx: iid for iid, idx in item_map.items()}

    # The sparse constructor SUMS entries that share a (row, col) position,
    # so a duplicated (user, item) pair would turn into an out-of-range
    # rating. Keep only the last occurrence and tell the caller about it.
    duplicate_mask = ratings_df.duplicated(subset=["user_id", "item_id"], keep="last")
    if duplicate_mask.any():
        warnings.warn(
            f"Dropping {int(duplicate_mask.sum())} duplicate (user_id, item_id) "
            "ratings; keeping the last occurrence of each pair.",
            stacklevel=2,
        )
        ratings_df = ratings_df[~duplicate_mask]

    # Translate IDs to matrix coordinates. Unknown IDs map to NaN, which can
    # only happen when the caller supplied an incomplete mapping.
    rows = ratings_df["user_id"].map(user_map)
    cols = ratings_df["item_id"].map(item_map)
    if rows.isna().any() or cols.isna().any():
        raise ValueError(
            "ratings_df contains user or item IDs missing from the supplied mapping"
        )
    vals = ratings_df["rating"].to_numpy(dtype=np.float32)

    # Size the matrix from the mappings (not from the rows present) so that
    # a shared mapping always yields the same shape.
    n_users = max(user_map.values(), default=-1) + 1
    n_items = max(item_map.values(), default=-1) + 1

    # Build sparse matrix
    matrix = csr_matrix(
        (vals, (rows.to_numpy(dtype=np.int64), cols.to_numpy(dtype=np.int64))),
        shape=(n_users, n_items),
    )

    # Guard the density ratio against an empty matrix (0 x 0 cells).
    n_cells = matrix.shape[0] * matrix.shape[1]
    density = matrix.nnz / n_cells if n_cells else 0.0

    print(f"Matrix shape: {matrix.shape}")
    print(f"Non-zero entries: {matrix.nnz:,}")
    print(f"Density: {density:.4%}")

    return matrix, user_map, item_map, reverse_item_map


# Build it
ui_matrix, user_map, item_map, reverse_item_map = build_user_item_matrix(ratings)

# Output:
# Matrix shape: (943, 1682)
# Non-zero entries: 100,000
# Density: 6.3047%
Train/Test Split
For recommendation systems, we use a time-based split or a leave-one-out strategy rather than random splitting, because real-world recommendations must predict future behavior from past interactions.
def temporal_train_test_split(ratings_df, test_ratio=0.2):
    """Split ratings by timestamp - train on older, test on newer.

    This mimics the real scenario: predict future ratings from past data.

    Args:
        ratings_df: DataFrame with a "timestamp" column holding Unix epoch
            seconds (it is rendered with unit="s" for the summary printout).
        test_ratio: fraction of rows, taken from the newest end, that go to
            the test set. Must lie in [0, 1].

    Returns:
        (train, test): two independent DataFrame copies, both in ascending
        timestamp order.

    Raises:
        ValueError: if test_ratio is outside [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    # A stable sort keeps rows that share a timestamp in their input order,
    # so which of them fall on each side of the cut does not depend on the
    # sort implementation.
    ratings_sorted = ratings_df.sort_values("timestamp", kind="stable")

    # Split at the time threshold
    split_idx = int(len(ratings_sorted) * (1 - test_ratio))
    train = ratings_sorted.iloc[:split_idx].copy()
    test = ratings_sorted.iloc[split_idx:].copy()

    print(f"Train: {len(train):,} ratings")
    print(f"Test: {len(test):,} ratings")
    print(f"Train time range: {pd.to_datetime(train['timestamp'].min(), unit='s').date()}"
          f" to {pd.to_datetime(train['timestamp'].max(), unit='s').date()}")
    print(f"Test time range: {pd.to_datetime(test['timestamp'].min(), unit='s').date()}"
          f" to {pd.to_datetime(test['timestamp'].max(), unit='s').date()}")

    return train, test


def leave_one_out_split(ratings_df):
    """Leave-one-out: hold out each user's most recent rating for testing.

    Note that a user with a single rating ends up in the test set only and
    has no training rows.

    Args:
        ratings_df: DataFrame with "user_id" and "timestamp" columns.

    Returns:
        (train, test): test holds one row per user (ordered by user_id);
        train holds every remaining row in its original order.
    """
    # Work with row positions rather than index labels: label-based
    # .loc/.drop would select or drop extra rows if the index is not unique.
    positional_ts = ratings_df["timestamp"].reset_index(drop=True)
    user_keys = ratings_df["user_id"].to_numpy()

    # For each user, find the position of the rating with the latest
    # timestamp (the first such row when several share it).
    latest_pos = positional_ts.groupby(user_keys).idxmax().to_numpy(dtype=np.intp)

    held_out = np.zeros(len(ratings_df), dtype=bool)
    held_out[latest_pos] = True

    test = ratings_df.iloc[latest_pos].copy()
    train = ratings_df[~held_out].copy()

    print(f"Train: {len(train):,} ratings")
    print(f"Test: {len(test):,} ratings (one per user)")

    return train, test


# Use temporal split for our project
train_df, test_df = temporal_train_test_split(ratings)

# Build training matrix. Keep the mappings returned with it: the row/column
# indices of train_matrix are defined by the IDs present in train_df, which
# may be fewer than in the full ratings, so mappings built from the full data
# do not necessarily describe this matrix.
train_matrix, train_user_map, train_item_map, train_reverse_item_map = (
    build_user_item_matrix(train_df)
)
Data Quality Checks
def data_quality_report(train_df, test_df):
    """Print and return a handful of sanity checks on the train/test split.

    The returned dict holds, in this order:
        cold_start_users: number of user IDs in test_df that are absent from train_df
        cold_start_items: number of item IDs in test_df that are absent from train_df
        duplicate_ratings: count of repeated (user_id, item_id) rows in train_df
        min_rating / max_rating: observed rating bounds in train_df
    """

    def _unseen_in_train(column):
        # IDs that occur in the test split but never in the training split.
        return set(test_df[column].unique()) - set(train_df[column].unique())

    repeated_pairs = train_df.duplicated(subset=["user_id", "item_id"]).sum()
    train_ratings = train_df["rating"]

    checks = {
        "cold_start_users": len(_unseen_in_train("user_id")),
        "cold_start_items": len(_unseen_in_train("item_id")),
        "duplicate_ratings": repeated_pairs,
        "min_rating": train_ratings.min(),
        "max_rating": train_ratings.max(),
    }

    for key, val in checks.items():
        print(f" {key}: {val}")

    return checks


data_quality_report(train_df, test_df)
import pickle

# Collect everything to persist in a single dict: the two splits, the movie
# metadata, and the ID <-> index mappings.
processed = {
    "train_df": train_df,
    "test_df": test_df,
    "movies": movies,
    "user_map": user_map,
    "item_map": item_map,
    "reverse_item_map": reverse_item_map,
}

# Save processed data
with open("data/processed_data.pkl", "wb") as out_file:
    pickle.dump(processed, out_file)

print("Data saved to data/processed_data.pkl")
Next: Collaborative Filtering
With our data prepared and user-item matrix built, we are ready to implement our first recommendation algorithm: collaborative filtering.
Step 2: Collaborative Filtering →