Intermediate

Experiment Tracking

Log experiments, parameters, metrics, and artifacts with MLflow.

Training with MLflow Tracking

# src/train.py
import mlflow
import mlflow.sklearn
import pandas as pd
import yaml
import json
import os
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import joblib

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("mlops-pipeline")

def train():
    with open("params.yaml") as f:
        params = yaml.safe_load(f)["model"]

    train_df = pd.read_csv("data/processed/train.csv")
    test_df = pd.read_csv("data/processed/test.csv")

    # Convention: the last column of the processed CSVs is the target
    target = train_df.columns[-1]
    X_train = train_df.drop(columns=[target])
    y_train = train_df[target]
    X_test = test_df.drop(columns=[target])
    y_test = test_df[target]

    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_param("train_size", len(X_train))
        mlflow.log_param("test_size", len(X_test))

        if params["type"] == "random_forest":
            model = RandomForestClassifier(
                n_estimators=params["n_estimators"],
                max_depth=params["max_depth"],
                random_state=params["random_state"])
        else:
            model = GradientBoostingClassifier(
                n_estimators=params["n_estimators"],
                max_depth=params["max_depth"],
                random_state=params["random_state"])

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        metrics = {
            "accuracy": accuracy_score(y_test, preds),
            "f1": f1_score(y_test, preds, average="weighted"),
            "precision": precision_score(y_test, preds, average="weighted"),
            "recall": recall_score(y_test, preds, average="weighted"),
        }

        for k, v in metrics.items():
            mlflow.log_metric(k, v)

        mlflow.sklearn.log_model(model, "model")

        # Also keep a local copy for downstream pipeline stages
        os.makedirs("models", exist_ok=True)
        joblib.dump(model, "models/model.joblib")

        with open("metrics.json", "w") as f:
            json.dump(metrics, f, indent=2)

        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1: {metrics['f1']:.4f}")
        return metrics

if __name__ == "__main__":
    train()
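
Example params.yaml

train.py reads its hyperparameters from the model section of params.yaml. A minimal file matching the keys used above might look like this (the values are illustrative, not tuned):

# params.yaml
model:
  type: random_forest   # any other value selects GradientBoostingClassifier
  n_estimators: 100
  max_depth: 5
  random_state: 42
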
💡 MLflow UI: Visit http://localhost:5000 to compare experiments, view metrics charts, and download artifacts.
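
Comparing Runs Programmatically

The UI is the usual way to compare runs, but the same data can also be pulled into a script or notebook. The sketch below assumes the tracking server above is running, that the experiment already has at least one finished run, and a recent MLflow version that supports search_runs(experiment_names=...); the file name compare_runs.py is only illustrative.

# src/compare_runs.py
import mlflow
import mlflow.sklearn

mlflow.set_tracking_uri("http://localhost:5000")

# Fetch every run in the experiment as a pandas DataFrame, best accuracy first
runs = mlflow.search_runs(
    experiment_names=["mlops-pipeline"],
    order_by=["metrics.accuracy DESC"],
)
print(runs[["run_id", "params.type", "metrics.accuracy", "metrics.f1"]].head())

# Reload the top run's logged model for inspection or batch scoring
best_run_id = runs.iloc[0]["run_id"]
model = mlflow.sklearn.load_model(f"runs:/{best_run_id}/model")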