Intermediate

MLflow Tracking

Master the MLflow Tracking API — log parameters, metrics, and artifacts, use autologging, and compare experiments.

Experiments and Runs

MLflow organizes tracking into two levels:

  • Experiment: A named collection of runs, typically for a specific ML task (e.g., "customer-churn-prediction").
  • Run: A single execution of training code within an experiment. Each run tracks parameters, metrics, artifacts, and tags.

Logging Parameters

Python — Logging parameters
import mlflow

# Parameters are immutable key/value inputs of a run (hyperparameters,
# config values). They are logged once and cannot be overwritten.
with mlflow.start_run():
    # Log individual parameters
    mlflow.log_param("learning_rate", 0.01)
    mlflow.log_param("batch_size", 32)
    mlflow.log_param("model_type", "gradient_boosting")

    # Log multiple parameters at once
    params = {
        "n_estimators": 200,
        "max_depth": 8,
        "min_samples_split": 5,
        "subsample": 0.8,
    }
    # log_params sends the whole dict in a single batched call
    mlflow.log_params(params)

Logging Metrics

Python — Logging metrics
# NOTE(review): train_one_epoch, evaluate, model, train_loader, and
# val_loader are assumed to be defined elsewhere in the surrounding code.
with mlflow.start_run():
    # Log a single metric
    mlflow.log_metric("accuracy", 0.95)
    mlflow.log_metric("f1_score", 0.93)

    # Log metrics over time (e.g., per epoch)
    for epoch in range(100):
        train_loss = train_one_epoch(model, train_loader)
        val_loss = evaluate(model, val_loader)

        # `step` gives each value an x-axis position so the MLflow UI can
        # render the metric as a curve rather than a single scalar.
        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("val_loss", val_loss, step=epoch)

    # Log multiple metrics at once
    mlflow.log_metrics({
        "precision": 0.94,
        "recall": 0.92,
        "auc": 0.97,
    })

Logging Artifacts

Python — Logging artifacts
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# NOTE(review): assumes y_test, y_pred, feature_names, and importances are
# defined elsewhere, and that config.yaml and plots/ exist on disk.
with mlflow.start_run():
    # Log a single file
    mlflow.log_artifact("config.yaml")

    # Log a directory of files (stored under the "figures" artifact path)
    mlflow.log_artifacts("plots/", artifact_path="figures")

    # Create and log a plot
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots()
    ConfusionMatrixDisplay(cm).plot(ax=ax)
    fig.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    # Close the figure once it is saved and logged: matplotlib keeps every
    # open figure alive, so repeated runs would otherwise leak memory.
    plt.close(fig)

    # Log a text file
    with open("feature_importance.txt", "w") as f:
        for name, imp in zip(feature_names, importances):
            f.write(f"{name}: {imp:.4f}\n")
    mlflow.log_artifact("feature_importance.txt")

Autologging

MLflow can automatically log parameters, metrics, and models for supported frameworks:

Python — Autologging
# Enable autologging for all supported frameworks
mlflow.autolog()

# Or enable for specific frameworks
mlflow.sklearn.autolog()
mlflow.pytorch.autolog()
mlflow.tensorflow.autolog()
mlflow.xgboost.autolog()
mlflow.lightgbm.autolog()

# Now just train normally - everything is logged automatically
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): X_train / y_train are assumed to be defined elsewhere.
with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100, max_depth=10)
    # fit() is patched by autolog, so no explicit log_* calls are needed.
    model.fit(X_train, y_train)
    # Parameters, metrics, model, and feature importance are all logged!
Autolog captures: All constructor parameters, training metrics (accuracy, F1, etc.), the trained model artifact, feature importance plots, and even the model signature. It's the fastest way to start tracking.

Custom Tags

Python — Using tags
# Tags are mutable annotations used to organize and filter runs
# (unlike params, tags can be updated after being set).
with mlflow.start_run():
    # Set individual tags
    mlflow.set_tag("team", "fraud-detection")
    mlflow.set_tag("data_version", "v2.3")
    mlflow.set_tag("environment", "staging")

    # Set multiple tags
    mlflow.set_tags({
        "model_type": "ensemble",
        "feature_set": "v3",
        "priority": "high",
    })

Searching and Comparing Runs

Python — Searching runs programmatically
import mlflow

# search_runs returns a pandas DataFrame with one row per run; params and
# metrics appear as "params.<name>" / "metrics.<name>" columns.
runs = mlflow.search_runs(
    experiment_names=["customer-churn"],
    filter_string="metrics.accuracy > 0.9 AND params.model_type = 'gradient_boosting'",
    order_by=["metrics.f1_score DESC"],
    max_results=10,
)

# Display results
print(runs[["run_id", "params.model_type", "metrics.accuracy", "metrics.f1_score"]])

# Get the best run — guard against an empty result set first; otherwise
# .iloc[0] raises IndexError when no run matches the filter.
if runs.empty:
    print("No runs matched the filter.")
else:
    # Rows are already sorted by f1_score descending, so row 0 is the best.
    best_run = runs.iloc[0]
    print(f"Best run: {best_run.run_id}")
    print(f"  Accuracy: {best_run['metrics.accuracy']:.4f}")
    print(f"  F1 Score: {best_run['metrics.f1_score']:.4f}")

Framework Examples

scikit-learn

Python — MLflow with scikit-learn
import mlflow
# Patch scikit-learn estimators so fit() logs params/metrics/model automatically.
mlflow.sklearn.autolog()

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# NOTE(review): X_train / y_train are assumed to be defined elsewhere.
with mlflow.start_run():
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(C=1.0, max_iter=200))
    ])
    pipe.fit(X_train, y_train)
    # Everything logged automatically

PyTorch

Python — MLflow with PyTorch
import mlflow
import torch
import torch.nn as nn

mlflow.pytorch.autolog()

# NOTE(review): train_loader is assumed to be defined elsewhere.
with mlflow.start_run():
    model = nn.Sequential(
        nn.Linear(784, 256),
        nn.ReLU(),
        nn.Linear(256, 10),
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(10):
        # Accumulate per-batch losses so the logged metric reflects the
        # whole epoch; the original logged only the final batch's loss,
        # which is a noisy and misleading estimate of training loss.
        epoch_loss = 0.0
        num_batches = 0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            num_batches += 1

        # Mean training loss over the epoch (max() guards an empty loader).
        mlflow.log_metric("train_loss", epoch_loss / max(num_batches, 1), step=epoch)

TensorFlow / Keras

Python — MLflow with TensorFlow
import mlflow
import tensorflow as tf

# Hooks Keras model.fit() so per-epoch metrics, hyperparameters, and the
# trained model are captured automatically.
mlflow.tensorflow.autolog()

# NOTE(review): X_train / y_train are assumed to be defined elsewhere.
with mlflow.start_run():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation='softmax'),
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # validation_split holds out 20% of the training data for val metrics.
    model.fit(X_train, y_train, epochs=10, validation_split=0.2)
    # All metrics, params, and model logged automatically