Intermediate
Experiment Tracking
Log experiments, parameters, metrics, and artifacts with MLflow.
Training with MLflow Tracking
# src/train.py
import json
from pathlib import Path

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
import yaml
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Module-level MLflow configuration (runs at import time): point the client
# at the local tracking server and group every run under one experiment.
# NOTE(review): assumes a tracking server is already listening on :5000 —
# the script fails at run time if it is not reachable; confirm in deployment.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("mlops-pipeline")
def train():
    """Train a classifier, log the run to MLflow, and persist artifacts.

    Reads hyperparameters from the ``model`` section of ``params.yaml`` and
    the processed train/test splits from ``data/processed/``. Logs params,
    dataset sizes, and evaluation metrics to the active MLflow experiment,
    saves the fitted model both as an MLflow artifact and as
    ``models/model.joblib``, and writes ``metrics.json`` for DVC-style
    metric tracking.

    Returns:
        dict: evaluation metrics (``accuracy``, ``f1``, ``precision``,
        ``recall``) computed on the test split.
    """
    with open("params.yaml") as f:
        params = yaml.safe_load(f)["model"]

    train_df = pd.read_csv("data/processed/train.csv")
    test_df = pd.read_csv("data/processed/test.csv")

    # Convention: the label is the last column of the processed CSVs.
    target = train_df.columns[-1]
    X_train = train_df.drop(columns=[target])
    y_train = train_df[target]
    X_test = test_df.drop(columns=[target])
    y_test = test_df[target]

    with mlflow.start_run():
        mlflow.log_params(params)
        mlflow.log_param("train_size", len(X_train))
        mlflow.log_param("test_size", len(X_test))

        # Any "type" other than "random_forest" falls through to gradient
        # boosting (original behavior — no validation of unknown values).
        if params["type"] == "random_forest":
            model = RandomForestClassifier(
                n_estimators=params["n_estimators"],
                max_depth=params["max_depth"],
                random_state=params["random_state"])
        else:
            model = GradientBoostingClassifier(
                n_estimators=params["n_estimators"],
                max_depth=params["max_depth"],
                random_state=params["random_state"])

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        # Weighted averaging so multi-class / imbalanced labels are handled.
        metrics = {
            "accuracy": accuracy_score(y_test, preds),
            "f1": f1_score(y_test, preds, average="weighted"),
            "precision": precision_score(y_test, preds, average="weighted"),
            "recall": recall_score(y_test, preds, average="weighted"),
        }
        for name, value in metrics.items():
            mlflow.log_metric(name, value)

        mlflow.sklearn.log_model(model, "model")

        # Fix: joblib.dump raises FileNotFoundError when models/ is missing
        # (typical on a fresh checkout) — create the directory first.
        Path("models").mkdir(parents=True, exist_ok=True)
        joblib.dump(model, "models/model.joblib")

        with open("metrics.json", "w") as f:
            json.dump(metrics, f, indent=2)

        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"F1: {metrics['f1']:.4f}")

    return metrics
# Script entry point: run one training job when executed directly
# (e.g. `python src/train.py`); importing the module does not train.
if __name__ == "__main__":
    train()
MLflow UI: Visit http://localhost:5000 to compare experiments, view metrics charts, and download artifacts.
Lilly Tech Systems