Part 5 — Experiment Tracking

Overview

You will run dozens of experiments — different learning rates, LoRA ranks, datasets, and architectures. Without systematic tracking, you will lose the results that matter. This article shows how to log runs with Weights & Biases and MLflow, version artifacts, and run reproducible hyperparameter sweeps.


1. Why Tracking Matters

Without Tracking                        | With Tracking
----------------------------------------|---------------------------------------------
"Which run was best again?"             | Dashboard shows all runs side by side
Rerunning experiments to compare        | Logged metrics + config available instantly
"Did I use lr=2e-4 or 3e-4?"            | Every hyperparameter stored per run
Model saved over by next experiment     | Every checkpoint versioned
Non-reproducible results                | Exact seed, config, and data version logged

2. Weights & Biases (W&B)

W&B is the de facto standard for deep learning experiment tracking.

Setup

pip install wandb
wandb login   # paste your API key from wandb.ai/authorize
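
On headless machines or in CI, skip the interactive login and supply the key via an environment variable instead:

export WANDB_API_KEY=<your-key>   # e.g. injected from your CI secret store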

Basic Logging

import wandb

# Start a run
wandb.init(
    project="llm-finetuning",
    name="mistral-lora-r16",
    config={
        "model": "mistralai/Mistral-7B-v0.1",
        "lora_r": 16,
        "lora_alpha": 32,
        "learning_rate": 2e-4,
        "batch_size": 16,
        "epochs": 3,
    },
)

# Log metrics inside your training loop
# (train_step, scheduler, and grad_norm are placeholders from your own loop)
for step, batch in enumerate(train_loader):
    loss = train_step(batch)

    wandb.log({
        "train/loss": loss,
        "train/learning_rate": scheduler.get_last_lr()[0],
        "train/grad_norm": grad_norm,
    }, step=step)

# Log eval metrics
wandb.log({
    "eval/loss": eval_loss,
    "eval/rouge1": rouge_result["rouge1"],
    "eval/perplexity": perplexity,
})

wandb.finish()
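
Scalars are not the only loggable type. As a sketch, a few sample generations can be logged as a wandb.Table for eyeballing output quality per run; prompts and generate_fn below are placeholders for your own eval prompts and generation helper:

table = wandb.Table(columns=["prompt", "completion"])
for prompt in prompts:                           # placeholder prompt list
    table.add_data(prompt, generate_fn(prompt))  # placeholder generation helper
wandb.log({"eval/samples": table})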

W&B with HuggingFace Trainer

from transformers import TrainingArguments

args = TrainingArguments(
    ...
    report_to="wandb",
    run_name="mistral-lora-r16",    # W&B run name
    logging_steps=25,               # log every N steps
)
# Trainer handles wandb.init/finish automatically
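
The Trainer integration reads its target project from the WANDB_PROJECT environment variable (it defaults to "huggingface"), so set it before training starts:

import os
os.environ["WANDB_PROJECT"] = "llm-finetuning"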

Logging Artifacts (Model Checkpoints)

artifact = wandb.Artifact(
    name="mistral-lora-r16",
    type="model",
    metadata={"rouge1": 0.42, "perplexity": 12.3},
)
artifact.add_dir("checkpoints/mistral-lora-r16")
wandb.log_artifact(artifact)
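
Later, in an evaluation job or on another machine, the same artifact can be pulled back down; use_artifact also records lineage between the producing and consuming runs:

import wandb

run = wandb.init(project="llm-finetuning", job_type="eval")
artifact = run.use_artifact("mistral-lora-r16:latest")
checkpoint_dir = artifact.download()  # downloads to a local cache, returns the path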

Hyperparameter Sweeps

Define a sweep config and launch one or more agents; W&B serves each agent the next hyperparameter combination to try:

# Sweep configuration (a YAML file passed to `wandb sweep` also works)
sweep_config = {
    "method": "bayes",               # bayes | grid | random
    "metric": {"name": "eval/loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"distribution": "log_uniform_values", "min": 1e-5, "max": 3e-4},
        "lora_r":        {"values": [8, 16, 32, 64]},
        "lora_alpha":    {"values": [16, 32]},
        "warmup_ratio":  {"distribution": "uniform", "min": 0.01, "max": 0.1},
    },
}

sweep_id = wandb.sweep(sweep_config, project="llm-finetuning")

def train_for_sweep():
    # W&B injects this run's sampled hyperparameters into wandb.config
    with wandb.init() as run:
        cfg = run.config

        # Build model and trainer using cfg values
        lora_config = LoraConfig(r=cfg.lora_r, lora_alpha=cfg.lora_alpha, ...)
        args = TrainingArguments(
            learning_rate=cfg.learning_rate,
            warmup_ratio=cfg.warmup_ratio,
            report_to="wandb",
            ...
        )
        trainer = SFTTrainer(model=model, args=args, ...)
        trainer.train()

# Launch agents — each agent picks up a sweep run
wandb.agent(sweep_id, function=train_for_sweep, count=20)
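
Agents can also be launched from the shell, which makes it easy to spread a sweep across several GPU machines; note the CLI form expects the sweep config to define a program entry point (e.g. program: train.py) rather than a Python function:

# On each machine; the sweep ID is printed by wandb.sweep()
wandb agent your-entity/llm-finetuning/<sweep_id>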

3. MLflow

MLflow is open-source and self-hostable — a good choice if data cannot leave your infrastructure.

Setup

pip install mlflow
mlflow server --host 0.0.0.0 --port 5000 &   # local tracking server
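
By default the server stores runs on the local filesystem. For anything long-lived, point it at a database backend and an artifact root (the values below are illustrative):

mlflow server \
  --backend-store-uri sqlite:///mlflow.db \
  --default-artifact-root ./mlruns \
  --host 0.0.0.0 --port 5000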

Basic Logging

import mlflow
import mlflow.pytorch

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("llm-finetuning")

with mlflow.start_run(run_name="mistral-lora-r16"):
    # Log hyperparameters
    mlflow.log_params({
        "model": "mistralai/Mistral-7B-v0.1",
        "lora_r": 16,
        "learning_rate": 2e-4,
        "batch_size": 16,
    })

    for epoch in range(num_epochs):
        train_loss = run_epoch(train_loader)
        eval_loss  = evaluate(val_loader)

        mlflow.log_metrics({
            "train_loss": train_loss,
            "eval_loss":  eval_loss,
        }, step=epoch)

    # Log model artifact
    mlflow.pytorch.log_model(model, "model")

    # Log any file
    mlflow.log_artifact("results/eval_metrics.json")

MLflow with HuggingFace Trainer

import os

from transformers import TrainingArguments

# Point the integration at your tracking server and experiment before
# training starts (mlflow.set_tracking_uri / mlflow.set_experiment work too)
os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"
os.environ["MLFLOW_EXPERIMENT_NAME"] = "llm-finetuning"

args = TrainingArguments(
    ...
    report_to="mlflow",
)

Querying Results Programmatically

import mlflow

client = mlflow.MlflowClient("http://localhost:5000")
experiment = client.get_experiment_by_name("llm-finetuning")

runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.eval_loss ASC"],
    max_results=10,
)

for run in runs:
    print(
        f"Run: {run.info.run_name} | "
        f"eval_loss: {run.data.metrics['eval_loss']:.4f} | "
        f"lr: {run.data.params['learning_rate']}"
    )
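
Because the query sorts by eval_loss ascending, runs[0] is the best run. If its model was logged with mlflow.pytorch.log_model as above, it can be reloaded straight from the tracking server (a sketch; pulling a full 7B checkpoint this way can be slow):

import mlflow.pytorch

best = runs[0]
model = mlflow.pytorch.load_model(f"runs:/{best.info.run_id}/model")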

4. Reproducibility

A result you cannot reproduce is a result you cannot trust.

import hashlib
import os
import platform
import random
import subprocess

import numpy as np
import torch
import transformers
import wandb

def set_seed(seed: int = 42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For fully deterministic results (slower):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# Log everything needed to reproduce the run
metadata = {
    "seed": 42,
    "python_version": platform.python_version(),
    "torch_version": torch.__version__,
    "transformers_version": transformers.__version__,
    "dataset_hash": hashlib.md5(open("data/train.jsonl", "rb").read()).hexdigest(),
    "git_commit": subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip(),
}
wandb.config.update(metadata)
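
The cuDNN flags above cover convolution kernels. For stricter, op-level determinism, PyTorch can be told to fail loudly on any non-deterministic kernel; the cuBLAS environment variable must be set before the first CUDA call:

import os
import torch

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # required for deterministic cuBLAS
torch.use_deterministic_algorithms(True)  # raises on ops without deterministic impls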

5. Config Management with Dataclasses

Keep all hyperparameters in a single typed config — no magic strings scattered through training code.

from dataclasses import dataclass, field, asdict
from typing import Optional
import json

@dataclass
class ExperimentConfig:
    # Model
    model_name: str = "mistralai/Mistral-7B-v0.1"
    # LoRA
    lora_r: int = 16
    lora_alpha: int = 32
    lora_target_modules: list[str] = field(default_factory=lambda: ["q_proj", "v_proj"])
    # Training
    learning_rate: float = 2e-4
    num_epochs: int = 3
    per_device_batch_size: int = 2
    gradient_accumulation_steps: int = 16
    warmup_ratio: float = 0.06
    # Data
    max_seq_length: int = 2048
    train_data: str = "data/train.jsonl"
    eval_data: str = "data/eval.jsonl"
    # Misc
    seed: int = 42
    output_dir: str = "checkpoints/run"
    run_name: Optional[str] = None

cfg = ExperimentConfig(lora_r=32, learning_rate=1e-4, run_name="r32-lr1e4")

# Save config alongside checkpoint
with open(f"{cfg.output_dir}/config.json", "w") as f:
    json.dump(asdict(cfg), f, indent=2)

# Log to W&B
wandb.init(project="llm-finetuning", name=cfg.run_name, config=asdict(cfg))
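
Reloading is the mirror image: the saved JSON rehydrates the dataclass, so an old run can be re-created exactly:

with open("checkpoints/run/config.json") as f:
    cfg = ExperimentConfig(**json.load(f))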

6. Comparing Runs in W&B

Use the W&B dashboard to:

  1. Parallel coordinates plot — spot which hyperparameter combos yield low eval loss
  2. Grouped runs — group by lora_r to see the effect of rank
  3. Custom panels — overlay train_loss and eval_loss to detect overfitting
The dashboard covers interactive comparison; runs can also be pulled programmatically through the public API:

# Programmatic comparison
import wandb

api = wandb.Api()
runs = api.runs("your-entity/llm-finetuning", filters={"config.lora_r": 32})

for run in runs:
    print(run.name, run.summary["eval/loss"], run.config["learning_rate"])
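
For offline analysis, the same run objects flatten into a pandas DataFrame (assuming pandas is installed; run.summary.get avoids KeyErrors on crashed runs):

import pandas as pd

rows = [
    {"name": run.name, "eval_loss": run.summary.get("eval/loss"), **run.config}
    for run in runs
]
df = pd.DataFrame(rows)
print(df.sort_values("eval_loss").head())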

7. End-to-End Tracked Training Script

import wandb, torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
from dataclasses import dataclass, asdict

@dataclass
class Config:
    model_name: str = "mistralai/Mistral-7B-v0.1"
    lora_r: int = 16
    lora_alpha: int = 32
    learning_rate: float = 2e-4
    epochs: int = 3
    batch_size: int = 2
    grad_accum: int = 16
    max_seq_len: int = 2048
    seed: int = 42
    output_dir: str = "checkpoints/run"
    run_name: str = "default-run"

def main(cfg: Config):
    set_seed(cfg.seed)  # set_seed() as defined in the reproducibility section above

    run = wandb.init(
        project="llm-finetuning",
        name=cfg.run_name,
        config=asdict(cfg),
    )

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        cfg.model_name, quantization_config=bnb_config, device_map="auto"
    )
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, LoraConfig(
        r=cfg.lora_r, lora_alpha=cfg.lora_alpha,
        target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM"
    ))

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    ds = load_dataset("json", data_files="data/train.jsonl", split="train")

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=ds,
        args=SFTConfig(
            output_dir=cfg.output_dir,
            num_train_epochs=cfg.epochs,
            per_device_train_batch_size=cfg.batch_size,
            gradient_accumulation_steps=cfg.grad_accum,
            learning_rate=cfg.learning_rate,
            bf16=True,
            max_seq_length=cfg.max_seq_len,
            logging_steps=25,
            report_to="wandb",
            run_name=cfg.run_name,
        ),
    )

    trainer.train()
    trainer.save_model()

    # Log final adapter as artifact
    artifact = wandb.Artifact(cfg.run_name, type="model")
    artifact.add_dir(cfg.output_dir)
    run.log_artifact(artifact)

    wandb.finish()

if __name__ == "__main__":
    main(Config(lora_r=32, run_name="r32-lr2e4"))

8. Tracking Checklist

  • Every run has a unique, descriptive name
  • All hyperparameters logged at the start of the run
  • Training and eval loss logged at every epoch (or every N steps)
  • Final eval metrics (ROUGE, BERTScore, etc.) logged at end of run
  • Model checkpoints saved as versioned artifacts
  • Git commit hash logged per run
  • Dataset version or hash logged per run
  • Seed fixed and logged
  • Config saved as JSON alongside each checkpoint