Part 5 — Experiment Tracking¶
Overview
You will run dozens of experiments — different learning rates, LoRA ranks, datasets, and architectures. Without systematic tracking, you will lose the results that matter. This article shows how to log runs with Weights & Biases and MLflow, version artifacts, and run reproducible hyperparameter sweeps.
1. Why Tracking Matters¶
| Without Tracking | With Tracking |
|---|---|
| "Which run was best again?" | Dashboard shows all runs side by side |
| Rerunning experiments to compare | Logged metrics + config available instantly |
| "Did I use lr=2e-4 or 3e-4?" | Every hyperparameter stored per run |
| Model saved over by next experiment | Every checkpoint versioned |
| Non-reproducible results | Exact seed, config, and data version logged |
2. Weights & Biases (W&B)¶
W&B is the de facto standard for deep learning experiment tracking.
Setup¶
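W&B needs a one-time install and login; `wandb.login()` prompts for an API key from your account settings and caches it locally for subsequent runs.
# Install once: pip install wandb
import wandb

# Prompts for your API key on first use, then reuses the cached credential
wandb.login()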
Basic Logging¶
import wandb

# Start a run
wandb.init(
    project="llm-finetuning",
    name="mistral-lora-r16",
    config={
        "model": "mistralai/Mistral-7B-v0.1",
        "lora_r": 16,
        "lora_alpha": 32,
        "learning_rate": 2e-4,
        "batch_size": 16,
        "epochs": 3,
    },
)

# Log metrics inside your training loop
for step, batch in enumerate(train_loader):
    loss = train_step(batch)
    wandb.log({
        "train/loss": loss,
        "train/learning_rate": scheduler.get_last_lr()[0],
        "train/grad_norm": grad_norm,
    }, step=step)

# Log eval metrics
wandb.log({
    "eval/loss": eval_loss,
    "eval/rouge1": rouge_result["rouge1"],
    "eval/perplexity": perplexity,
})

wandb.finish()
W&B with HuggingFace Trainer¶
from transformers import TrainingArguments
args = TrainingArguments(
    ...,
    report_to="wandb",
    run_name="mistral-lora-r16",  # W&B run name
    logging_steps=25,             # log every N steps
)
# Trainer handles wandb.init/finish automatically
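The Trainer picks the target project up from the environment. The `WANDB_PROJECT` and `WANDB_LOG_MODEL` variables below are part of the transformers W&B integration; set them before training starts.
import os

os.environ["WANDB_PROJECT"] = "llm-finetuning"   # project the Trainer logs into
os.environ["WANDB_LOG_MODEL"] = "checkpoint"     # also upload checkpoints as artifacts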
Logging Artifacts (Model Checkpoints)¶
artifact = wandb.Artifact(
    name="mistral-lora-r16",
    type="model",
    metadata={"rouge1": 0.42, "perplexity": 12.3},
)
artifact.add_dir("checkpoints/mistral-lora-r16")
wandb.log_artifact(artifact)
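Pulling a versioned checkpoint back down later is the mirror image. A minimal sketch; the artifact name and alias match the example above:
run = wandb.init(project="llm-finetuning", job_type="evaluation")
# ":latest" resolves to the newest version; pin a specific ":v2" for reproducibility
artifact = run.use_artifact("mistral-lora-r16:latest", type="model")
adapter_dir = artifact.download()  # local path to the checkpoint files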
Hyperparameter Sweeps¶
Define a sweep config, register it with wandb.sweep, and launch one or more agents; each agent pulls hyperparameter combinations from the sweep server, so trials can run in parallel across machines:
# Sweep configuration (a Python dict; the same keys work in a YAML file for the CLI)
sweep_config = {
    "method": "bayes",  # bayes | grid | random
    "metric": {"name": "eval/loss", "goal": "minimize"},
    "parameters": {
        "learning_rate": {"distribution": "log_uniform_values", "min": 1e-5, "max": 3e-4},
        "lora_r": {"values": [8, 16, 32, 64]},
        "lora_alpha": {"values": [16, 32]},
        "warmup_ratio": {"distribution": "uniform", "min": 0.01, "max": 0.1},
    },
}

sweep_id = wandb.sweep(sweep_config, project="llm-finetuning")
def train_for_sweep():
    with wandb.init() as run:
        cfg = run.config
        # Build model and trainer using cfg values
        lora_config = LoraConfig(r=cfg.lora_r, lora_alpha=cfg.lora_alpha, ...)
        args = TrainingArguments(
            ...,
            learning_rate=cfg.learning_rate,
            warmup_ratio=cfg.warmup_ratio,
            report_to="wandb",
        )
        trainer = SFTTrainer(model=model, args=args, ...)
        trainer.train()

# Launch agents — each agent picks up a sweep run
wandb.agent(sweep_id, function=train_for_sweep, count=20)
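Fine-tuning trials are expensive, so it pays to let the sweep kill hopeless configurations early. W&B supports Hyperband-style early termination; a sketch, with `min_iter` chosen arbitrarily:
sweep_config["early_terminate"] = {
    "type": "hyperband",
    "min_iter": 100,  # a run must log at least this many steps before it can be stopped
}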
3. MLflow¶
MLflow is open-source and self-hostable — a good choice if data cannot leave your infrastructure.
Setup¶
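A minimal local setup: install the package and run a tracking server backed by SQLite (swap the store URI and host for your own infrastructure).
# Install once: pip install mlflow
# Start a local tracking server:
#   mlflow server --backend-store-uri sqlite:///mlflow.db --host 127.0.0.1 --port 5000
import mlflow
mlflow.set_tracking_uri("http://localhost:5000")  # point the client at the server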
Basic Logging¶
import mlflow
import mlflow.pytorch
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("llm-finetuning")
with mlflow.start_run(run_name="mistral-lora-r16"):
    # Log hyperparameters
    mlflow.log_params({
        "model": "mistralai/Mistral-7B-v0.1",
        "lora_r": 16,
        "learning_rate": 2e-4,
        "batch_size": 16,
    })

    for epoch in range(num_epochs):
        train_loss = run_epoch(train_loader)
        eval_loss = evaluate(val_loader)
        mlflow.log_metrics({
            "train_loss": train_loss,
            "eval_loss": eval_loss,
        }, step=epoch)

    # Log model artifact
    mlflow.pytorch.log_model(model, "model")
    # Log any file
    mlflow.log_artifact("results/eval_metrics.json")
MLflow with HuggingFace Trainer¶
import os
from transformers import TrainingArguments

# Set the experiment name before training starts (or call mlflow.set_experiment())
os.environ["MLFLOW_EXPERIMENT_NAME"] = "llm-finetuning"

args = TrainingArguments(
    ...,
    report_to="mlflow",
)
Querying Results Programmatically¶
import mlflow
client = mlflow.MlflowClient("http://localhost:5000")
experiment = client.get_experiment_by_name("llm-finetuning")

runs = client.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.eval_loss ASC"],
    max_results=10,
)
for run in runs:
    print(
        f"Run: {run.info.run_name} | "
        f"eval_loss: {run.data.metrics['eval_loss']:.4f} | "
        f"lr: {run.data.params['learning_rate']}"
    )
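Because the query sorts by eval_loss ascending, the first result is the best run, and the model it logged with mlflow.pytorch.log_model can be loaded straight back:
import mlflow.pytorch

best = runs[0]
model = mlflow.pytorch.load_model(f"runs:/{best.info.run_id}/model")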
4. Reproducibility¶
A result you cannot reproduce is a result you cannot trust.
import hashlib
import os
import platform
import random
import subprocess

import numpy as np
import torch
import transformers
import wandb

def set_seed(seed: int = 42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # For fully deterministic results (slower):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# Log everything needed to reproduce the run
metadata = {
    "seed": 42,
    "python_version": platform.python_version(),
    "torch_version": torch.__version__,
    "transformers_version": transformers.__version__,
    "dataset_hash": hashlib.md5(open("data/train.jsonl", "rb").read()).hexdigest(),
    "git_commit": subprocess.check_output(["git", "rev-parse", "HEAD"]).decode().strip(),
}
wandb.config.update(metadata)
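The cudnn flags alone do not cover every operator. For strict determinism, PyTorch can be told to raise on any nondeterministic kernel; on CUDA this also requires a cuBLAS workspace setting, configured before the first cuBLAS call:
import os
import torch

os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"  # required on CUDA >= 10.2
torch.use_deterministic_algorithms(True)  # error instead of silently running a nondeterministic op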
5. Config Management with Dataclasses¶
Keep all hyperparameters in a single typed config — no magic strings scattered through training code.
from dataclasses import dataclass, field, asdict
from typing import Optional
import json
import os

import wandb

@dataclass
class ExperimentConfig:
    # Model
    model_name: str = "mistralai/Mistral-7B-v0.1"
    # LoRA
    lora_r: int = 16
    lora_alpha: int = 32
    lora_target_modules: list = field(default_factory=lambda: ["q_proj", "v_proj"])
    # Training
    learning_rate: float = 2e-4
    num_epochs: int = 3
    per_device_batch_size: int = 2
    gradient_accumulation_steps: int = 16
    warmup_ratio: float = 0.06
    # Data
    max_seq_length: int = 2048
    train_data: str = "data/train.jsonl"
    eval_data: str = "data/eval.jsonl"
    # Misc
    seed: int = 42
    output_dir: str = "checkpoints/run"
    run_name: Optional[str] = None

cfg = ExperimentConfig(lora_r=32, learning_rate=1e-4, run_name="r32-lr1e4")

# Save config alongside checkpoint
os.makedirs(cfg.output_dir, exist_ok=True)
with open(f"{cfg.output_dir}/config.json", "w") as f:
    json.dump(asdict(cfg), f, indent=2)

# Log to W&B
wandb.init(project="llm-finetuning", name=cfg.run_name, config=asdict(cfg))
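Because the config is a flat dataclass, the saved JSON round-trips cleanly, so re-running an old experiment with its exact hyperparameters is a one-liner:
# Reload the exact config of a previous run
with open("checkpoints/run/config.json") as f:
    old_cfg = ExperimentConfig(**json.load(f))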
6. Comparing Runs in W&B¶
Use the W&B dashboard to:
- Parallel coordinates plot — spot which hyperparameter combos yield low eval loss
- Grouped runs — group by `lora_r` to see the effect of rank
- Custom panels — overlay `train_loss` and `eval_loss` to detect overfitting
# Programmatic comparison
import wandb
api = wandb.Api()
runs = api.runs("your-entity/llm-finetuning", filters={"config.lora_r": 32})
for run in runs:
    print(run.name, run.summary["eval/loss"], run.config["learning_rate"])
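For offline analysis, the same API can dump every run into a DataFrame. A sketch assuming pandas is installed and the entity/project path from above:
import pandas as pd

rows = []
for r in api.runs("your-entity/llm-finetuning"):
    rows.append({
        "name": r.name,
        "lora_r": r.config.get("lora_r"),
        "learning_rate": r.config.get("learning_rate"),
        "eval_loss": r.summary.get("eval/loss"),
    })
df = pd.DataFrame(rows).sort_values("eval_loss")
print(df.head())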
7. End-to-End Tracked Training Script¶
import wandb, torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
from dataclasses import dataclass, asdict

@dataclass
class Config:
    model_name: str = "mistralai/Mistral-7B-v0.1"
    lora_r: int = 16
    lora_alpha: int = 32
    learning_rate: float = 2e-4
    epochs: int = 3
    batch_size: int = 2
    grad_accum: int = 16
    max_seq_len: int = 2048
    seed: int = 42
    output_dir: str = "checkpoints/run"
    run_name: str = "default-run"

def main(cfg: Config):
    set_seed(cfg.seed)  # defined in Section 4
    run = wandb.init(
        project="llm-finetuning",
        name=cfg.run_name,
        config=asdict(cfg),
    )

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        cfg.model_name, quantization_config=bnb_config, device_map="auto"
    )
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, LoraConfig(
        r=cfg.lora_r, lora_alpha=cfg.lora_alpha,
        target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM"
    ))

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    ds = load_dataset("json", data_files="data/train.jsonl", split="train")

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=ds,
        args=SFTConfig(
            output_dir=cfg.output_dir,
            num_train_epochs=cfg.epochs,
            per_device_train_batch_size=cfg.batch_size,
            gradient_accumulation_steps=cfg.grad_accum,
            learning_rate=cfg.learning_rate,
            bf16=True,
            max_seq_length=cfg.max_seq_len,
            logging_steps=25,
            report_to="wandb",
            run_name=cfg.run_name,
        ),
    )
    trainer.train()
    trainer.save_model()

    # Log final adapter as artifact
    artifact = wandb.Artifact(cfg.run_name, type="model")
    artifact.add_dir(cfg.output_dir)
    run.log_artifact(artifact)
    wandb.finish()

if __name__ == "__main__":
    main(Config(lora_r=32, run_name="r32-lr2e4"))
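In practice you rarely want to edit the file to change one hyperparameter. A thin argparse wrapper (a sketch; flag names mirror the Config fields) keeps each manual experiment in your shell history, replacing the __main__ block above:
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--lora_r", type=int, default=16)
    parser.add_argument("--learning_rate", type=float, default=2e-4)
    parser.add_argument("--run_name", type=str, required=True)
    cli = parser.parse_args()
    main(Config(lora_r=cli.lora_r, learning_rate=cli.learning_rate, run_name=cli.run_name))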
8. Tracking Checklist¶
- Every run has a unique, descriptive name
- All hyperparameters logged at the start of the run
- Training and eval loss logged at every epoch (or every N steps)
- Final eval metrics (ROUGE, BERTScore, etc.) logged at end of run
- Model checkpoints saved as versioned artifacts
- Git commit hash logged per run
- Dataset version or hash logged per run
- Seed fixed and logged
- Config saved as JSON alongside each checkpoint