Example Training Script
This page provides a full working example of using the Transformer Lab Client for fine-tuning a language model. The example demonstrates the entire workflow from initialization to model saving, with proper progress reporting and error handling.
Overview
This example demonstrates how to:
- Initialize a Transformer Lab client and register a training job
- Load and process a dataset for language model fine-tuning
- Configure and train a model using Hugging Face Transformers
- Report progress and metrics to Transformer Lab
- Handle errors and completion properly
Prerequisites
- transformerlab-client
- transformers
- datasets
- torch
- A running Transformer Lab server
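All of the Python packages can be installed with pip (the same command is shown again in Running the Example below):
pip install transformerlab-client transformers datasets torch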
Complete Example Script
Below is the complete example script for fine-tuning a small language model on instruction data:
import os
from datetime import datetime
from pprint import pprint

from datasets import load_dataset
from transformerlab_client.callbacks.hf_callback import TLabProgressCallback
from transformerlab_client.client import TransformerLabClient
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)


def train():
    """Main training function that runs locally but reports to TransformerLab"""

    # Training configuration
    training_config = {
        "experiment_name": "alpha",
        "model_name": "HuggingFaceTB/SmolLM-135M-Instruct",
        "dataset": "Trelis/touch-rugby-rules",
        "template_name": "full-demo",
        "output_dir": "./output",
        "log_to_wandb": False,
        "_config": {
            "dataset_name": "Trelis/touch-rugby-rules",
            "lr": 2e-5,
            "num_train_epochs": 1,
            "batch_size": 8,
            "gradient_accumulation_steps": 1,
            "warmup_ratio": 0.03,
            "weight_decay": 0.01,
            "max_seq_length": 512,
        },
    }

    # Initialize TransformerLab client
    tlab_client = TransformerLabClient()
    job_id = tlab_client.start(training_config)

    # Create output directory if it doesn't exist
    os.makedirs(training_config["output_dir"], exist_ok=True)

    try:
        # Log start time
        start_time = datetime.now()
        tlab_client.log_info(f"Training started at {start_time}")

        # Load the dataset
        tlab_client.log_info("Loading dataset...")
        dataset = load_dataset(training_config["dataset"])
        tlab_client.log_info(f"Loaded dataset with {len(dataset['train'])} training examples")

        # Report progress to TransformerLab
        tlab_client.report_progress(10, {"status": "dataset_loaded"})

        # Load tokenizer and model
        tlab_client.log_info(f"Loading model: {training_config['model_name']}")
        tokenizer = AutoTokenizer.from_pretrained(training_config["model_name"])
        model = AutoModelForCausalLM.from_pretrained(
            training_config["model_name"],
            device_map="auto",
        )

        # Configure tokenizer
        if not tokenizer.pad_token_id:
            tokenizer.pad_token = tokenizer.eos_token

        # Report progress
        tlab_client.report_progress(20, {"status": "model_loaded"})

        # Process dataset
        def format_instruction(example):
            """Format instruction and response using template"""
            instruction = example["prompt"]
            response = example["completion"]

            # Simple Llama-3 instruction template
            if training_config["template_name"] == "llama3instruct":
                formatted = f"<|begin_of_text|><|prompt|>{instruction}<|response|>{response}<|end_of_text|>"
            else:
                # Default simple template
                formatted = f"Instruction: {instruction}\n\nResponse: {response}"

            return {"formatted_text": formatted}

        tokenized_dataset = dataset.map(format_instruction)

        # Tokenize dataset
        def tokenize_function(examples):
            return tokenizer(
                examples["formatted_text"],
                padding="max_length",
                truncation=True,
                max_length=training_config["_config"]["max_seq_length"],
                return_tensors="pt",
            )

        processed_dataset = tokenized_dataset.map(
            tokenize_function, batched=True, remove_columns=tokenized_dataset["train"].column_names
        )

        # Report progress
        tlab_client.report_progress(30, {"status": "dataset_processed"})

        # Setup training arguments
        training_args = TrainingArguments(
            output_dir=os.path.join(training_config["output_dir"], f"job_{job_id}"),
            learning_rate=training_config["_config"]["lr"],
            num_train_epochs=training_config["_config"]["num_train_epochs"],
            per_device_train_batch_size=training_config["_config"]["batch_size"],
            gradient_accumulation_steps=training_config["_config"]["gradient_accumulation_steps"],
            warmup_ratio=training_config["_config"]["warmup_ratio"],
            weight_decay=training_config["_config"]["weight_decay"],
            logging_steps=20,
            save_steps=500,
            save_total_limit=2,
            report_to=[],  # We'll handle reporting to TransformerLab ourselves
        )

        # Setup trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=processed_dataset["train"],
            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
            callbacks=[TLabProgressCallback(tlab_client)],
        )

        # Train the model
        tlab_client.log_info("Starting training...")
        trainer.train()

        # Save the final model
        tlab_client.log_info("Saving model...")
        trainer.save_model(os.path.join(training_config["output_dir"], f"final_model_{job_id}"))
        tokenizer.save_pretrained(os.path.join(training_config["output_dir"], f"final_model_{job_id}"))

        tlab_client.log_info("Saving model in Transformer Lab")
        tlab_client.save_model(os.path.join(training_config["output_dir"], f"final_model_{job_id}"))

        # Calculate training time
        end_time = datetime.now()
        training_duration = end_time - start_time
        tlab_client.log_info(f"Training completed in {training_duration}")

        # Complete the job in TransformerLab
        tlab_client.complete()

        return {
            "status": "success",
            "job_id": job_id,
            "duration": str(training_duration),
            "output_dir": os.path.join(training_config["output_dir"], f"final_model_{job_id}"),
        }

    except KeyboardInterrupt:
        tlab_client.log_warning("Training interrupted by user or remotely")
        tlab_client.stop("Training stopped by user or remotely")
        return {"status": "stopped", "job_id": job_id}

    except Exception as e:
        tlab_client.log_error(f"Training failed: {str(e)}")
        import traceback
        traceback.print_exc()
        tlab_client.stop(f"Training failed: {str(e)}")
        return {"status": "error", "job_id": job_id, "error": str(e)}


if __name__ == "__main__":
    result = train()
    pprint(result)
Explanation
Let's break down the key components of this example:
1. Training Configuration
The script starts by defining a configuration dictionary with all the necessary parameters for training:
training_config = {
    "experiment_name": "alpha",
    "model_name": "HuggingFaceTB/SmolLM-135M-Instruct",
    "dataset": "Trelis/touch-rugby-rules",
    "template_name": "full-demo",
    "output_dir": "./output",
    "log_to_wandb": False,
    "_config": {
        "dataset_name": "Trelis/touch-rugby-rules",
        "lr": 2e-5,
        "num_train_epochs": 1,
        "batch_size": 8,
        "gradient_accumulation_steps": 1,
        "warmup_ratio": 0.03,
        "weight_decay": 0.01,
        "max_seq_length": 512,
    },
}
This configuration contains:
- Basic experiment information (name, model, dataset)
- Output directory for saving results
- Training hyperparameters in the _config nested dictionary
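If you prefer to keep hyperparameters out of the script, the same dictionary can be loaded from a file instead of being hardcoded. Below is a minimal sketch, assuming you create a config.json with the same structure; the filename and helper function are illustrative and not part of the Transformer Lab API:

import json

def load_training_config(path="config.json"):
    """Load the training configuration from a JSON file (illustrative helper)."""
    with open(path) as f:
        return json.load(f)

training_config = load_training_config()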
2. Client Initialization
The script initializes the Transformer Lab client and registers a new training job:
tlab_client = TransformerLabClient()
job_id = tlab_client.start(training_config)
The start() method registers the job with Transformer Lab and returns a unique job ID.
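The job ID is used later in the script to name output directories, and the same client object is reused for all further calls. Reduced to just the client calls used in this example, the overall job lifecycle looks roughly like this (a sketch, not a complete script):

tlab_client = TransformerLabClient()           # requires a running Transformer Lab server
job_id = tlab_client.start(training_config)    # register the job and get its ID

try:
    # ... do the training work, reporting progress along the way ...
    tlab_client.complete()                     # mark the job as finished
except Exception as e:
    tlab_client.stop(f"Training failed: {e}")  # mark the job as stopped/failed
    raise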
3. Progress Reporting
Throughout the script, progress is reported at key milestones:
# Manual progress reporting at key points
tlab_client.report_progress(10, {"status": "dataset_loaded"})
tlab_client.report_progress(20, {"status": "model_loaded"})
tlab_client.report_progress(30, {"status": "dataset_processed"})
Progress values are percentages (0-100); additional metrics can be passed as a dictionary in the second argument.
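That dictionary is free-form, so you can attach any metrics you want to surface alongside the progress bar. The values below are purely illustrative:

# Percentage complete plus extra metrics to display in Transformer Lab
tlab_client.report_progress(55, {"status": "training", "epoch": 1, "train_loss": 1.87})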
4. Logging
The client provides various logging methods to keep track of events:
tlab_client.log_info("Loading dataset...")
tlab_client.log_info(f"Loaded dataset with {len(dataset['train'])} training examples")
These logs appear both in the console and in the Transformer Lab interface.
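In addition to log_info, the script uses log_warning and log_error for non-fatal and fatal conditions, as in its exception handlers:

# Taken from the exception handlers shown in the Error Handling section below
tlab_client.log_warning("Training interrupted by user or remotely")
tlab_client.log_error(f"Training failed: {str(e)}")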
5. Callback Integration
The script uses the TLabProgressCallback to automatically report progress during training:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    callbacks=[TLabProgressCallback(tlab_client)],  # Add the callback here
)
This callback automatically updates progress based on the training steps without requiring manual progress calls during the training loop.
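If you need custom behaviour, you can also write your own Hugging Face TrainerCallback that forwards step information to the client. The sketch below only illustrates the idea and is not the actual TLabProgressCallback implementation:

from transformers import TrainerCallback

class ManualProgressCallback(TrainerCallback):
    """Illustrative callback that reports Trainer progress to Transformer Lab."""

    def __init__(self, client):
        self.client = client

    def on_step_end(self, args, state, control, **kwargs):
        if state.max_steps:
            # Map training steps onto the 30-100% range this script reserves for training
            percent = 30 + int(70 * state.global_step / state.max_steps)
            self.client.report_progress(percent, {"step": state.global_step})
        return control

It would be passed to the Trainer in the same way, via callbacks=[ManualProgressCallback(tlab_client)].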
6. Error Handling
The script includes comprehensive error handling to ensure that Transformer Lab is properly updated if an error occurs:
except KeyboardInterrupt:
    tlab_client.log_warning("Training interrupted by user or remotely")
    tlab_client.stop("Training stopped by user or remotely")
    return {"status": "stopped", "job_id": job_id}
except Exception as e:
    tlab_client.log_error(f"Training failed: {str(e)}")
    import traceback
    traceback.print_exc()
    tlab_client.stop(f"Training failed: {str(e)}")
    return {"status": "error", "job_id": job_id, "error": str(e)}
This ensures that the job is properly marked as stopped or failed in Transformer Lab if something goes wrong.
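If your use case calls for more specific handling, narrower exception types can be caught before the generic handler. For example, here is a sketch that treats CUDA out-of-memory errors separately; it assumes a recent PyTorch version (which exposes torch.cuda.OutOfMemoryError) and an import torch at the top of the script:

# Placed inside train(), before the generic `except Exception` handler
except torch.cuda.OutOfMemoryError as e:
    tlab_client.log_error(f"GPU out of memory: {e}. Try lowering batch_size or max_seq_length.")
    tlab_client.stop("Training failed: GPU out of memory")
    return {"status": "error", "job_id": job_id, "error": str(e)}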
7. Completion and Model Saving
When training completes successfully, the model is saved and the job is marked as complete:
# Save the model locally
trainer.save_model(os.path.join(training_config["output_dir"], f"final_model_{job_id}"))
tokenizer.save_pretrained(os.path.join(training_config["output_dir"], f"final_model_{job_id}"))
# Notify Transformer Lab about the saved model
tlab_client.save_model(os.path.join(training_config["output_dir"], f"final_model_{job_id}"))
# Mark job as complete
tlab_client.complete()
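The saved directory is a standard Hugging Face model folder, so as an optional sanity check you can reload it directly. The path below is illustrative; use the output_dir printed by the script when it finishes:

from transformers import AutoModelForCausalLM, AutoTokenizer

final_dir = "./output/final_model_<job_id>"  # replace with the path printed by the script
model = AutoModelForCausalLM.from_pretrained(final_dir)
tokenizer = AutoTokenizer.from_pretrained(final_dir)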
Running the Example
To run this example:
- Make sure Transformer Lab is running
- Install the required packages:
pip install transformerlab-client transformers datasets torch
- Save the script to a file (e.g., train_with_tlab.py)
- Run the script:
python train_with_tlab.py
You can monitor the progress in the Transformer Lab interface, where you'll see real-time updates of progress, metrics, and logs.
Additional Tips
- Template Customization: Modify the format_instruction function to use different chat templates for other models (see the first sketch after this list)
- Dataset Customization: Replace load_dataset() with your own dataset loading logic if needed (see the second sketch after this list)
- Configuration: Customize the training_config dictionary to suit your specific needs
- Error Handling: Add more specific error handling for your use case (an example is shown in the Error Handling section above)
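For template customization, a format_instruction variant that relies on the tokenizer's built-in chat template (exposed by apply_chat_template for most instruct-tuned models) could look like the following sketch; whether this is appropriate depends on the model you are fine-tuning:

def format_instruction(example):
    """Format prompt/completion pairs using the tokenizer's own chat template."""
    messages = [
        {"role": "user", "content": example["prompt"]},
        {"role": "assistant", "content": example["completion"]},
    ]
    formatted = tokenizer.apply_chat_template(messages, tokenize=False)
    return {"formatted_text": formatted}

For dataset customization, a local JSON Lines file with the same prompt and completion columns could be loaded in place of the Hugging Face Hub dataset (the filename is illustrative):

dataset = load_dataset("json", data_files={"train": "my_dataset.jsonl"})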
This example serves as a starting point that you can adapt for your own model training workflows.