Introduction: Fine-tuning adapts pre-trained language models to specific tasks, domains, or behaviors. While prompting works for many use cases, fine-tuning delivers better performance, lower latency, and reduced costs for specialized applications. This guide covers modern fine-tuning approaches: full fine-tuning for maximum customization, LoRA and QLoRA for efficient parameter updates, preparing high-quality training data, using OpenAI and open-source fine-tuning APIs, and evaluating fine-tuned models. These techniques let you create models that excel at your specific use case while maintaining the general capabilities of the base model.

Training Data Preparation
import json
from typing import Optional
from pydantic import BaseModel
class TrainingExample(BaseModel):
    """Single training example for fine-tuning.

    Serialized to the chat-messages JSONL format by create_openai_format.
    """

    # Optional system prompt; omitted from the messages list when falsy.
    system: Optional[str] = None
    # The user turn (prompt) for this example.
    user: str
    # The target assistant response the model should learn to produce.
    assistant: str
def create_openai_format(examples: list[TrainingExample]) -> list[dict]:
    """Convert to OpenAI fine-tuning format.

    Each example becomes {"messages": [...]} with an optional system turn
    followed by the user and assistant turns.
    """

    def to_messages(ex: TrainingExample) -> list[dict]:
        # Truthy check (not `is None`) matches the original: an empty
        # system string is also omitted.
        head = [{"role": "system", "content": ex.system}] if ex.system else []
        return head + [
            {"role": "user", "content": ex.user},
            {"role": "assistant", "content": ex.assistant},
        ]

    return [{"messages": to_messages(ex)} for ex in examples]
def save_jsonl(data: list[dict], filepath: str):
    """Save data in JSONL format for fine-tuning.

    Writes one compact JSON object per line, the format expected by
    fine-tuning upload endpoints.
    """
    serialized = (json.dumps(item) + '\n' for item in data)
    with open(filepath, 'w') as f:
        f.writelines(serialized)
def validate_training_data(filepath: str) -> dict:
    """Validate training data format and quality.

    Checks each JSONL line for the expected chat-message structure and
    gathers rough token statistics (~4 characters per token).

    Args:
        filepath: Path to a JSONL file of {"messages": [...]} records.

    Returns:
        dict with "stats" (example counts and average token estimates)
        and "issues" (human-readable problems, capped at the first 20).
    """
    issues = []
    stats = {
        "total_examples": 0,
        "with_system": 0,
        "avg_user_tokens": 0,
        "avg_assistant_tokens": 0
    }
    user_lengths = []
    assistant_lengths = []
    with open(filepath) as f:
        for i, line in enumerate(f, 1):
            # Skip blank lines instead of mis-reporting them as bad JSON.
            if not line.strip():
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                issues.append(f"Line {i}: Invalid JSON")
                continue
            stats["total_examples"] += 1
            messages = data.get("messages", [])
            # Check structure
            if not messages:
                issues.append(f"Line {i}: No messages")
                continue
            # Use .get so a message missing "role" cannot raise KeyError.
            roles = [m.get("role") for m in messages]
            if "system" in roles:
                stats["with_system"] += 1
            if "user" not in roles:
                issues.append(f"Line {i}: Missing user message")
            if "assistant" not in roles:
                issues.append(f"Line {i}: Missing assistant message")
            # Estimate tokens (rough: 4 chars per token)
            for m in messages:
                tokens = len(m.get("content", "")) // 4
                if m.get("role") == "user":
                    user_lengths.append(tokens)
                elif m.get("role") == "assistant":
                    assistant_lengths.append(tokens)
    if user_lengths:
        stats["avg_user_tokens"] = sum(user_lengths) // len(user_lengths)
    if assistant_lengths:
        stats["avg_assistant_tokens"] = sum(assistant_lengths) // len(assistant_lengths)
    return {"stats": stats, "issues": issues[:20]}
# Example: Create training data for a customer support bot
examples = [
    TrainingExample(
        system="You are a helpful customer support agent for TechCorp.",
        user="How do I reset my password?",
        assistant="To reset your password, go to Settings > Security > Reset Password. You'll receive an email with a reset link. Click the link and enter your new password. If you don't receive the email within 5 minutes, check your spam folder."
    ),
    TrainingExample(
        system="You are a helpful customer support agent for TechCorp.",
        user="My order hasn't arrived yet",
        assistant="I'm sorry to hear your order hasn't arrived. Let me help you track it. Could you please provide your order number? You can find it in your confirmation email or under Orders in your account. Once I have that, I can check the shipping status for you."
    ),
]

# Convert to chat-messages format, persist as JSONL, then sanity-check the
# file before uploading it for fine-tuning.
formatted = create_openai_format(examples)
save_jsonl(formatted, "training_data.jsonl")
validation = validate_training_data("training_data.jsonl")
print(f"Total examples: {validation['stats']['total_examples']}")
print(f"Issues found: {len(validation['issues'])}")
OpenAI Fine-Tuning
from openai import OpenAI
import time

# OpenAI client; by default the SDK reads the API key from the
# OPENAI_API_KEY environment variable.
client = OpenAI()
def upload_training_file(filepath: str) -> str:
    """Upload training file to OpenAI.

    Returns:
        The OpenAI file ID of the uploaded training file.
    """
    with open(filepath, "rb") as fh:
        uploaded = client.files.create(file=fh, purpose="fine-tune")
    return uploaded.id
def create_fine_tune_job(
    training_file_id: str,
    model: str = "gpt-4o-mini-2024-07-18",
    suffix: Optional[str] = None,
    n_epochs: int = 3,
    learning_rate_multiplier: Optional[float] = None,
    batch_size: Optional[int] = None,
    validation_file_id: Optional[str] = None
) -> str:
    """Create a fine-tuning job.

    Args:
        training_file_id: File ID returned by upload_training_file.
        model: Base model snapshot to fine-tune.
        suffix: Optional suffix included in the fine-tuned model name.
        n_epochs: Number of training epochs.
        learning_rate_multiplier: Optional LR multiplier; API default if None.
        batch_size: Optional batch size; API default if None.
        validation_file_id: Optional uploaded validation file, evaluated
            during training (new, backward-compatible parameter).

    Returns:
        The fine-tuning job ID.
    """
    hyperparameters = {"n_epochs": n_epochs}
    # Compare against None explicitly so legitimate falsy values are kept.
    if learning_rate_multiplier is not None:
        hyperparameters["learning_rate_multiplier"] = learning_rate_multiplier
    if batch_size is not None:
        hyperparameters["batch_size"] = batch_size
    extra = {}
    if validation_file_id is not None:
        extra["validation_file"] = validation_file_id
    response = client.fine_tuning.jobs.create(
        training_file=training_file_id,
        model=model,
        suffix=suffix,
        hyperparameters=hyperparameters,
        **extra
    )
    return response.id
def monitor_fine_tune(job_id: str, poll_interval: int = 60):
    """Monitor fine-tuning job progress.

    Polls the job every poll_interval seconds, printing status and recent
    events, until it reaches a terminal state.

    Returns:
        The fine-tuned model name on success, otherwise None.
    """
    while True:
        job = client.fine_tuning.jobs.retrieve(job_id)
        print(f"Status: {job.status}")
        print(f"Trained tokens: {job.trained_tokens or 'N/A'}")
        if job.status == "succeeded":
            print(f"Fine-tuned model: {job.fine_tuned_model}")
            return job.fine_tuned_model
        if job.status == "failed":
            print(f"Error: {job.error}")
            return None
        if job.status == "cancelled":
            print("Job was cancelled")
            return None
        # Still running: show the latest events, then wait before re-polling.
        recent = client.fine_tuning.jobs.list_events(job_id, limit=5)
        for event in recent.data:
            print(f" {event.created_at}: {event.message}")
        time.sleep(poll_interval)
def use_fine_tuned_model(model_name: str, prompt: str) -> str:
    """Use a fine-tuned model.

    Sends a single-turn chat completion to the given model and returns the
    assistant's reply text.
    """
    messages = [
        {"role": "system", "content": "You are a helpful customer support agent."},
        {"role": "user", "content": prompt}
    ]
    completion = client.chat.completions.create(model=model_name, messages=messages)
    return completion.choices[0].message.content
# Usage
file_id = upload_training_file("training_data.jsonl")
print(f"Uploaded file: {file_id}")

# Start the fine-tune against the uploaded file.
job_id = create_fine_tune_job(
    training_file_id=file_id,
    model="gpt-4o-mini-2024-07-18",
    suffix="customer-support",
    n_epochs=3
)
print(f"Created job: {job_id}")

# Monitor until complete (blocks, polling once a minute by default).
model_name = monitor_fine_tune(job_id)
if model_name:
    response = use_fine_tuned_model(model_name, "How do I cancel my subscription?")
    print(response)
LoRA Fine-Tuning with Hugging Face
# pip install transformers peft datasets accelerate bitsandbytes
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import torch
def prepare_lora_model(
    model_name: str = "meta-llama/Llama-2-7b-hf",
    lora_r: int = 16,
    lora_alpha: int = 32,
    lora_dropout: float = 0.05,
    target_modules: Optional[list[str]] = None
):
    """Prepare model for LoRA (QLoRA) fine-tuning.

    Loads the base model 4-bit quantized, wraps it with LoRA adapters, and
    prints the trainable-parameter fraction.

    Args:
        model_name: Hugging Face model ID of the base model.
        lora_r: LoRA rank.
        lora_alpha: LoRA scaling factor.
        lora_dropout: Dropout applied inside the LoRA layers.
        target_modules: Module names to adapt; defaults to the Llama-style
            attention projections when None.

    Returns:
        (model, tokenizer) ready for training.
    """
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Llama has no pad token; reuse EOS so padded batching works.
    tokenizer.pad_token = tokenizer.eos_token
    # Load model in 4-bit for QLoRA.
    # NOTE(review): passing load_in_4bit= directly is deprecated in newer
    # transformers releases in favor of
    # quantization_config=BitsAndBytesConfig(load_in_4bit=True) — confirm
    # against the pinned transformers version.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        torch_dtype=torch.float16,
        device_map="auto"
    )
    # Prepare for k-bit training (gradient checkpointing / layer-norm casts).
    model = prepare_model_for_kbit_training(model)
    # Configure LoRA
    if target_modules is None:
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
    lora_config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
        bias="none",
        task_type="CAUSAL_LM"
    )
    # Apply LoRA
    model = get_peft_model(model, lora_config)
    # Print trainable parameters so the LoRA footprint is visible.
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
    return model, tokenizer
def prepare_dataset(examples: list[dict], tokenizer, max_length: int = 512):
    """Prepare dataset for training.

    Formats each example as a "### User / ### Assistant" prompt, then
    tokenizes with truncation and fixed-length padding.
    """

    def to_text(row):
        # Prompt template the model is trained to complete.
        return {"text": f"### User: {row['user']}\n\n### Assistant: {row['assistant']}"}

    def to_tokens(row):
        return tokenizer(
            row["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length"
        )

    ds = Dataset.from_list(examples)
    ds = ds.map(to_text)
    # Drop the raw text columns; only token tensors remain.
    return ds.map(to_tokens, remove_columns=["text", "user", "assistant"])
def train_lora(
    model,
    tokenizer,
    train_dataset,
    output_dir: str = "./lora-output",
    num_epochs: int = 3,
    batch_size: int = 4,
    learning_rate: float = 2e-4
):
    """Train LoRA model.

    Runs a causal-LM fine-tune of the (PEFT-wrapped) model on train_dataset
    and writes the resulting weights plus tokenizer to output_dir.

    Returns:
        The output directory path.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        # Effective batch size = batch_size * 4 via gradient accumulation.
        gradient_accumulation_steps=4,
        learning_rate=learning_rate,
        fp16=True,
        logging_steps=10,
        save_strategy="epoch",
        warmup_ratio=0.03,
        lr_scheduler_type="cosine"
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        # mlm=False -> standard next-token (causal LM) objective.
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )
    trainer.train()
    # Save LoRA weights (for a PEFT-wrapped model this saves the adapter,
    # not the full base model) along with the tokenizer.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    return output_dir
# Usage
model, tokenizer = prepare_lora_model(
    model_name="meta-llama/Llama-2-7b-hf",
    lora_r=16,
    lora_alpha=32
)

# Training examples: plain dicts with "user"/"assistant" keys, the shape
# prepare_dataset expects.
examples = [
    {"user": "What's the weather like?", "assistant": "I don't have access to real-time weather data..."},
    # Add more examples
]
dataset = prepare_dataset(examples, tokenizer)
output_dir = train_lora(model, tokenizer, dataset)
Evaluation and Comparison
from dataclasses import dataclass
from typing import Callable
import numpy as np
@dataclass
class EvalResult:
    """Aggregate evaluation metrics for one model."""

    model_name: str
    # Fraction of examples whose "expected" substring appeared in the reply.
    accuracy: float
    # Mean wall-clock latency per call, in milliseconds.
    avg_latency_ms: float
    # Estimated cost from a rough output-token count (~4 chars/token).
    cost_per_1k: float
    examples_evaluated: int

def evaluate_model(
    model_fn: Callable[[str], str],
    test_examples: list[dict],
    model_name: str,
    cost_per_1k_tokens: float = 0.0
) -> EvalResult:
    """Evaluate a model on test examples.

    Args:
        model_fn: Callable mapping a prompt string to the model's reply.
        test_examples: Dicts with "input" and optionally "expected".
        model_name: Label recorded in the result.
        cost_per_1k_tokens: Output-token price used for the cost estimate.

    Returns:
        EvalResult with accuracy (case-insensitive substring match against
        "expected"), latency, and estimated cost.
    """
    import time  # hoisted out of the loop (was re-imported every iteration)
    correct = 0
    latencies = []
    total_tokens = 0
    for example in test_examples:
        # perf_counter is monotonic — better suited to latency measurement
        # than time.time(), which can jump with clock adjustments.
        start = time.perf_counter()
        response = model_fn(example["input"])
        latencies.append((time.perf_counter() - start) * 1000)
        # Check if response matches expected (simple substring match)
        if example.get("expected") and example["expected"].lower() in response.lower():
            correct += 1
        # Estimate tokens (rough: 4 chars per token)
        total_tokens += len(response) // 4
    return EvalResult(
        model_name=model_name,
        accuracy=correct / len(test_examples) if test_examples else 0,
        # Guard the empty case: np.mean([]) returns NaN with a warning.
        avg_latency_ms=float(np.mean(latencies)) if latencies else 0.0,
        cost_per_1k=(total_tokens / 1000) * cost_per_1k_tokens,
        examples_evaluated=len(test_examples)
    )
def compare_models(results: list[EvalResult]) -> str:
    """Generate comparison report.

    Models are listed best-accuracy-first; returns the report as one string.
    """
    ranked = sorted(results, key=lambda r: r.accuracy, reverse=True)
    sections = [
        f"Model: {r.model_name}\n"
        f" Accuracy: {r.accuracy:.2%}\n"
        f" Avg Latency: {r.avg_latency_ms:.1f}ms\n"
        f" Cost per 1K: ${r.cost_per_1k:.4f}\n"
        f" Examples: {r.examples_evaluated}\n\n"
        for r in ranked
    ]
    return "Model Comparison Report\n" + "=" * 50 + "\n\n" + "".join(sections)
# Usage
test_examples = [
    {"input": "How do I reset my password?", "expected": "settings"},
    {"input": "Where is my order?", "expected": "order number"},
]

# Evaluate base model
def base_model(prompt):
    # Single-turn call to the untuned base model.
    return client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    ).choices[0].message.content

base_result = evaluate_model(
    base_model,
    test_examples,
    "gpt-4o-mini (base)",
    cost_per_1k_tokens=0.00015
)

# Evaluate fine-tuned model
def fine_tuned_model(prompt):
    # "ft:<base>:<org>::<id>" is the placeholder fine-tuned model name;
    # substitute the real one returned by monitor_fine_tune.
    return client.chat.completions.create(
        model="ft:gpt-4o-mini-2024-07-18:org::abc123",
        messages=[{"role": "user", "content": prompt}]
    ).choices[0].message.content

ft_result = evaluate_model(
    fine_tuned_model,
    test_examples,
    "gpt-4o-mini (fine-tuned)",
    cost_per_1k_tokens=0.0003
)
print(compare_models([base_result, ft_result]))
Data Augmentation for Fine-Tuning
def augment_training_data(
    examples: list[TrainingExample],
    augmentation_factor: int = 3
) -> list[TrainingExample]:
    """Augment training data using LLM.

    Asks the model for paraphrased variations of each example and appends
    whichever ones parse cleanly; the original examples are always kept.
    """
    augmented = list(examples)  # Keep originals
    for example in examples:
        prompt = f"""Generate {augmentation_factor} variations of this training example.
Keep the same intent and information, but vary the phrasing.
Original user message: {example.user}
Original assistant response: {example.assistant}
Return JSON array:
[
{{"user": "variation 1", "assistant": "appropriate response"}},
{{"user": "variation 2", "assistant": "appropriate response"}}
]"""
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        try:
            parsed = json.loads(completion.choices[0].message.content)
            # json_object mode returns an object; accept either a bare list
            # or a {"variations": [...]} wrapper.
            variations = parsed if isinstance(parsed, list) else parsed.get("variations", [])
            for variation in variations:
                augmented.append(TrainingExample(
                    system=example.system,
                    user=variation["user"],
                    assistant=variation["assistant"]
                ))
        except (json.JSONDecodeError, KeyError):
            # Best-effort: a malformed response skips this example's variations.
            continue
    return augmented
def generate_synthetic_data(
    task_description: str,
    num_examples: int = 50,
    system_prompt: Optional[str] = None
) -> list[TrainingExample]:
    """Generate synthetic training data.

    Args:
        task_description: Natural-language description of the target task.
        num_examples: Number of examples to request from the model.
        system_prompt: Optional system prompt attached to each example.

    Returns:
        TrainingExample list built from the model's JSON output.

    Raises:
        json.JSONDecodeError: If the model's output is not valid JSON.
        KeyError: If a generated example lacks "user"/"assistant" keys.
    """
    prompt = f"""Generate {num_examples} diverse training examples for this task:
Task: {task_description}
Requirements:
- Each example should be realistic and varied
- Cover different scenarios and edge cases
- User messages should be natural, like real users would write
- Assistant responses should be helpful and accurate
Return JSON array:
[
{{"user": "user message", "assistant": "assistant response"}},
...
]"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"}
    )
    data = json.loads(response.choices[0].message.content)
    # json_object mode returns an object; accept either a bare list or an
    # {"examples": [...]} wrapper.
    examples_list = data if isinstance(data, list) else data.get("examples", [])
    return [
        TrainingExample(
            system=system_prompt,
            user=ex["user"],
            assistant=ex["assistant"]
        )
        for ex in examples_list
    ]
# Usage: bootstrap a dataset from a task description alone.
synthetic = generate_synthetic_data(
    task_description="Customer support for a SaaS product. Handle billing questions, feature requests, and technical issues.",
    num_examples=50,
    system_prompt="You are a helpful customer support agent."
)
print(f"Generated {len(synthetic)} synthetic examples")
Production Fine-Tuning Pipeline
from fastapi import FastAPI, BackgroundTasks
from pydantic import BaseModel
from typing import Optional
import uuid
app = FastAPI()

# Store job status: job_id -> {"status", "model_name", "error", "progress"}.
# NOTE(review): in-memory only — not persistent and not shared across
# worker processes; replace with a real store (DB/Redis) for production.
jobs_db: dict[str, dict] = {}
class FineTuneRequest(BaseModel):
    """Request body for starting a fine-tuning job."""

    # Examples already in upload format ({"messages": [...]} dicts) —
    # run_fine_tune_job writes them to JSONL without further conversion.
    training_data: list[dict]
    # Base model snapshot to fine-tune.
    model: str = "gpt-4o-mini-2024-07-18"
    # Suffix included in the fine-tuned model's name (required).
    suffix: str
    # Number of training epochs.
    n_epochs: int = 3
    # Fraction of training_data held out as a validation set.
    validation_split: float = 0.1
class JobStatus(BaseModel):
    """Status payload returned by the fine-tune endpoints."""

    job_id: str
    # Lifecycle: queued -> preparing_data -> uploading -> training ->
    # completed | failed; "not_found" for unknown IDs.
    status: str
    # Name of the resulting fine-tuned model once completed.
    model_name: Optional[str] = None
    # Error message when status == "failed".
    error: Optional[str] = None
    # Free-form progress info (example counts, provider job id, ...).
    # Pydantic copies mutable defaults per instance, so {} is safe here.
    progress: dict = {}
async def run_fine_tune_job(job_id: str, request: FineTuneRequest):
    """Background task to run fine-tuning.

    Splits the data, writes and uploads JSONL files, starts the provider
    job, then polls until a terminal state, updating jobs_db throughout.

    NOTE(review): monitor_fine_tune uses time.sleep, which blocks the event
    loop inside this async function for the whole training run — consider
    running it via asyncio.to_thread. TODO confirm intended.
    """
    try:
        jobs_db[job_id]["status"] = "preparing_data"
        # Split data
        split_idx = int(len(request.training_data) * (1 - request.validation_split))
        train_data = request.training_data[:split_idx]
        val_data = request.training_data[split_idx:]
        # Save files (left in /tmp; no cleanup is performed here)
        train_file = f"/tmp/{job_id}_train.jsonl"
        val_file = f"/tmp/{job_id}_val.jsonl"
        save_jsonl(train_data, train_file)
        save_jsonl(val_data, val_file)
        jobs_db[job_id]["progress"]["train_examples"] = len(train_data)
        jobs_db[job_id]["progress"]["val_examples"] = len(val_data)
        # Upload files
        jobs_db[job_id]["status"] = "uploading"
        train_file_id = upload_training_file(train_file)
        # NOTE(review): val_file_id is uploaded but never passed to the
        # fine-tune job below, so the validation set is currently unused.
        val_file_id = upload_training_file(val_file) if val_data else None
        # Create fine-tune job
        jobs_db[job_id]["status"] = "training"
        ft_job_id = create_fine_tune_job(
            training_file_id=train_file_id,
            model=request.model,
            suffix=request.suffix,
            n_epochs=request.n_epochs
        )
        jobs_db[job_id]["progress"]["openai_job_id"] = ft_job_id
        # Monitor until complete (blocks until a terminal job state)
        model_name = monitor_fine_tune(ft_job_id)
        if model_name:
            jobs_db[job_id]["status"] = "completed"
            jobs_db[job_id]["model_name"] = model_name
        else:
            jobs_db[job_id]["status"] = "failed"
            jobs_db[job_id]["error"] = "Fine-tuning failed"
    except Exception as e:
        # Record any failure on the job rather than crashing the task.
        jobs_db[job_id]["status"] = "failed"
        jobs_db[job_id]["error"] = str(e)
@app.post("/fine-tune", response_model=JobStatus)
async def start_fine_tune(request: FineTuneRequest, background_tasks: BackgroundTasks):
    """Start a fine-tuning job.

    Registers a queued job record, schedules the actual work as a
    background task, and returns immediately with the new job ID.
    """
    job_id = str(uuid.uuid4())
    jobs_db[job_id] = dict(status="queued", model_name=None, error=None, progress={})
    background_tasks.add_task(run_fine_tune_job, job_id, request)
    return JobStatus(job_id=job_id, status="queued")
@app.get("/fine-tune/{job_id}", response_model=JobStatus)
async def get_job_status(job_id: str):
    """Get fine-tuning job status.

    Unknown IDs get a JobStatus with status "not_found" rather than an
    HTTP error.
    """
    record = jobs_db.get(job_id)
    if record is None:
        return JobStatus(job_id=job_id, status="not_found")
    return JobStatus(
        job_id=job_id,
        status=record["status"],
        model_name=record.get("model_name"),
        error=record.get("error"),
        progress=record.get("progress", {})
    )
References
- OpenAI Fine-Tuning: https://platform.openai.com/docs/guides/fine-tuning
- Hugging Face PEFT: https://huggingface.co/docs/peft
- LoRA Paper: https://arxiv.org/abs/2106.09685
- QLoRA Paper: https://arxiv.org/abs/2305.14314
Conclusion
Fine-tuning transforms general-purpose LLMs into specialized tools for your specific use case. Start with high-quality training data—even 50-100 well-crafted examples can significantly improve performance. Use OpenAI’s fine-tuning API for quick iteration with GPT models, or LoRA/QLoRA for efficient fine-tuning of open-source models. Always evaluate fine-tuned models against baselines to ensure improvement. Consider data augmentation and synthetic data generation to expand limited datasets. The investment in fine-tuning pays off through better accuracy, lower latency (shorter prompts), and reduced costs for high-volume applications.
Discover more from Code, Cloud & Context
Subscribe to get the latest posts sent to your email.