Introduction: Fine-tuning transforms a general-purpose LLM into a specialized model tailored to your domain, style, or task. While prompt engineering can get you far, fine-tuning offers consistent behavior, reduced token usage, and capabilities that prompting alone cannot achieve. This guide covers the complete fine-tuning workflow—from data preparation to deployment—using both cloud APIs (OpenAI, Together AI) and local training with techniques like LoRA and QLoRA that make fine-tuning accessible even on consumer hardware.

When to Fine-Tune vs. Prompt Engineer
Fine-tuning makes sense when:

- You need consistent formatting or style that's hard to maintain with prompts.
- You have domain-specific knowledge that isn't in the base model.
- You want to reduce token usage by eliminating lengthy system prompts (see the sketch after this list).
- You need faster inference by baking behavior into the weights.

Stick with prompt engineering when:

- You have limited training data (under 100 examples).
- Your requirements change frequently.
- You need to combine multiple capabilities dynamically.
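To make the token-usage point concrete, here is a minimal sketch (using the `tiktoken` tokenizer library, with an artificially padded system prompt standing in for a real one) of the per-request overhead that a long system prompt adds and that fine-tuning can bake into the weights instead:

```python
# Rough token-savings estimate. The long prompt below is synthetic padding,
# not a real prompt; swap in your own to get meaningful numbers.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

long_system_prompt = "You are a support agent. " + "Follow these detailed policy rules. " * 100
short_system_prompt = "You are a helpful customer support agent for TechCorp."

overhead = len(enc.encode(long_system_prompt)) - len(enc.encode(short_system_prompt))
print(f"Extra prompt tokens per request: {overhead}")
print(f"Extra tokens across 1M requests: {overhead * 1_000_000:,}")
```

Multiplied across production traffic, that overhead shows up directly in cost and latency, which is often what tips the balance toward fine-tuning.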
Data Preparation
```python
import json
from typing import List, Dict

def prepare_openai_format(conversations: List[Dict]) -> List[Dict]:
    """Convert conversations to OpenAI fine-tuning format."""
    formatted = []
    for conv in conversations:
        messages = []
        # Add system message if present
        if "system" in conv:
            messages.append({"role": "system", "content": conv["system"]})
        # Add conversation turns
        for turn in conv["turns"]:
            messages.append({"role": "user", "content": turn["user"]})
            messages.append({"role": "assistant", "content": turn["assistant"]})
        formatted.append({"messages": messages})
    return formatted

# Example: Customer support fine-tuning data
raw_data = [
    {
        "system": "You are a helpful customer support agent for TechCorp.",
        "turns": [
            {
                "user": "My order hasn't arrived yet. Order #12345",
                "assistant": "I understand your concern about order #12345. Let me check the status for you. I can see your order was shipped on Monday and is currently in transit. Expected delivery is tomorrow by 5 PM. Would you like me to send you the tracking link?"
            }
        ]
    },
    {
        "system": "You are a helpful customer support agent for TechCorp.",
        "turns": [
            {
                "user": "How do I reset my password?",
                "assistant": "I'd be happy to help you reset your password. Here's how:\n\n1. Go to techcorp.com/login\n2. Click 'Forgot Password'\n3. Enter your email address\n4. Check your inbox for the reset link\n5. Create a new password (at least 8 characters)\n\nThe reset link expires in 24 hours. Let me know if you need any other assistance!"
            }
        ]
    }
]

# Convert and save
formatted_data = prepare_openai_format(raw_data)
with open("training_data.jsonl", "w") as f:
    for item in formatted_data:
        f.write(json.dumps(item) + "\n")

# Validate data
def validate_training_data(filepath: str) -> Dict:
    """Validate training data format and quality."""
    issues = []
    stats = {"total": 0, "tokens_estimate": 0}
    with open(filepath, "r") as f:
        for i, line in enumerate(f, 1):
            stats["total"] += 1
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                issues.append(f"Line {i}: Invalid JSON")
                continue
            if "messages" not in data:
                issues.append(f"Line {i}: Missing 'messages' key")
                continue
            messages = data["messages"]
            # Check for required roles
            roles = [m["role"] for m in messages]
            if "assistant" not in roles:
                issues.append(f"Line {i}: No assistant message")
            # Estimate tokens (rough: 4 chars = 1 token)
            text = " ".join(m["content"] for m in messages)
            stats["tokens_estimate"] += len(text) // 4
    return {"issues": issues, "stats": stats}

validation = validate_training_data("training_data.jsonl")
print(f"Total examples: {validation['stats']['total']}")
print(f"Estimated tokens: {validation['stats']['tokens_estimate']}")
```
Fine-Tuning with OpenAI
```python
from openai import OpenAI
import time

client = OpenAI()

# Step 1: Upload training file
with open("training_data.jsonl", "rb") as f:
    file_response = client.files.create(file=f, purpose="fine-tune")
file_id = file_response.id
print(f"Uploaded file: {file_id}")

# Step 2: Create fine-tuning job
job = client.fine_tuning.jobs.create(
    training_file=file_id,
    model="gpt-4o-mini-2024-07-18",  # Base model
    hyperparameters={
        "n_epochs": 3,
        "batch_size": "auto",
        "learning_rate_multiplier": "auto"
    },
    suffix="customer-support"  # Custom model name suffix
)
print(f"Fine-tuning job created: {job.id}")

# Step 3: Monitor progress
while True:
    job_status = client.fine_tuning.jobs.retrieve(job.id)
    print(f"Status: {job_status.status}")
    if job_status.status in ["succeeded", "failed", "cancelled"]:
        break
    time.sleep(60)

# Step 4: Use the fine-tuned model
if job_status.status == "succeeded":
    fine_tuned_model = job_status.fine_tuned_model
    print(f"Fine-tuned model: {fine_tuned_model}")
    # Test the model
    response = client.chat.completions.create(
        model=fine_tuned_model,
        messages=[
            {"role": "system", "content": "You are a helpful customer support agent."},
            {"role": "user", "content": "I want to return my purchase"}
        ]
    )
    print(response.choices[0].message.content)

# List all fine-tuning jobs
jobs = client.fine_tuning.jobs.list(limit=10)
for j in jobs.data:
    print(f"{j.id}: {j.status} - {j.fine_tuned_model}")
```
Local Fine-Tuning with LoRA
```python
# pip install transformers peft datasets accelerate bitsandbytes
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset
import torch

# Load base model with 4-bit quantization (QLoRA)
model_name = "meta-llama/Llama-3.2-3B-Instruct"
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Prepare model for training
model = prepare_model_for_kbit_training(model)

# Configure LoRA
lora_config = LoraConfig(
    r=16,                # Rank of update matrices
    lora_alpha=32,       # Scaling factor
    target_modules=[     # Which layers to adapt
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # Typically reports well under 1% of params as trainable

# Load and prepare dataset
# training_data.jsonl uses the OpenAI-style "messages" format created above
def format_instruction(example):
    """Render an OpenAI-style messages list as Llama 3 chat text."""
    text = "<|begin_of_text|>"
    for message in example["messages"]:
        text += (
            f"<|start_header_id|>{message['role']}<|end_header_id|>\n\n"
            f"{message['content']}<|eot_id|>"
        )
    return {"text": text}

dataset = load_dataset("json", data_files="training_data.jsonl")
dataset = dataset.map(format_instruction)

def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        max_length=2048,
        padding="max_length"
    )

tokenized_dataset = dataset.map(tokenize, remove_columns=dataset["train"].column_names)

# Training arguments
training_args = TrainingArguments(
    output_dir="./lora-output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=100,
    logging_steps=10,
    save_steps=500,
    fp16=True,
    optim="paged_adamw_8bit"
)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()

# Save LoRA weights
model.save_pretrained("./lora-weights")

# Merge and save full model (optional). For the highest-fidelity merge,
# reload the base model in fp16/bf16, attach the saved adapters, then merge.
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./merged-model")
tokenizer.save_pretrained("./merged-model")  # keep ./merged-model self-contained for later serving
```
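A quick aside on why `print_trainable_parameters()` reports such a small fraction: for each adapted weight matrix of shape `d_out x d_in`, LoRA trains two low-rank factors of shapes `d_out x r` and `r x d_in`, i.e. `r * (d_in + d_out)` extra parameters, while the original matrix stays frozen. The sketch below uses illustrative layer dimensions (not the exact Llama 3.2 3B configuration) to show the scale involved:

```python
# Back-of-envelope LoRA parameter count with r=16 and illustrative dimensions
# (hidden size 3072, MLP size 8192, 28 layers). Real models with grouped-query
# attention use smaller k_proj/v_proj output dims, so this slightly overcounts.
r = 16
hidden, mlp, n_layers = 3072, 8192, 28

# (d_in, d_out) for each adapted projection in one decoder layer
projections = {
    "q_proj": (hidden, hidden),
    "k_proj": (hidden, hidden),
    "v_proj": (hidden, hidden),
    "o_proj": (hidden, hidden),
    "gate_proj": (hidden, mlp),
    "up_proj": (hidden, mlp),
    "down_proj": (mlp, hidden),
}

per_layer = sum(r * (d_in + d_out) for d_in, d_out in projections.values())
total_lora = per_layer * n_layers
print(f"LoRA adapter params: {total_lora / 1e6:.1f}M "
      f"(~{total_lora / 3e9:.2%} of a 3B-parameter base)")
```

Because gradients and optimizer state are kept only for these few million adapter parameters, LoRA and QLoRA fit in the VRAM budgets shown in the comparison below.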
Fine-Tuning Comparison
| Method | VRAM Required | Training Time | Best For |
|---|---|---|---|
| Full Fine-Tuning | 40-80GB | Hours-Days | Maximum quality, large datasets |
| LoRA | 16-24GB | Hours | Good balance of quality/efficiency |
| QLoRA (4-bit) | 6-12GB | Hours | Consumer GPUs, limited resources |
| OpenAI API | None (cloud) | Minutes-Hours | Convenience, no infrastructure |
Evaluation and Iteration
```python
from transformers import pipeline
import json

# Load fine-tuned model
pipe = pipeline(
    "text-generation",
    model="./merged-model",
    tokenizer=tokenizer,
    max_new_tokens=256
)

# Evaluation dataset
eval_examples = [
    {"input": "How do I track my order?", "expected_topics": ["tracking", "order status"]},
    {"input": "I want a refund", "expected_topics": ["refund", "return policy"]},
    {"input": "Product is defective", "expected_topics": ["replacement", "warranty"]}
]

def evaluate_response(response: str, expected_topics: list) -> dict:
    """Simple evaluation of response quality."""
    response_lower = response.lower()
    words = response.split()
    # Check topic coverage
    topics_covered = sum(1 for t in expected_topics if t in response_lower)
    topic_score = topics_covered / len(expected_topics)
    # Check response length (not too short, not too long)
    word_count = len(words)
    length_score = 1.0 if 20 <= word_count <= 200 else 0.5
    # Check for common issues (guard against empty responses)
    has_repetition = bool(words) and len(set(words)) / len(words) < 0.5
    return {
        "topic_score": topic_score,
        "length_score": length_score,
        "has_repetition": has_repetition,
        "word_count": word_count
    }

# Run evaluation
results = []
for example in eval_examples:
    # For best results, format prompts with the same chat template used during training
    prompt = f"User: {example['input']}\nAssistant:"
    response = pipe(prompt)[0]["generated_text"]
    # Extract just the assistant response
    assistant_response = response.split("Assistant:")[-1].strip()
    eval_result = evaluate_response(assistant_response, example["expected_topics"])
    eval_result["input"] = example["input"]
    eval_result["response"] = assistant_response[:200]
    results.append(eval_result)

# Summary
avg_topic = sum(r["topic_score"] for r in results) / len(results)
avg_length = sum(r["length_score"] for r in results) / len(results)
print(f"Average topic coverage: {avg_topic:.2%}")
print(f"Average length score: {avg_length:.2%}")
```
Production Deployment
```python
# Deploy with vLLM for high-throughput inference
# pip install vllm
from vllm import LLM, SamplingParams

# Load model
llm = LLM(
    model="./merged-model",
    tensor_parallel_size=1,      # Number of GPUs
    gpu_memory_utilization=0.9
)

sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    max_tokens=256
)

# Batch inference
prompts = [
    "How do I reset my password?",
    "Where is my order?",
    "I need help with billing"
]
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt[:50]}...")
    print(f"Response: {output.outputs[0].text}\n")

# FastAPI server (run with uvicorn, e.g. `uvicorn server:app` if this file is server.py)
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class ChatRequest(BaseModel):
    message: str
    max_tokens: int = 256

@app.post("/chat")
async def chat(request: ChatRequest):
    # Note: llm.generate() is blocking; for heavy traffic consider vLLM's
    # OpenAI-compatible server or its async engine instead.
    outputs = llm.generate([request.message], SamplingParams(max_tokens=request.max_tokens))
    return {"response": outputs[0].outputs[0].text}
```
References
- OpenAI Fine-Tuning: https://platform.openai.com/docs/guides/fine-tuning
- Hugging Face PEFT: https://huggingface.co/docs/peft
- LoRA Paper: https://arxiv.org/abs/2106.09685
- QLoRA Paper: https://arxiv.org/abs/2305.14314
- vLLM: https://docs.vllm.ai/
Conclusion
Fine-tuning is a powerful technique for creating specialized LLMs, but it requires careful consideration of data quality, training approach, and evaluation methodology. Start with cloud APIs like OpenAI for convenience and quick iteration, then move to local training with LoRA/QLoRA when you need more control or cost efficiency. The key to successful fine-tuning is high-quality training data—invest time in curating diverse, representative examples that capture the behavior you want. Remember that fine-tuning is not a one-time process; plan for continuous improvement as you gather feedback from production usage. With the right approach, fine-tuned models can dramatically outperform prompted base models for your specific use case.