Fine-tuning adapts a pre-trained model to your specific use case, often yielding better performance on narrow, well-defined tasks than generic prompting alone.
When to Fine-Tune
Fine-tune when:
1. Specific domain knowledge is needed (legal, medical)
2. Consistent output formatting is required
3. Behavior modification is needed
4. Cost optimization matters (a smaller fine-tuned model can replace long prompts)

Don't fine-tune when:
- You only have a few examples (use few-shot prompting instead, as sketched below)
- You need to iterate rapidly on behavior
- The task is general knowledge queries
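Before committing to a fine-tuning run, it is worth establishing a few-shot prompting baseline to compare against. The sketch below is a minimal example using the OpenAI chat API; the demonstration pairs and the query are placeholders, and it assumes OPENAI_API_KEY is set in the environment.

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# A handful of demonstration pairs stand in for a fine-tuning dataset
few_shot_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Example input 1"},
    {"role": "assistant", "content": "Example output 1"},
    {"role": "user", "content": "Example input 2"},
    {"role": "assistant", "content": "Example output 2"},
    {"role": "user", "content": "Your actual query"},
]

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=few_shot_messages,
)
print(response.choices[0].message.content)

If this baseline already meets your quality bar, the extra cost and maintenance of a fine-tuned model may not be justified.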
Preparing Training Data
import json

# Format examples for OpenAI chat fine-tuning.
# `your_examples` is a placeholder: an iterable of dicts with 'input' and 'output' keys.
training_data = []
for example in your_examples:
    training_data.append({
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": example['input']},
            {"role": "assistant", "content": example['output']}
        ]
    })

# Save to JSONL (one JSON object per line, as the fine-tuning API expects)
with open('training_data.jsonl', 'w') as f:
    for item in training_data:
        f.write(json.dumps(item) + '\n')
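Before uploading, it helps to sanity-check the JSONL file so the job does not fail on a malformed record. A minimal validation sketch, assuming the training_data.jsonl file written above:

import json

# Verify every line is valid JSON with the expected chat structure
with open('training_data.jsonl') as f:
    for i, line in enumerate(f, start=1):
        record = json.loads(line)  # raises if a line is not valid JSON
        messages = record["messages"]
        assert messages[0]["role"] == "system", f"line {i}: missing system message"
        assert messages[-1]["role"] == "assistant", f"line {i}: must end with an assistant message"
        assert all("content" in m for m in messages), f"line {i}: message missing content"

print("training_data.jsonl passed basic format checks")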
Fine-Tuning with OpenAI
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Upload training file
file = client.files.create(
    file=open("training_data.jsonl", "rb"),
    purpose="fine-tune"
)

# Start fine-tuning job
job = client.fine_tuning.jobs.create(
    training_file=file.id,
    model="gpt-3.5-turbo",
    hyperparameters={
        "n_epochs": 3,
        "learning_rate_multiplier": 0.1
    }
)

# Check status; fine_tuned_model is only populated once the job has succeeded
job = client.fine_tuning.jobs.retrieve(job.id)
print(f"Status: {job.status}")

# Use the fine-tuned model (only after the job reports status == "succeeded")
response = client.chat.completions.create(
    model=job.fine_tuned_model,
    messages=[{"role": "user", "content": "Your query"}]
)
print(response.choices[0].message.content)
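Fine-tuning jobs run asynchronously and can take a while, so in practice you poll until the job reaches a terminal state instead of calling the model right away. A minimal polling sketch, continuing from the `client` and `job` objects above; the 60-second interval is an arbitrary choice:

import time

# Poll until the job reaches a terminal state
while True:
    job = client.fine_tuning.jobs.retrieve(job.id)
    print(f"Status: {job.status}")
    if job.status in ("succeeded", "failed", "cancelled"):
        break
    time.sleep(60)

# Inspect recent training events (progress messages, errors)
events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job.id, limit=10)
for event in events.data:
    print(event.message)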
Hugging Face Fine-Tuning
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

# Prepare dataset ("your_dataset" is a placeholder; it needs a "text" column)
dataset = load_dataset("your_dataset")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    logging_steps=100
)

# The collator builds labels for causal language modeling (mlm=False)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)
trainer.train()
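After training, you can save the checkpoint and run a quick generation check. A minimal sketch continuing from the `trainer` and `tokenizer` above; the output directory and prompt are placeholders:

from transformers import pipeline

# Save the fine-tuned weights and tokenizer together
trainer.save_model("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

# Reload the checkpoint and generate from it
generator = pipeline("text-generation", model="./fine_tuned_gpt2")
print(generator("Your prompt here", max_new_tokens=50)[0]["generated_text"])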
Fine-tuning creates specialized AI models optimized for your specific use case!