LLM costs can add up quickly. Learn optimization strategies to reduce expenses while maintaining quality.
Understanding Token Costs
import tiktoken
# Count tokens
encoding = tiktoken.encoding_for_model("gpt-4")
text = "Your prompt here"
tokens = encoding.encode(text)
token_count = len(tokens)
# Estimate cost (GPT-4: $0.03/1K input, $0.06/1K output)
input_cost = (token_count / 1000) * 0.03
print(f"Tokens: {token_count}, Est. cost: ${input_cost:.4f}")
Prompt Optimization
# Bad: Verbose prompt (500 tokens)
bad_prompt = """
I would like you to please help me by analyzing the following text
and providing a comprehensive summary. Please make sure to capture
all the key points and important details...
[Long repetitive instructions]
"""
# Good: Concise prompt (50 tokens)
good_prompt = "Summarize key points from this text:"
# Savings: ~450 tokens ≈ $0.0135 per call at GPT-4 input pricing ($0.03/1K)
# At 1,000 calls/day that's about $13.50/day saved
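Rather than guessing, you can measure the saving directly with the tiktoken counter from earlier. A quick sketch (actual counts depend on your prompts; GPT-4 input pricing assumed):
# Compare real token counts of the two prompts
enc = tiktoken.encoding_for_model("gpt-4")
saved_tokens = len(enc.encode(bad_prompt)) - len(enc.encode(good_prompt))
saved_per_call = (saved_tokens / 1000) * 0.03  # GPT-4 input rate
print(f"Saved ~{saved_tokens} tokens, ~${saved_per_call:.4f} per call")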
Response Caching
import hashlib
import json
import openai  # expects OPENAI_API_KEY in the environment

cache = {}

def cached_completion(prompt, model="gpt-4"):
    # Create a cache key from the prompt and model
    key = hashlib.md5(f"{prompt}{model}".encode()).hexdigest()
    # Return the stored result if we've seen this exact request before
    if key in cache:
        print("Cache hit!")
        return cache[key]
    # Otherwise call the API and remember the result
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    result = response.choices[0].message.content
    cache[key] = result
    return result
# Repeated queries are free!
result1 = cached_completion("What is Python?")
result2 = cached_completion("What is Python?") # No API call
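An in-process dict vanishes when the process restarts and grows without bound, so production setups usually back the cache with something like Redis or at least cap its size. As a minimal stdlib alternative, functools.lru_cache gives the same exact-match caching with eviction for free; this sketch reuses the openai import from above.
from functools import lru_cache

@lru_cache(maxsize=1024)  # evicts least-recently-used entries automatically
def cached_completion_lru(prompt, model="gpt-4"):
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content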
Model Selection Strategy
# Use cheaper models for simple tasks
def smart_completion(prompt, complexity="low"):
    # Route to a model tier based on task complexity (approx. input price per 1K tokens)
    models = {
        "low": "gpt-3.5-turbo",   # ~$0.001/1K
        "medium": "gpt-4-turbo",  # ~$0.01/1K
        "high": "gpt-4"           # ~$0.03/1K
    }
    model = models[complexity]
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

# Roughly 97% cheaper per input token on simple tasks
simple_answer = smart_completion("What is 2+2?", complexity="low")
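Routing only works if something decides the complexity. A crude heuristic like the one below (purely illustrative; the thresholds and keywords are assumptions to tune for your workload) keeps most traffic on the cheap tier and escalates only when the prompt looks demanding.
# Hypothetical routing heuristic -- tune signals and thresholds to your workload
def guess_complexity(prompt):
    hard_signals = ("analyze", "prove", "refactor", "step by step")
    if len(prompt) > 2000 or any(s in prompt.lower() for s in hard_signals):
        return "high"
    if len(prompt) > 500:
        return "medium"
    return "low"

answer = smart_completion("What is 2+2?", complexity=guess_complexity("What is 2+2?"))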
Batch Processing
# Process multiple items in one call
def batch_process(items):
    # Instead of N API calls, make 1
    prompt = f"""
    Process each item and return results as JSON:
    {json.dumps(items)}
    Format: {{"item": "result"}}
    """
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    return json.loads(response.choices[0].message.content)
# 10 items: 1 API call instead of 10; you still pay for each item's tokens,
# but the shared instructions and per-request overhead are paid only once
items = ["task1", "task2", "task3", "task4", "task5"]
results = batch_process(items)
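Batched output is only useful if it parses. The sketch below is a defensive variant, assuming a model that supports JSON mode via the response_format parameter; if the reply still isn't valid JSON it returns None so the caller can retry or fall back to per-item calls instead of crashing the whole batch.
def batch_process_safe(items, model="gpt-3.5-turbo"):
    prompt = f"Return a JSON object mapping each item to its result: {json.dumps(items)}"
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},  # assumes a JSON-mode-capable model
    )
    try:
        return json.loads(response.choices[0].message.content)
    except json.JSONDecodeError:
        return None  # caller can retry or process items one at a time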
Streaming for Better UX
# Stream responses to show progress
# Costs same but improves user experience
def stream_response(prompt):
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        stream=True
    )
    full_response = ""
    for chunk in response:
        # delta.content is None on role/finish chunks, so guard before appending
        if chunk.choices[0].delta.content:
            content = chunk.choices[0].delta.content
            print(content, end='', flush=True)
            full_response += content
    return full_response
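A hypothetical call site, to show how it's used:
answer = stream_response("Explain tokenization in one paragraph.")
# Tokens print as they arrive; 'answer' holds the complete text afterwards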
Applied together, these optimizations can often cut LLM costs by 50-90% while maintaining quality!