Multimodal AI models process several kinds of data at once, such as text and images, enabling richer interactions than text-only models.
GPT-4 Vision API
import base64
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

# Encode the local image as base64 so it can be sent inline
with open("image.jpg", "rb") as image_file:
    image_data = base64.b64encode(image_file.read()).decode()

# Analyze the image with a vision-capable chat model
response = client.chat.completions.create(
    model="gpt-4-vision-preview",  # newer vision models such as gpt-4o also accept this format
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_data}"
                    },
                },
            ],
        }
    ],
    max_tokens=300,
)
print(response.choices[0].message.content)
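Base64 encoding is only needed for local files. The same endpoint also accepts an ordinary https URL for the image, which is simpler when the file is already hosted somewhere; a quick sketch (the URL is a placeholder):
# Sketch: pass a hosted image by URL instead of base64 (placeholder URL)
response = client.chat.completions.create(
    model="gpt-4-vision-preview",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}},
        ],
    }],
    max_tokens=300,
)
print(response.choices[0].message.content)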
Image-to-Text with Vision Models
# Detailed image analysis: ask several questions about the same image
image_url = f"data:image/jpeg;base64,{image_data}"  # or a public https URL

questions = [
    "Describe this image in detail",
    "What objects do you see?",
    "What is the mood or atmosphere?",
    "Are there any text elements visible?",
]

for question in questions:
    response = client.chat.completions.create(
        model="gpt-4-vision-preview",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }],
        max_tokens=300,
    )
    print(f"Q: {question}")
    print(f"A: {response.choices[0].message.content}\n")
CLIP for Image-Text Matching
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load image
image = Image.open("photo.jpg")

# Define candidate labels
labels = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]

# Preprocess the text and image into model inputs
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Convert image-text similarity logits into probabilities
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)

# Results
for label, prob in zip(labels, probs[0]):
    print(f"{label}: {prob:.2%}")
Document Understanding
# Extract information from documents (the file must be an image, not a PDF)
def analyze_document(image_path, questions):
    with open(image_path, "rb") as img:
        image_data = base64.b64encode(img.read()).decode()
    results = {}
    for question in questions:
        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
                ],
            }],
            max_tokens=300,
        )
        results[question] = response.choices[0].message.content
    return results

# Use it on a scanned invoice image
questions = [
    "What is the total amount on this invoice?",
    "What is the invoice date?",
    "Who is the vendor?",
]
info = analyze_document("invoice.jpg", questions)
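The chat endpoint expects image input, so a PDF invoice has to be rasterized first. One way to do that is the pdf2image package (which requires poppler); a small sketch assuming it is installed:
# Sketch: rasterize a PDF invoice before analysis
# (assumes pdf2image and its poppler dependency are installed)
from pdf2image import convert_from_path

pages = convert_from_path("invoice.pdf", dpi=200)
pages[0].save("invoice.jpg", "JPEG")  # analyze only the first page here
info = analyze_document("invoice.jpg", questions)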
Multimodal AI unlocks new possibilities by letting a single model reason over images and text together, from visual Q&A to document extraction.