
Multimodal AI: Combining Vision and Language Models

📅 December 05, 2025 ⏱️ 1 min read 🏷️ Generative AI

Multimodal AI models process and understand multiple types of data simultaneously, enabling richer interactions.

GPT-4 Vision API


from openai import OpenAI
import base64

# Create a client (reads OPENAI_API_KEY from the environment)
client = OpenAI()

# Encode the local image as base64
with open("image.jpg", "rb") as image_file:
    image_data = base64.b64encode(image_file.read()).decode()

# Analyze the image with a vision-capable model
# (gpt-4-vision-preview has been retired; gpt-4o accepts the same message format)
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_data}"
                    }
                }
            ]
        }
    ],
    max_tokens=300
)

print(response.choices[0].message.content)
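
For images that are already hosted online, you can pass the URL directly instead of base64-encoding the file. A minimal sketch of the same request; the example URL is a placeholder:

# Same request, but pointing at a hosted image instead of a base64 payload
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {"type": "image_url",
             "image_url": {"url": "https://example.com/photo.jpg"}}  # placeholder URL
        ]
    }],
    max_tokens=300
)
print(response.choices[0].message.content)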

Image-to-Text with Vision Models


# Detailed image analysis: ask several questions about the same image
image_url = f"data:image/jpeg;base64,{image_data}"  # reuse the encoded image from above

questions = [
    "Describe this image in detail",
    "What objects do you see?",
    "What is the mood or atmosphere?",
    "Are there any text elements visible?"
]

for question in questions:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image_url", "image_url": {"url": image_url}}
            ]
        }],
        max_tokens=300
    )
    print(f"Q: {question}")
    print(f"A: {response.choices[0].message.content}\n")

CLIP for Image-Text Matching


from transformers import CLIPProcessor, CLIPModel
from PIL import Image

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load image
image = Image.open("photo.jpg")

# Define candidate labels
labels = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]

# Process
inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)

# Get probabilities
logits_per_image = outputs.logits_per_image
probs = logits_per_image.softmax(dim=1)

# Results
for label, prob in zip(labels, probs[0]):
    print(f"{label}: {prob:.2%}")

Document Understanding


# Extract information from documents
def analyze_document(image_path, questions):
    with open(image_path, "rb") as img:
        image_data = base64.b64encode(img.read()).decode()

    results = {}
    for question in questions:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                ]
            }],
            max_tokens=300
        )
        results[question] = response.choices[0].message.content

    return results

# Use it (the vision endpoint expects an image, not a raw PDF)
questions = [
    "What is the total amount on this invoice?",
    "What is the invoice date?",
    "Who is the vendor?"
]

info = analyze_document("invoice.jpg", questions)
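
If the invoice only exists as a PDF, it needs to be rasterized to an image first. One way is the pdf2image package; this is a minimal sketch, and pdf2image (with its poppler dependency) is an assumption, not part of the setup above:

from pdf2image import convert_from_path

# Convert the first page of the PDF to a JPEG before sending it for analysis
pages = convert_from_path("invoice.pdf", dpi=200)
pages[0].save("invoice.jpg", "JPEG")

info = analyze_document("invoice.jpg", questions)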

Multimodal AI unlocks new possibilities by letting models reason over images and text together, from visual Q&A and zero-shot classification to document extraction.

🏷️ Tags:
multimodal AI, GPT-4 Vision, CLIP, image analysis, vision language
