Vector databases store high-dimensional vectors and search them by similarity, enabling semantic search and retrieval-augmented generation (RAG) applications at scale.
Understanding Embeddings
from openai import OpenAI

# One model name shared by every request below, so all vectors come from the
# same model and can be compared with each other.
EMBEDDING_MODEL = "text-embedding-3-small"

# No API key is passed here.
# NOTE(review): the SDK is expected to read it from the OPENAI_API_KEY
# environment variable - confirm for your setup.
client = OpenAI()

# Create an embedding for a single piece of text.
text = "Machine learning is fascinating"
response = client.embeddings.create(
    model=EMBEDDING_MODEL,
    input=text,
)
embedding = response.data[0].embedding  # 1536-dimensional vector
print(f"Embedding length: {len(embedding)}")

# Similar texts have similar vectors.
texts = ["AI is amazing", "Dogs are cute", "Artificial intelligence rocks"]

# `input` also accepts a list of strings, so the whole batch is embedded in
# one request instead of one network round-trip per text.
batch_response = client.embeddings.create(model=EMBEDDING_MODEL, input=texts)

# Every result carries the index of the input it belongs to; sorting on it
# guarantees that embeddings[i] is the vector for texts[i].
embeddings = [
    item.embedding
    for item in sorted(batch_response.data, key=lambda item: item.index)
]
ChromaDB for Local Development
import os

import chromadb
from chromadb.utils import embedding_functions

# Initialize client
client = chromadb.Client()

# Build the embedding function the collection uses to vectorize documents
# and queries. The key is read from the environment so that no secret is
# committed to source; a missing variable fails fast with a KeyError.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.environ["OPENAI_API_KEY"],
    model_name="text-embedding-3-small",
)

# Create collection.
# get_or_create_collection avoids the error create_collection raises when
# a collection named "my_documents" already exists (e.g. when this cell is
# run a second time against the same client).
collection = client.get_or_create_collection(
    name="my_documents",
    embedding_function=openai_ef,
)

# Add documents: the three lists are parallel, one entry per document.
collection.add(
    documents=["AI is the future", "Machine learning powers AI"],
    metadatas=[{"source": "doc1"}, {"source": "doc2"}],
    ids=["id1", "id2"],
)

# Query: ask for the 2 stored documents closest to the query text.
results = collection.query(
    query_texts=["What is AI?"],
    n_results=2,
)
Pinecone for Production
import os

import pinecone

# Initialize.
# The module-level pinecone.init() was removed in v3 of the Python client;
# connection settings now live on a Pinecone instance. The key is read from
# the environment so that no secret is committed to source.
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Create index (only if it does not exist yet).
index_name = "my-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536,  # OpenAI embedding size
        metric="cosine",
        # The environment formerly passed to init() is now part of the
        # index spec.
        spec=pinecone.PodSpec(environment="us-west1-gcp"),
    )

# Connect to index
index = pc.Index(index_name)

# Upsert vectors as (id, values, metadata) tuples.
# NOTE(review): embedding1 / embedding2 are not defined in this snippet;
# presumably they are 1536-dimensional vectors produced by the embedding
# model - define them before running.
index.upsert(vectors=[
    ("vec1", embedding1, {"text": "AI content"}),
    ("vec2", embedding2, {"text": "ML content"}),
])

# Search: return the 5 nearest vectors together with their metadata.
# NOTE(review): query_embedding is likewise not defined in this snippet.
results = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True,
)
Weaviate for Advanced Features
import weaviate

# Connect
client = weaviate.Client("http://localhost:8080")

# Create schema: a "Document" class with two properties.
# NOTE(review): the text2vec-openai vectorizer needs an OpenAI API key and
# nothing in this snippet supplies one - presumably it is configured on the
# Weaviate server; confirm.
schema = {
    "class": "Document",
    "vectorizer": "text2vec-openai",
    "properties": [
        {"name": "content", "dataType": ["text"]},
        {"name": "author", "dataType": ["string"]},
    ],
}

# Only create the class when it is missing, so re-running the snippet does
# not fail on an already existing "Document" class.
if not client.schema.exists("Document"):
    client.schema.create_class(schema)

# Add data
client.data_object.create(
    class_name="Document",
    data_object={
        "content": "AI trends in 2024",
        "author": "Alice",
    },
)

# Hybrid search (vector + keyword)
results = (
    client.query.get("Document", ["content", "author"])
    .with_hybrid(query="AI", alpha=0.5)
    .with_limit(5)
    .do()
)
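Vector databases are essential for modern AI applications. Choose the one that fits your needs: ChromaDB for local development, Pinecone for production, or Weaviate for advanced features such as hybrid search.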