AI-powered speech technologies enable natural voice interactions, transcriptions, and accessibility features.
OpenAI Whisper for Transcription
import openai

# Transcribe an audio file with OpenAI Whisper.
# NOTE: the file must be opened in binary mode ("rb"); the API accepts
# mp3, wav, m4a, and other common audio formats.
with open("audio.mp3", "rb") as audio_file:
    transcript = openai.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
    )

# The response object exposes the recognized text on `.text`.
print(transcript.text)
# Translate foreign-language speech directly to English text.
# Whisper's translation endpoint always outputs English, regardless of
# the source language.
with open("spanish.mp3", "rb") as audio_file:
    translation = openai.audio.translations.create(
        model="whisper-1",
        file=audio_file,
    )

print(translation.text)
Text-to-Speech with OpenAI
# Generate speech from text with OpenAI's TTS model.
response = openai.audio.speech.create(
    model="tts-1",
    voice="alloy",  # available voices: alloy, echo, fable, onyx, nova, shimmer
    input="Hello! This is AI-generated speech.",
)

# Save the synthesized audio to an mp3 file.
# NOTE(review): `stream_to_file` is deprecated in newer SDK versions in
# favor of `with_streaming_response` — confirm against your installed
# openai package version.
response.stream_to_file("output.mp3")
Real-Time Speech Processing
import speech_recognition as sr
import pyttsx3

# --- Speech recognition via the microphone ---
recognizer = sr.Recognizer()
with sr.Microphone() as source:
    print("Speak something...")
    audio = recognizer.listen(source)

try:
    # Uses the free Google Web Speech API (requires network access).
    text = recognizer.recognize_google(audio)
    print(f"You said: {text}")
except sr.UnknownValueError:
    print("Could not understand audio")
except sr.RequestError as e:
    # Network failure or API error — distinct from unintelligible speech.
    print(f"Speech service request failed: {e}")

# --- Text-to-speech (fully offline, local engine) ---
engine = pyttsx3.init()
engine.say("Hello, how can I help you?")
engine.runAndWait()  # blocks until the utterance finishes playing
Building a Voice Assistant
def voice_assistant():
    """Run one listen → think → speak cycle of a voice assistant.

    Pipeline: microphone capture -> Google speech-to-text -> GPT-4 for the
    answer -> OpenAI TTS, with the spoken reply saved to ``response.mp3``.

    Returns early (with a message) if the speech cannot be understood.
    """
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        print("Listening...")
        audio = recognizer.listen(source)

    # Transcribe the captured audio; bail out gracefully instead of
    # crashing when the speech is unintelligible.
    try:
        query = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Could not understand audio")
        return
    print(f"User: {query}")

    # Generate a text answer with the LLM.
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}],
    )
    answer = response.choices[0].message.content
    print(f"Assistant: {answer}")

    # Synthesize the answer and save it to disk.
    speech = openai.audio.speech.create(
        model="tts-1",
        voice="nova",
        input=answer,
    )
    speech.stream_to_file("response.mp3")


# Guard the entry point so importing this module does not open the mic.
if __name__ == "__main__":
    voice_assistant()
Speech AI enables natural voice interactions for accessible, hands-free applications!