Generative AI

Text-to-Speech and Speech-to-Text AI: Complete Guide

📅 December 05, 2025 ⏱️ 1 min read 👁️ 4 views 🏷️ Generative AI

AI-powered speech technologies enable natural voice interactions, transcription, and accessibility features.

OpenAI Whisper for Transcription


import openai

# Speech-to-text: upload the recording to Whisper and keep the result object.
with open("audio.mp3", mode="rb") as recording:
    transcript = openai.audio.transcriptions.create(file=recording, model="whisper-1")

# The recognized text is exposed on the result's `.text` attribute.
print(transcript.text)

# Speech translation: same upload flow, but through the translations
# endpoint, which returns English text.
with open("spanish.mp3", mode="rb") as recording:
    translation = openai.audio.translations.create(file=recording, model="whisper-1")

print(translation.text)

Text-to-Speech with OpenAI


# Generate speech from text and save it to a file.
# The streaming-response variant is used because calling `stream_to_file` on
# the plain `speech.create(...)` result is deprecated in the v1 SDK and
# buffers the whole audio in memory instead of streaming it to disk.
with openai.audio.speech.with_streaming_response.create(
    model="tts-1",
    voice="alloy",  # alloy, echo, fable, onyx, nova, shimmer
    input="Hello! This is AI-generated speech.",
) as response:
    # Write the audio to disk chunk by chunk as it arrives.
    response.stream_to_file("output.mp3")

Real-Time Speech Processing


import speech_recognition as sr
import pyttsx3

# Speech recognition
recognizer = sr.Recognizer()

# Hold the microphone only while recording; the recognition request below
# is a network call and does not need the device.
with sr.Microphone() as source:
    print("Speak something...")
    audio = recognizer.listen(source)

try:
    text = recognizer.recognize_google(audio)
except sr.UnknownValueError:
    # The service answered but could not make out any speech.
    print("Could not understand audio")
except sr.RequestError as error:
    # The service could not be reached or rejected the request.
    print(f"Speech recognition request failed: {error}")
else:
    print(f"You said: {text}")

# Text-to-speech (local)
engine = pyttsx3.init()
engine.say("Hello, how can I help you?")
# Block until the queued utterance has been spoken.
engine.runAndWait()

Building a Voice Assistant


def voice_assistant():
    """Run one listen -> answer -> speak round trip.

    Records a single utterance from the microphone, transcribes it with
    ``recognize_google``, sends the text to the chat model, prints the
    reply, and writes the spoken reply to ``response.mp3``.

    Returns early (after printing a message) when the speech could not be
    understood or the recognition request failed. Returns ``None`` in all
    cases.
    """
    recognizer = sr.Recognizer()

    # Hold the microphone only while recording; the network calls below
    # do not need the device.
    with sr.Microphone() as source:
        print("Listening...")
        audio = recognizer.listen(source)

    # Transcribe
    try:
        query = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        print("Could not understand audio")
        return
    except sr.RequestError as error:
        print(f"Speech recognition request failed: {error}")
        return
    print(f"User: {query}")

    # Process with LLM
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": query}],
    )

    answer = response.choices[0].message.content
    print(f"Assistant: {answer}")

    # Speak response: synthesize the answer and stream the audio to disk.
    # The streaming-response variant replaces the deprecated
    # `stream_to_file` on the plain `speech.create(...)` result.
    with openai.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="nova",
        input=answer,
    ) as speech:
        speech.stream_to_file("response.mp3")


# Only start the assistant when run as a script, not when imported.
if __name__ == "__main__":
    voice_assistant()

Speech AI enables natural voice interactions for accessible, hands-free applications!

🏷️ Tags:
text-to-speech speech-to-text Whisper voice AI audio processing

📚 Related Articles