Generate speech
Learn how to use and integrate the Soniox Text-to-Speech REST API.
Overview
The Soniox REST API provides request/response Text-to-Speech over HTTP. Send text and generation options in a single POST /tts call and receive the generated audio in the response body.
REST is the right fit for:
- Server-side generation → Render audio on a backend and store or serve the result.
- Batch jobs → Pre-render prompts, menus, notifications, or static narration.
- Non-interactive flows → Anywhere a single request-per-utterance model is simpler than managing a live WebSocket.
For low-latency, interactive, or LLM-driven use cases where audio must start playing before the full text is ready, use the real-time WebSocket API instead.
Endpoint
POST https://tts-rt.soniox.com/tts
Required request headers:
Content-Type: application/json
Authorization: Bearer <SONIOX_API_KEY|SONIOX_TEMPORARY_API_KEY>
See the Generate speech API reference for the full request schema, parameters, and error codes.
Response
On success, the response body contains raw audio bytes for the generated speech. The Content-Type header reflects the requested audio_format:
| audio_format | Content-Type |
|---|---|
| pcm_f32le, pcm_s16le, pcm_mulaw, pcm_alaw | audio/pcm |
| wav | audio/wav |
| mp3 | audio/mpeg |
| aac | audio/aac |
| opus | audio/opus |
| flac | audio/flac |
For supported sample rates and bitrates per format, see audio formats.
Error handling
- Before audio streaming begins → the server returns a standard JSON error body with `error_code` and `error_message`.
- After audio streaming begins → error details are sent as HTTP trailers: `X-Tts-Error-Code` and `X-Tts-Error-Message`.
Mid-stream errors reported via HTTP trailers (X-Tts-Error-Code, X-Tts-Error-Message) may not be surfaced by HTTP clients that ignore trailers, including browser fetch and the Soniox JS SDK. For guaranteed error delivery, use the real-time WebSocket API instead.
For the full list of error codes and messages, see the Generate speech API reference.
Code example
Prerequisite: Complete the steps in Get started.
See on GitHub: soniox_sdk_rest.py.
import argparse
import os
from pathlib import Path
from typing import Any
from soniox import SonioxClient
from soniox.errors import SonioxAPIError
from soniox.utils import output_file_for_audio_format
# Values this script accepts for --sample_rate (checked in main()).
VALID_SAMPLE_RATES = [8000, 16000, 24000, 44100, 48000]
# Values this script accepts for --bitrate (checked in main()).
VALID_BITRATES = [32000, 64000, 96000, 128000, 192000, 256000, 320000]
# Values this script accepts for --audio_format (checked in main()).
# See: soniox.com/docs/tts/concepts/audio-formats
VALID_AUDIO_FORMATS = [
"pcm_f32le",
"pcm_s16le",
"pcm_mulaw",
"pcm_alaw",
"wav",
"aac",
"mp3",
"opus",
"flac",
]
def get_config(
*,
text: str,
model: str,
language: str,
voice: str,
audio_format: str,
sample_rate: int | None,
bitrate: int | None,
) -> dict[str, Any]:
config: dict[str, Any] = {
# Select the model to use.
# See: soniox.com/docs/tts/models
"model": model,
#
# Set the language of the input text.
# See: soniox.com/docs/tts/concepts/supported-languages
"language": language,
#
# Select the voice to use.
# See: soniox.com/docs/tts/concepts/voices
"voice": voice,
#
# Audio format.
# See: soniox.com/docs/tts/concepts/audio-formats
"audio_format": audio_format,
#
# Input text.
"text": text,
}
if sample_rate is not None:
config["sample_rate"] = sample_rate
if bitrate is not None:
config["bitrate"] = bitrate
return config
def generate_speech(
    client: SonioxClient,
    config: dict[str, Any],
    output_path: str | None,
) -> None:
    """Generate speech for ``config`` and save the audio to a file.

    When ``output_path`` is empty or ``None`` the destination comes from
    ``output_file_for_audio_format`` (presumably a generated, timestamped
    path -- confirm against the SDK).
    """
    if output_path:
        destination = Path(output_path)
    else:
        # NOTE(review): the "tts_async" prefix looks inherited from another
        # example; confirm it is intended for the REST sample.
        destination = output_file_for_audio_format(
            config["audio_format"], "tts_async"
        )

    print("Generating speech...")
    written = client.tts.generate_to_file(destination, **config)
    print(f"Wrote {written} bytes to {destination.resolve()}")
def main() -> None:
    """Parse the command line, validate it, and generate speech via the SDK.

    Raises:
        ValueError: if ``--audio_format``, ``--sample_rate`` or ``--bitrate``
            is not one of the values listed in the ``VALID_*`` constants.
        RuntimeError: if the ``SONIOX_API_KEY`` environment variable is unset.
        SystemExit: with status 1 when the Soniox API reports an error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        default=(
            "Soniox Text-to-Speech turns written text into natural, expressive audio "
            "with high accuracy. It is designed for conversational agents, narration, "
            "and accessible experiences, with low latency and high-quality voices."
        ),
        help="Text to generate into speech.",
    )
    parser.add_argument("--model", default="tts-rt-v1")
    parser.add_argument("--language", default="en")
    parser.add_argument("--voice", default="Adrian")
    parser.add_argument("--audio_format", default="wav")
    parser.add_argument("--sample_rate", type=int)
    parser.add_argument("--bitrate", type=int)
    parser.add_argument(
        "--output_path",
        help="Optional output file path. If omitted, a timestamped path is generated.",
    )
    args = parser.parse_args()

    if args.audio_format not in VALID_AUDIO_FORMATS:
        raise ValueError(f"audio_format must be one of {VALID_AUDIO_FORMATS}")
    if args.sample_rate is not None and args.sample_rate not in VALID_SAMPLE_RATES:
        raise ValueError(f"sample_rate must be None or one of {VALID_SAMPLE_RATES}")
    if args.bitrate is not None and args.bitrate not in VALID_BITRATES:
        raise ValueError(f"bitrate must be None or one of {VALID_BITRATES}")

    api_key = os.environ.get("SONIOX_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Missing SONIOX_API_KEY.\n"
            "1. Get your API key at https://console.soniox.com\n"
            "2. Run: export SONIOX_API_KEY=<YOUR_API_KEY>"
        )

    client = SonioxClient(api_key=api_key)
    config = get_config(
        text=args.text,
        model=args.model,
        language=args.language,
        voice=args.voice,
        audio_format=args.audio_format,
        sample_rate=args.sample_rate,
        bitrate=args.bitrate,
    )
    try:
        generate_speech(client, config, args.output_path)
    except SonioxAPIError as exc:
        print("Soniox API error:", exc)
        if exc.request_id:
            print(" request_id:", exc.request_id)
        # Exit non-zero so shells and batch jobs can detect the failure;
        # the finally clause below still runs before the process exits.
        raise SystemExit(1)
    finally:
        client.close()
if __name__ == "__main__":
    main()
# Generate speech with default settings (wav output)
python soniox_sdk_rest.py --text "Hello from Soniox REST Text-to-Speech."
# Generate raw PCM output
python soniox_sdk_rest.py --audio_format pcm_s16le --sample_rate 24000 --output_path tts-output.pcm
See on GitHub: soniox_sdk_rest.js.
import fs from "fs";
import { SonioxNodeClient } from "@soniox/node";
import path from "path";
import { parseArgs } from "node:util";
import process from "process";
// Values this script accepts for --sample_rate (checked in main()).
const VALID_SAMPLE_RATES = [8000, 16000, 24000, 44100, 48000];
// Values this script accepts for --bitrate (checked in main()).
const VALID_BITRATES = [32000, 64000, 96000, 128000, 192000, 256000, 320000];
// Values this script accepts for --audio_format (checked in main()).
// See: soniox.com/docs/tts/concepts/audio-formats
const VALID_AUDIO_FORMATS = [
"pcm_f32le",
"pcm_s16le",
"pcm_mulaw",
"pcm_alaw",
"wav",
"aac",
"mp3",
"opus",
"flac",
];
// Formats treated as headerless PCM: main() defaults their sample rate to
// 24000 and resolveOutputPath() picks their file extension.
const RAW_PCM_FORMATS = ["pcm_s16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"];
// Text used when --text is not given.
const DEFAULT_TEXT =
"Soniox Text-to-Speech turns written text into natural, expressive audio " +
"with high accuracy. It is designed for conversational agents, narration, " +
"and accessible experiences, with low latency and high-quality voices.";
// Initialize the client.
// The API key is read from the SONIOX_API_KEY environment variable.
const client = new SonioxNodeClient();
// Pick the file path the audio will be written to.
// A path that already has an extension is returned untouched. Otherwise an
// extension is appended based on audio_format:
//   * pcm_s16le     -> .wav (generateSpeech wraps the bytes in a WAV container)
//   * other pcm_*   -> .pcm (raw, no container)
//   * anything else -> the format name (e.g. .flac, .mp3, .opus)
// With no path at all, the base name "tts_rest" is used.
function resolveOutputPath(outputPath, audioFormat) {
  const hasExtension = Boolean(outputPath) && path.extname(outputPath) !== "";
  if (hasExtension) {
    return outputPath;
  }

  let extension = audioFormat;
  if (audioFormat === "pcm_s16le") {
    extension = "wav";
  } else if (RAW_PCM_FORMATS.includes(audioFormat)) {
    extension = "pcm";
  }

  return `${outputPath || "tts_rest"}.${extension}`;
}
// Prefix raw 16-bit PCM bytes with a 44-byte RIFF/WAVE header and return the
// combined Buffer. numChannels defaults to 1.
function pcmS16leToWav(pcm, { sampleRate, numChannels = 1 }) {
  const BITS_PER_SAMPLE = 16;
  const bytesPerFrame = (numChannels * BITS_PER_SAMPLE) / 8;
  const payloadBytes = pcm.byteLength;

  const riff = Buffer.alloc(44);

  // RIFF descriptor: total size is everything after these first 8 bytes.
  riff.write("RIFF", 0, "ascii");
  riff.writeUInt32LE(payloadBytes + 36, 4);
  riff.write("WAVE", 8, "ascii");

  // "fmt " sub-chunk: 16-byte body, format tag 1.
  riff.write("fmt ", 12, "ascii");
  riff.writeUInt32LE(16, 16);
  riff.writeUInt16LE(1, 20);
  riff.writeUInt16LE(numChannels, 22);
  riff.writeUInt32LE(sampleRate, 24);
  riff.writeUInt32LE(sampleRate * bytesPerFrame, 28);
  riff.writeUInt16LE(bytesPerFrame, 32);
  riff.writeUInt16LE(BITS_PER_SAMPLE, 34);

  // "data" sub-chunk header; the samples follow immediately.
  riff.write("data", 36, "ascii");
  riff.writeUInt32LE(payloadBytes, 40);

  return Buffer.concat([riff, Buffer.from(pcm)]);
}
// Assemble the options for one TTS request.
// sample_rate and bitrate are included only when a value was supplied.
function getConfig({
  text,
  model,
  language,
  voice,
  audioFormat,
  sampleRate,
  bitrate,
}) {
  const optional = {};
  if (sampleRate !== undefined) {
    optional.sample_rate = sampleRate;
  }
  if (bitrate !== undefined) {
    optional.bitrate = bitrate;
  }

  return {
    // Model that generates the audio.
    // See: soniox.com/docs/tts/models
    model,
    // Language of the input text.
    // See: soniox.com/docs/tts/concepts/supported-languages
    language,
    // Voice to speak with.
    // See: soniox.com/docs/tts/concepts/voices
    voice,
    // Format of the returned audio.
    // See: soniox.com/docs/tts/concepts/audio-formats
    audio_format: audioFormat,
    // Text to turn into speech.
    text,
    ...optional,
  };
}
// Generate speech for `config` and write it to `outputPath`.
// pcm_s16le audio destined for a ".wav" path is fetched into memory and
// wrapped in a WAV header; every other combination is written by the SDK's
// generateToFile().
async function generateSpeech({ config, outputPath, sampleRate }) {
  console.log("Generating speech...");

  const needsWavHeader =
    config.audio_format === "pcm_s16le" &&
    path.extname(outputPath).toLowerCase() === ".wav";

  if (!needsWavHeader) {
    const written = await client.tts.generateToFile(outputPath, config);
    console.log(`Wrote ${written} bytes to ${path.resolve(outputPath)}`);
    return;
  }

  const samples = await client.tts.generate(config);
  const wavBytes = pcmS16leToWav(samples, { sampleRate });
  fs.writeFileSync(outputPath, wavBytes);
  console.log(`Wrote ${wavBytes.length} bytes to ${path.resolve(outputPath)}`);
}
// Entry point: read CLI flags, validate them, then generate speech.
// Throws an Error when a flag value is not in the matching VALID_* list.
async function main() {
  const { values: argv } = parseArgs({
    options: {
      text: { type: "string", default: DEFAULT_TEXT },
      model: { type: "string", default: "tts-rt-v1" },
      language: { type: "string", default: "en" },
      voice: { type: "string", default: "Adrian" },
      audio_format: { type: "string", default: "pcm_s16le" },
      sample_rate: { type: "string" },
      bitrate: { type: "string" },
      output_path: { type: "string" },
    },
  });

  // Numeric flags arrive as strings; leave absent ones undefined.
  const toNumber = (raw) => (raw === undefined ? undefined : Number(raw));
  const audioFormat = argv.audio_format;

  if (!VALID_AUDIO_FORMATS.includes(audioFormat)) {
    throw new Error(
      `audio_format must be one of ${VALID_AUDIO_FORMATS.join(", ")}`,
    );
  }

  // Raw PCM formats fall back to 24000 when no rate was given, so the WAV
  // wrapper in generateSpeech always has a rate to write.
  const sampleRate =
    toNumber(argv.sample_rate) ??
    (RAW_PCM_FORMATS.includes(audioFormat) ? 24000 : undefined);
  if (sampleRate !== undefined && !VALID_SAMPLE_RATES.includes(sampleRate)) {
    throw new Error(
      `sample_rate must be one of ${VALID_SAMPLE_RATES.join(", ")}`,
    );
  }

  const bitrate = toNumber(argv.bitrate);
  if (bitrate !== undefined && !VALID_BITRATES.includes(bitrate)) {
    throw new Error(`bitrate must be one of ${VALID_BITRATES.join(", ")}`);
  }

  const outputPath = resolveOutputPath(argv.output_path, audioFormat);
  const config = getConfig({
    text: argv.text,
    model: argv.model,
    language: argv.language,
    voice: argv.voice,
    audioFormat,
    sampleRate,
    bitrate,
  });

  await generateSpeech({ config, outputPath, sampleRate });
}
main().catch((err) => {
  console.error("Soniox TTS error:", err.message);
  process.exit(1);
});
# Generate speech with default settings (wav output)
node soniox_sdk_rest.js --text "Hello from Soniox REST Text-to-Speech."
# Generate raw PCM output
node soniox_sdk_rest.js --audio_format pcm_s16le --sample_rate 24000 --output_path tts-output.pcm
See on GitHub: soniox_rest.py.
import argparse
import os
from typing import Any, Optional
import requests
from requests import Session
# REST endpoint that main() passes to generate_speech().
SONIOX_TTS_URL = "https://tts-rt.soniox.com/tts"
# Model sent with every request by get_config().
# See: soniox.com/docs/tts/models
MODEL = "tts-rt-v1"
# Values this script accepts for --sample_rate (checked in main()).
VALID_SAMPLE_RATES = [8000, 16000, 24000, 44100, 48000]
# Values this script accepts for --bitrate (checked in main()).
VALID_BITRATES = [32000, 64000, 96000, 128000, 192000, 256000, 320000]
# Values this script accepts for --audio_format (checked in main()).
# See: soniox.com/docs/tts/concepts/audio-formats
VALID_AUDIO_FORMATS = [
"pcm_f32le",
"pcm_s16le",
"pcm_mulaw",
"pcm_alaw",
"wav",
"aac",
"mp3",
"opus",
"flac",
]
def get_output_path(*, output_path: str, audio_format: str) -> str:
    """Return ``output_path`` with a file extension matching ``audio_format``.

    A path whose final component already has an extension is returned
    unchanged. Otherwise ``.pcm`` is appended for the raw ``pcm_*`` formats
    and ``.<audio_format>`` for everything else.

    The extension test uses ``os.path.splitext``, so a leading dot alone
    (``.hidden``) does not count as an extension.
    """
    _, existing_ext = os.path.splitext(output_path)
    if existing_ext:
        return output_path

    raw_pcm_formats = ("pcm_s16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw")
    ext = "pcm" if audio_format in raw_pcm_formats else audio_format
    return f"{output_path}.{ext}"
# Build the JSON body for a Soniox TTS request.
def get_config(
    *,
    language: str,
    voice: str,
    audio_format: str,
    text: str,
    sample_rate: Optional[int],
    bitrate: Optional[int],
) -> dict:
    """Return the request body for one Text-to-Speech call.

    The model is always the module-level ``MODEL``. ``sample_rate`` and
    ``bitrate`` are included only when they are not ``None``.
    """
    body: dict[str, Any] = {
        # Model that generates the audio.
        # See: soniox.com/docs/tts/models
        "model": MODEL,
        # Language of the input text.
        # See: soniox.com/docs/tts/concepts/supported-languages
        "language": language,
        # Voice to speak with.
        # See: soniox.com/docs/tts/concepts/voices
        "voice": voice,
        # Format of the returned audio.
        # See: soniox.com/docs/tts/concepts/audio-formats
        "audio_format": audio_format,
        # Text to turn into speech.
        "text": text,
    }
    extras = {"sample_rate": sample_rate, "bitrate": bitrate}
    return {
        **body,
        **{name: value for name, value in extras.items() if value is not None},
    }
def generate_speech(
    session: Session,
    tts_url: str,
    config: dict,
    output_path: str,
    timeout: Optional[float] = 60.0,
) -> None:
    """POST ``config`` to ``tts_url`` and stream the audio into ``output_path``.

    Args:
        session: Session that already carries the Authorization header.
        tts_url: Endpoint to POST to.
        config: JSON-serializable request body.
        output_path: File that receives the audio bytes.
        timeout: Seconds passed to ``requests`` as the request timeout;
            ``None`` waits indefinitely.

    Raises:
        RuntimeError: if the server answers with a status other than 200.

    Note:
        Only errors reported before streaming starts are detected here;
        errors sent as HTTP trailers mid-stream are not inspected.
    """
    print("Connecting to Soniox...")
    # The context manager closes the streamed response on every exit path,
    # releasing the connection back to the session's pool.
    with session.post(tts_url, json=config, stream=True, timeout=timeout) as res:
        if res.status_code != 200:
            try:
                err = res.json()
            except ValueError:
                # Body was not JSON; fall back to the raw text.
                err = {"error_message": res.text}
            if not isinstance(err, dict):
                # Valid JSON that is not an object has no error fields.
                err = {"error_message": res.text}
            raise RuntimeError(
                f"TTS request failed (status={res.status_code}, "
                f"error_code={err.get('error_code')}, "
                f"error_message={err.get('error_message')})"
            )

        bytes_written = 0
        with open(output_path, "wb") as f:
            for chunk in res.iter_content(chunk_size=8192):
                if not chunk:
                    continue
                f.write(chunk)
                bytes_written += len(chunk)

    print(f"Wrote {bytes_written} bytes to {output_path}")
def main() -> None:
    """Parse the command line, validate it, and generate speech over REST.

    Raises:
        ValueError: if ``--audio_format``, ``--sample_rate`` or ``--bitrate``
            is not one of the values listed in the ``VALID_*`` constants.
        RuntimeError: if ``SONIOX_API_KEY`` is unset, or if the request fails
            (raised by ``generate_speech``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        default=(
            "Soniox Text-to-Speech turns written text into natural, expressive audio "
            "with high accuracy. It is designed for conversational agents, narration, "
            "and accessible experiences, with low latency and high-quality voices."
        ),
    )
    parser.add_argument("--language", default="en")
    parser.add_argument("--voice", default="Adrian")
    parser.add_argument("--audio_format", default="wav")
    parser.add_argument("--sample_rate", type=int)
    parser.add_argument("--bitrate", type=int)
    parser.add_argument("--output_path", default="tts-rest")
    args = parser.parse_args()

    if args.audio_format not in VALID_AUDIO_FORMATS:
        raise ValueError(f"audio_format must be one of {VALID_AUDIO_FORMATS}")
    if args.sample_rate is not None and args.sample_rate not in VALID_SAMPLE_RATES:
        raise ValueError(f"sample_rate must be None or one of {VALID_SAMPLE_RATES}")
    if args.bitrate is not None and args.bitrate not in VALID_BITRATES:
        raise ValueError(f"bitrate must be None or one of {VALID_BITRATES}")

    api_key = os.environ.get("SONIOX_API_KEY")
    if not api_key:
        raise RuntimeError(
            "Missing SONIOX_API_KEY.\n"
            "1. Get your API key at https://console.soniox.com\n"
            "2. Run: export SONIOX_API_KEY=<YOUR_API_KEY>"
        )

    final_output_path = get_output_path(
        output_path=args.output_path, audio_format=args.audio_format
    )
    config = get_config(
        language=args.language,
        voice=args.voice,
        audio_format=args.audio_format,
        text=args.text,
        sample_rate=args.sample_rate,
        bitrate=args.bitrate,
    )

    # Create an authenticated session; the with-block closes it (and its
    # pooled connections) even when generate_speech raises.
    with requests.Session() as session:
        session.headers.update({"Authorization": f"Bearer {api_key}"})
        generate_speech(session, SONIOX_TTS_URL, config, final_output_path)
if __name__ == "__main__":
    main()
# Generate speech with default settings (wav output)
python soniox_rest.py --text "Hello from Soniox REST Text-to-Speech."
# Generate raw PCM output
python soniox_rest.py --audio_format pcm_s16le --sample_rate 24000 --output_path tts-output
See on GitHub: soniox_rest.js.
import fs from "fs";
import path from "path";
import { Readable } from "stream";
import { pipeline } from "stream/promises";
import { parseArgs } from "node:util";
import process from "process";
// REST endpoint that generateSpeech() POSTs to.
const SONIOX_TTS_URL = "https://tts-rt.soniox.com/tts";
// Model sent with every request by getConfig().
// See: soniox.com/docs/tts/models
const MODEL = "tts-rt-v1";
// Values this script accepts for --sample_rate (checked in main()).
const VALID_SAMPLE_RATES = [8000, 16000, 24000, 44100, 48000];
// Values this script accepts for --bitrate (checked in main()).
const VALID_BITRATES = [32000, 64000, 96000, 128000, 192000, 256000, 320000];
// Values this script accepts for --audio_format (checked in main()).
// See: soniox.com/docs/tts/concepts/audio-formats
const VALID_AUDIO_FORMATS = [
"pcm_f32le",
"pcm_s16le",
"pcm_mulaw",
"pcm_alaw",
"wav",
"aac",
"mp3",
"opus",
"flac",
];
// Formats treated as headerless PCM: main() defaults their sample rate to
// 24000 and resolveOutputPath() picks their file extension.
const RAW_PCM_FORMATS = ["pcm_s16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"];
// Text used when --text is not given.
const DEFAULT_TEXT =
"Soniox Text-to-Speech turns written text into natural, expressive audio " +
"with high accuracy. It is designed for conversational agents, narration, " +
"and accessible experiences, with low latency and high-quality voices.";
// Pick the file path the audio will be written to.
// A path that already has an extension is returned untouched. Otherwise an
// extension is appended based on audio_format:
//   * pcm_s16le     -> .wav (generateSpeech wraps the bytes in a WAV container)
//   * other pcm_*   -> .pcm (raw, no container)
//   * anything else -> the format name (e.g. .flac, .mp3, .opus)
// With no path at all, the base name "tts-rest" is used.
function resolveOutputPath(outputPath, audioFormat) {
  const hasExtension = Boolean(outputPath) && path.extname(outputPath) !== "";
  if (hasExtension) {
    return outputPath;
  }

  let extension = audioFormat;
  if (audioFormat === "pcm_s16le") {
    extension = "wav";
  } else if (RAW_PCM_FORMATS.includes(audioFormat)) {
    extension = "pcm";
  }

  return `${outputPath || "tts-rest"}.${extension}`;
}
// Prefix raw 16-bit PCM bytes with a 44-byte RIFF/WAVE header and return the
// combined Buffer. numChannels defaults to 1.
function pcmS16leToWav(pcm, { sampleRate, numChannels = 1 }) {
  const BITS_PER_SAMPLE = 16;
  const bytesPerFrame = (numChannels * BITS_PER_SAMPLE) / 8;
  const payloadBytes = pcm.byteLength;

  const riff = Buffer.alloc(44);

  // RIFF descriptor: total size is everything after these first 8 bytes.
  riff.write("RIFF", 0, "ascii");
  riff.writeUInt32LE(payloadBytes + 36, 4);
  riff.write("WAVE", 8, "ascii");

  // "fmt " sub-chunk: 16-byte body, format tag 1.
  riff.write("fmt ", 12, "ascii");
  riff.writeUInt32LE(16, 16);
  riff.writeUInt16LE(1, 20);
  riff.writeUInt16LE(numChannels, 22);
  riff.writeUInt32LE(sampleRate, 24);
  riff.writeUInt32LE(sampleRate * bytesPerFrame, 28);
  riff.writeUInt16LE(bytesPerFrame, 32);
  riff.writeUInt16LE(BITS_PER_SAMPLE, 34);

  // "data" sub-chunk header; the samples follow immediately.
  riff.write("data", 36, "ascii");
  riff.writeUInt32LE(payloadBytes, 40);

  return Buffer.concat([riff, Buffer.from(pcm)]);
}
// Assemble the JSON body for one TTS request.
// The model is always MODEL; sample_rate and bitrate are included only when
// a value was supplied.
function getConfig({
  language,
  voice,
  audioFormat,
  text,
  sampleRate,
  bitrate,
}) {
  const optional = {};
  if (sampleRate !== undefined) {
    optional.sample_rate = sampleRate;
  }
  if (bitrate !== undefined) {
    optional.bitrate = bitrate;
  }

  return {
    // Model that generates the audio.
    // See: soniox.com/docs/tts/models
    model: MODEL,
    // Language of the input text.
    // See: soniox.com/docs/tts/concepts/supported-languages
    language,
    // Voice to speak with.
    // See: soniox.com/docs/tts/concepts/voices
    voice,
    // Format of the returned audio.
    // See: soniox.com/docs/tts/concepts/audio-formats
    audio_format: audioFormat,
    // Text to turn into speech.
    text,
    ...optional,
  };
}
// POST `config` to the Soniox TTS endpoint and write the audio to `outputPath`.
//
// pcm_s16le audio destined for a ".wav" path is buffered and wrapped in a WAV
// header (using `sampleRate`); every other combination is streamed straight
// to disk.
//
// Throws an Error when the response status is not OK or the response has no
// body. Only errors reported before streaming starts are detected here;
// errors delivered as HTTP trailers mid-stream are not visible through fetch.
async function generateSpeech({ apiKey, config, outputPath, sampleRate }) {
  console.log("Connecting to Soniox...");
  const res = await fetch(SONIOX_TTS_URL, {
    method: "POST",
    headers: {
      // Soniox REST TTS uses Bearer auth.
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(config),
  });

  if (!res.ok) {
    // A fetch body can be consumed only once, so read it as text and parse
    // that. Calling res.text() after a failed res.json() would throw and
    // hide the real HTTP error.
    const raw = await res.text();
    let err;
    try {
      const parsed = JSON.parse(raw);
      // Valid JSON that is not an object (e.g. null) has no error fields.
      err =
        parsed !== null && typeof parsed === "object"
          ? parsed
          : { error_message: raw };
    } catch {
      err = { error_message: raw };
    }
    throw new Error(
      `TTS request failed (status=${res.status}, ` +
        `error_code=${err.error_code}, error_message=${err.error_message})`,
    );
  }

  if (!res.body) {
    throw new Error("Empty response body.");
  }

  const wrapInWav =
    config.audio_format === "pcm_s16le" &&
    path.extname(outputPath).toLowerCase() === ".wav";

  if (wrapInWav) {
    // Buffer the full PCM response, then write a WAV file with a correct
    // data-chunk size so the output plays in every media player.
    const pcm = Buffer.from(await res.arrayBuffer());
    const wav = pcmS16leToWav(pcm, { sampleRate });
    fs.writeFileSync(outputPath, wav);
    console.log(`Wrote ${wav.length} bytes to ${path.resolve(outputPath)}`);
  } else {
    // Stream the audio response directly to the output file.
    const fileStream = fs.createWriteStream(outputPath);
    await pipeline(Readable.fromWeb(res.body), fileStream);
    const { size } = fs.statSync(outputPath);
    console.log(`Wrote ${size} bytes to ${path.resolve(outputPath)}`);
  }
}
// Entry point: read CLI flags, validate them, then generate speech.
// Throws an Error when a flag value is not in the matching VALID_* list or
// when SONIOX_API_KEY is not set.
async function main() {
  const { values: argv } = parseArgs({
    options: {
      text: { type: "string", default: DEFAULT_TEXT },
      language: { type: "string", default: "en" },
      voice: { type: "string", default: "Adrian" },
      audio_format: { type: "string", default: "pcm_s16le" },
      sample_rate: { type: "string" },
      bitrate: { type: "string" },
      output_path: { type: "string", default: "tts-rest" },
    },
  });

  // Numeric flags arrive as strings; leave absent ones undefined.
  const toNumber = (raw) => (raw === undefined ? undefined : Number(raw));
  const audioFormat = argv.audio_format;

  if (!VALID_AUDIO_FORMATS.includes(audioFormat)) {
    throw new Error(
      `audio_format must be one of ${VALID_AUDIO_FORMATS.join(", ")}`,
    );
  }

  // Raw PCM formats fall back to 24000 when no rate was given, so the WAV
  // wrapper in generateSpeech always has a rate to write.
  const sampleRate =
    toNumber(argv.sample_rate) ??
    (RAW_PCM_FORMATS.includes(audioFormat) ? 24000 : undefined);
  if (sampleRate !== undefined && !VALID_SAMPLE_RATES.includes(sampleRate)) {
    throw new Error(
      `sample_rate must be one of ${VALID_SAMPLE_RATES.join(", ")}`,
    );
  }

  const bitrate = toNumber(argv.bitrate);
  if (bitrate !== undefined && !VALID_BITRATES.includes(bitrate)) {
    throw new Error(`bitrate must be one of ${VALID_BITRATES.join(", ")}`);
  }

  const apiKey = process.env.SONIOX_API_KEY;
  if (!apiKey) {
    throw new Error(
      "Missing SONIOX_API_KEY.\n" +
        "1. Get your API key at https://console.soniox.com\n" +
        "2. Run: export SONIOX_API_KEY=<YOUR_API_KEY>",
    );
  }

  const outputPath = resolveOutputPath(argv.output_path, audioFormat);
  const config = getConfig({
    language: argv.language,
    voice: argv.voice,
    audioFormat,
    text: argv.text,
    sampleRate,
    bitrate,
  });

  await generateSpeech({ apiKey, config, outputPath, sampleRate });
}
main().catch((err) => {
  console.error("Error:", err.message);
  process.exit(1);
});
# Generate speech with default settings (wav output)
node soniox_rest.js --text "Hello from Soniox REST Text-to-Speech."
# Generate raw PCM output
node soniox_rest.js --audio_format pcm_s16le --sample_rate 24000 --output_path tts-output.pcm