Proxy stream

How to stream audio from a client app to Soniox Speech-to-Text WebSocket API through a proxy server.

Overview

This guide explains how to stream microphone audio from a client to the Soniox WebSocket API through a proxy server.

In this architecture, the client captures audio and sends it over WebSocket to a proxy server. The proxy server establishes a connection to the Soniox WebSocket API, authenticates the session, streams the audio for transcription, and relays the transcribed results back to the client in real time.

This setup is useful when you want to inspect, transform, or store audio and transcription data on the server side before passing it to the client. If your goal is simply to transcribe audio and return results with the lowest possible latency, consider using the direct stream approach instead.

Example

In the following example, we create a WebSocket proxy server that:

Listens for incoming WebSocket connections from the client.
Forwards audio data from the client to the WebSocket API.
Relays transcription results back to the client.

Authentication with the Soniox WebSocket API is handled by the proxy server using the SONIOX_API_KEY environment variable.

The following Python server acts as a proxy between our client and the Soniox WebSocket API.

import asyncio
import json
import os
from collections import deque

import websockets
from dotenv import load_dotenv
 
# Load environment variables from a local .env file (python-dotenv);
# SONIOX_API_KEY and PORT are read from the environment further below.
load_dotenv()
 
 
async def handle_client(websocket):
    """Proxy one browser WebSocket connection to the Soniox STT WebSocket API.

    Messages received from the browser client are forwarded to Soniox, and
    every message received from Soniox is relayed back to the browser
    unchanged. Client messages that arrive before the Soniox connection is
    ready are buffered and flushed, in arrival order, once it is.

    Args:
        websocket: The accepted client connection passed to this handler
            by ``websockets.serve``.
    """
    print("Browser client connected")

    # Buffer for client messages received before the Soniox WebSocket API
    # connection is ready, so we don't lose any audio.
    message_queue = deque()
    soniox_ws = None
    soniox_ws_ready = False
    # Set once the Soniox connection has ended (or failed to open), so later
    # client messages are dropped instead of being queued without bound.
    soniox_ws_closed = False

    async def init_soniox_connection():
        """Open the Soniox session, flush queued audio and relay responses."""
        nonlocal soniox_ws, soniox_ws_ready, soniox_ws_closed

        try:
            soniox_ws = await websockets.connect(
                "wss://stt-rt.soniox.com/transcribe-websocket"
            )
            print("Connected to Soniox STT WebSocket API")

            # Send initial configuration message
            start_message = json.dumps(
                {
                    "api_key": os.getenv("SONIOX_API_KEY"),
                    "audio_format": "auto",
                    "model": "stt-rt-preview",
                    "language_hints": ["en"],
                }
            )
            await soniox_ws.send(start_message)
            print("Sent start message to Soniox")

            # Flush the backlog BEFORE marking the connection ready. While
            # we await each send, the client loop keeps appending new
            # messages to the queue, so they are sent after the older ones
            # instead of overtaking them.
            while message_queue:
                await forward_data(message_queue.popleft())

            # There is no await between the emptiness check above and this
            # assignment, so no client message can slip in between them.
            soniox_ws_ready = True

            # receive messages from Soniox STT WebSocket API
            async for message in soniox_ws:
                try:
                    await websocket.send(message)
                except Exception as e:
                    print(f"Error forwarding Soniox response: {e}")
                    break

        except Exception as e:
            print(f"Soniox WebSocket error: {e}")
        finally:
            # Update the flags first so the client loop stops forwarding and
            # queueing even if closing the socket below is interrupted.
            soniox_ws_ready = False
            soniox_ws_closed = True
            message_queue.clear()
            if soniox_ws:
                await soniox_ws.close()
            print("Soniox WebSocket closed")

    async def forward_data(data):
        """Send one client message to Soniox, logging (not raising) failures."""
        try:
            if soniox_ws:
                await soniox_ws.send(data)
        except Exception as e:
            print(f"Error forwarding data to Soniox: {e}")

    # initialize Soniox connection
    soniox_task = asyncio.create_task(init_soniox_connection())

    try:
        # receive messages from browser client
        async for data in websocket:
            if soniox_ws_ready:
                # forward messages instantly
                await forward_data(data)
            elif not soniox_ws_closed:
                # queue the message to be processed
                # as soon as connection to Soniox STT WebSocket API is ready
                message_queue.append(data)
            # else: the Soniox session is over; there is nowhere to send
            # this message, so it is dropped.
    except Exception as e:
        print(f"Error with browser client: {e}")
    finally:
        print("Browser client disconnected")
        # Stop the Soniox relay task and wait for its cleanup to finish.
        soniox_task.cancel()
        try:
            await soniox_task
        except asyncio.CancelledError:
            pass
 
 
async def main():
    """Start the WebSocket proxy server and run until it is closed.

    The listening port is taken from the PORT environment variable and
    falls back to 3001 when it is not set.
    """
    port = int(os.environ.get("PORT", 3001))
    proxy_server = await websockets.serve(handle_client, "0.0.0.0", port)
    print(f"WebSocket proxy server listening on http://0.0.0.0:{port}")

    # Keep the coroutine alive for as long as the server is running.
    await proxy_server.wait_closed()
 
 
# Run the proxy server only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

View example on GitHub

Next, we create a basic HTML page as the client (same concept works for any other app framework).

The HTML client:

Connects to the proxy server via WebSocket.
Captures audio stream from the microphone through the MediaRecorder.
Streams audio data to the proxy server.
Receives messages from the proxy server and renders transcribed text into a div.

<!DOCTYPE html>
<html>

<body>
  <h1>Browser proxy stream example</h1>
  <button id="trigger">Start</button>
  <hr />
  <div>
    <span id="final"></span>
    <span id="nonfinal" style="color: gray"></span>
  </div>
  <div id="error"></div>
  <script>
    // Minimal client: captures microphone audio with MediaRecorder, streams
    // it to the proxy server over WebSocket and renders the transcript
    // messages that the proxy relays back.
    const finalEl = document.getElementById("final");
    const nonFinalEl = document.getElementById("nonfinal");
    const errorEl = document.getElementById("error");
    const trigger = document.getElementById("trigger");

    let ws;
    let recorder;
    let mediaStream; // microphone stream of the current session, if any
    let recorderState = "stopped"; // "stopped" | "starting" | "running" | "stopping"

    trigger.onclick = async () => {
      if (recorderState === "stopped") {
        finalEl.textContent = "";
        nonFinalEl.textContent = "";
        errorEl.textContent = "";
        trigger.textContent = "Starting...";
        recorderState = "starting";

        // get audio stream from user microphone
        try {
          mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
        } catch (err) {
          // permission denied or no microphone: report it and reset the UI
          // instead of staying stuck in the "starting" state
          console.error("getUserMedia error:", err);
          errorEl.textContent = `Could not access microphone: ${err.message}`;
          stopRecording();
          return;
        }

        // connect to the proxy server
        ws = new WebSocket("ws://localhost:3001/");

        ws.onopen = () => {
          recorder = new MediaRecorder(mediaStream);

          recorder.ondataavailable = (event) => {
            if (event.data.size > 0 && ws.readyState === WebSocket.OPEN) {
              // send the recorded audio chunk (Blob) as a binary message;
              // sending it synchronously keeps the chunks in order and
              // guarantees the last chunk goes out before the end-of-stream
              // message sent from onstop
              ws.send(event.data);
            }
          };

          recorder.onstop = () => {
            // send empty string message to tell the Soniox WebSocket API to stop
            if (ws.readyState === WebSocket.OPEN) {
              ws.send("");
            }
          };

          // start recording, creating data chunks every 120ms
          recorder.start(120);

          recorderState = "running";
          trigger.textContent = "Stop";
        };

        let finalText = "";

        ws.onmessage = (event) => {
          // parse messages received from the proxy server
          const result = JSON.parse(event.data);

          if (result.error_message) {
            errorEl.textContent = `${result.error_message}`;
            return;
          }

          // render the transcript: final tokens accumulate across messages,
          // non-final tokens are replaced by each new message
          let nonFinalText = "";

          for (let token of result.tokens || []) {
            if (token.is_final) {
              finalText += token.text;
            } else {
              nonFinalText += token.text;
            }
          }

          finalEl.textContent = finalText;
          nonFinalEl.textContent = nonFinalText;
        };

        ws.onerror = (error) => {
          console.error("WebSocket error:", error);
          // the error event carries no message text, so show a generic one
          errorEl.textContent = "WebSocket connection error";
          stopRecording();
        };

        ws.onclose = (event) => {
          console.log("WebSocket connection closed", event.code);
          stopRecording();
        };
      } else if (recorderState === "running") {
        stopRecording();
      }
    };

    // Stop recording, release the microphone and reset the UI.
    // Safe to call more than once (e.g. from onerror followed by onclose).
    function stopRecording() {
      if (recorder && recorder.state !== "inactive") {
        // calling stop() on an inactive recorder throws InvalidStateError
        recorder.stop();
      }
      if (mediaStream) {
        // stop microphone recording properly, even if the recorder was
        // never created because the socket failed before opening
        mediaStream.getTracks().forEach((t) => t.stop());
        mediaStream = null;
      }
      trigger.textContent = "Start";
      recorderState = "stopped";
    }
  </script>
</body>

</html>

View example on GitHub

Proxy stream

Overview

Example

On this page