---
asyncapi: "2.0.0-rc1"
id: "urn:com:speechmatics:realtime-asr-api"
defaultContentType: "application/json"

info:
  title: Speechmatics Realtime ASR API
  version: "2.0.0"
  contact:
    name: Speechmatics Support
    url: https://www.speechmatics.com/product/support/
    email: support@speechmatics.com

servers:
  - url: example.speechmatics.com
    description: RealTime ASR server
    protocol: WebSocket
    protocolVersion: v13 (RFC 6455)
    variables:
      ports:
        default: "9000"

channels:
  /:
    publish:
      message:
        oneOf:
          - $ref: "#/components/messages/StartRecognition"
          - $ref: "#/components/messages/AddAudio"
          - $ref: "#/components/messages/EndOfStream"
          - $ref: "#/components/messages/SetRecognitionConfig"
    subscribe:
      message:
        oneOf:
          - $ref: "#/components/messages/RecognitionStarted"
          - $ref: "#/components/messages/AudioAdded"
          - $ref: "#/components/messages/AddPartialTranscript"
          - $ref: "#/components/messages/AddTranscript"
          - $ref: "#/components/messages/AddPartialTranslation"
          - $ref: "#/components/messages/AddTranslation"
          - $ref: "#/components/messages/EndOfTranscript"
          - $ref: "#/components/messages/Error"
          - $ref: "#/components/messages/Warning"
          - $ref: "#/components/messages/Info"

components:
  messages:
    # Sub
    StartRecognition:
      summary: Initiates a new recognition session.
      payload:
        type: object
        properties:
          message:
            enum:
              - StartRecognition
          audio_format:
            $ref: "#/components/schemas/AudioFormat"
          transcription_config:
            $ref: "#/components/schemas/TranscriptionConfig"
          translation_config:
            $ref: "#/components/schemas/TranslationConfig"
        required:
          - message
          - audio_format
          - transcription_config
    AddAudio:
      summary: A binary chunk of audio. The server confirms receipt by sending an AudioAdded message.
      contentType: "application/octet-stream"
      payload:
        type: string
        format: binary
    EndOfStream:
      summary: Declares that the client has no more audio to send.
      payload:
        type: object
        properties:
          message:
            enum:
              - EndOfStream
          last_seq_no:
            type: integer
        required:
          - message
          - last_seq_no
    SetRecognitionConfig:
      summary: Allows the client to re-configure the recognition session.
      payload:
        type: object
        properties:
          message:
            enum:
              - SetRecognitionConfig
          transcription_config:
            $ref: "#/components/schemas/TranscriptionConfig"
        required:
          - message
          - transcription_config
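    # Illustrative only, not part of the specification: a minimal
    # StartRecognition frame a client might send as its first message. The
    # config values below are example choices, not API defaults.
    #
    #   {
    #     "message": "StartRecognition",
    #     "audio_format": {"type": "raw", "encoding": "pcm_s16le", "sample_rate": 16000},
    #     "transcription_config": {"language": "en", "enable_partials": true}
    #   }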
    # Pub
    RecognitionStarted:
      summary: Server response to StartRecognition, acknowledging that a recognition session has started.
      payload:
        type: object
        properties:
          message:
            enum:
              - RecognitionStarted
          id:
            type: string
        required:
          - message
    AudioAdded:
      summary: Server response to AddAudio, indicating that audio has been added successfully.
      payload:
        type: object
        properties:
          message:
            enum:
              - AudioAdded
          seq_no:
            type: integer
        required:
          - message
          - seq_no
    AddPartialTranscript:
      summary: Contains a work-in-progress transcript for a part of the audio that the client has sent.
      payload:
        type: object
        properties:
          message:
            enum:
              - AddPartialTranscript
          format:
            type: string
            example: "2.1"
            description: Speechmatics JSON output format version number.
          metadata:
            $ref: "#/components/schemas/RecognitionMetadata"
          results:
            type: array
            items:
              $ref: "#/components/schemas/RecognitionResult"
        required:
          - message
          - metadata
          - results
    AddTranscript:
      summary: Contains the final transcript for a part of the audio that the client has sent.
      payload:
        type: object
        properties:
          message:
            enum:
              - AddTranscript
          format:
            type: string
            example: "2.1"
            description: Speechmatics JSON output format version number.
          metadata:
            $ref: "#/components/schemas/RecognitionMetadata"
          results:
            type: array
            items:
              $ref: "#/components/schemas/RecognitionResult"
        required:
          - message
          - metadata
          - results
    AddPartialTranslation:
      summary: Contains a work-in-progress translation for a part of the audio that the client has sent.
      payload:
        type: object
        properties:
          message:
            enum:
              - AddPartialTranslation
          format:
            type: string
            example: "2.1"
            description: Speechmatics JSON output format version number.
          language:
            type: string
          results:
            type: array
            items:
              $ref: "#/components/schemas/TranslatedSentence"
        required:
          - message
          - language
          - results
    AddTranslation:
      summary: Contains the final translation for a part of the audio that the client has sent.
      payload:
        type: object
        properties:
          message:
            enum:
              - AddTranslation
          format:
            type: string
            example: "2.1"
            description: Speechmatics JSON output format version number.
          language:
            type: string
          results:
            type: array
            items:
              $ref: "#/components/schemas/TranslatedSentence"
        required:
          - message
          - language
          - results
    EndOfTranscript:
      summary: Server response to EndOfStream, sent after the server has finished sending all AddTranscript messages.
      payload:
        type: object
        properties:
          message:
            enum:
              - EndOfTranscript
        required:
          - message
    Info:
      summary: Additional information sent from the server to the client.
      payload:
        type: object
        properties:
          message:
            enum:
              - Info
          type:
            enum:
              - recognition_quality
              - model_redirect
              - deprecated
          reason:
            type: string
          code:
            type: integer
          seq_no:
            type: integer
          quality:
            type: string
        required:
          - message
          - type
          - reason
    Warning:
      summary: Warning messages sent from the server to the client.
      payload:
        type: object
        properties:
          message:
            enum:
              - Warning
          type:
            enum:
              - duration_limit_exceeded
          reason:
            type: string
          code:
            type: integer
          seq_no:
            type: integer
          duration_limit:
            type: number
        required:
          - message
          - type
          - reason
    Error:
      summary: Error messages sent from the server to the client.
      payload:
        type: object
        properties:
          message:
            enum:
              - Error
          type:
            enum:
              - invalid_message
              - invalid_model
              - invalid_config
              - invalid_audio_type
              - not_authorised
              - insufficient_funds
              - not_allowed
              - job_error
              - data_error
              - buffer_error
              - protocol_error
              - unknown_error
          reason:
            type: string
          code:
            type: integer
          seq_no:
            type: integer
        required:
          - message
          - type
          - reason
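    # Illustrative only, not part of the specification: the shape of an
    # AddTranscript frame the server might send. Times and contents are
    # invented for the example.
    #
    #   {
    #     "message": "AddTranscript",
    #     "format": "2.1",
    #     "metadata": {"start_time": 0.0, "end_time": 1.0, "transcript": "Hello."},
    #     "results": [
    #       {"type": "word", "start_time": 0.0, "end_time": 0.5,
    #        "alternatives": [{"content": "Hello", "confidence": 0.99, "language": "en"}]},
    #       {"type": "punctuation", "start_time": 0.5, "end_time": 0.5,
    #        "attaches_to": "previous", "is_eos": true,
    #        "alternatives": [{"content": ".", "confidence": 1.0, "language": "en"}]}
    #     ]
    #   }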
  schemas:
    AudioFormat:
      type: object
      required:
        - type
      properties:
        type:
          enum:
            - raw
            - file
        encoding:
          enum:
            - pcm_f32le
            - pcm_s16le
            - mulaw
        sample_rate:
          type: integer
    TranscriptionConfig:
      type: object
      properties:
        language:
          type: string
        domain:
          type: string
          description: >-
            Request a specialized model based on 'language' but optimized for a
            particular field, e.g. "finance" or "medical".
        output_locale:
          $ref: "#/components/schemas/OutputLocale"
        additional_vocab:
          $ref: "#/components/schemas/VocabList"
        diarization:
          $ref: "#/components/schemas/DiarizationConfig"
        max_delay:
          type: number
          minimum: 0
        max_delay_mode:
          $ref: "#/components/schemas/MaxDelayModeConfig"
        speaker_change_sensitivity:
          $ref: "#/components/schemas/SpeakerChangeSensitivity"
        speaker_diarization_config:
          $ref: "#/components/schemas/SpeakerDiarizationConfig"
        enable_partials:
          type: boolean
          default: false
        enable_entities:
          type: boolean
          default: true
        operating_point:
          $ref: "#/components/schemas/OperatingPoint"
        punctuation_overrides:
          $ref: "#/components/schemas/PunctuationOverrides"
      required:
        - language
    OperatingPoint:
      type: string
      enum:
        - standard
        - enhanced
    PunctuationOverrides:
      type: object
      properties:
        permitted_marks:
          type: array
          description: >-
            The punctuation marks which the client is prepared to accept in
            transcription output, or the special value 'all' (the default).
            Unsupported marks are ignored. This value is used to guide the
            transcription process.
          items:
            pattern: "^(.|all)$"
            type: string
        sensitivity:
          type: number
          description: >-
            Ranges between zero and one. Higher values will produce more
            punctuation. The default is 0.5.
          format: float
          maximum: 1
          minimum: 0
    TranslationConfig:
      type: object
      properties:
        target_languages:
          type: array
          items:
            type: string
        enable_partials:
          type: boolean
          default: false
      required:
        - target_languages
    VocabList:
      type: array
      items:
        $ref: "#/components/schemas/VocabWord"
    VocabWord:
      oneOf:
        - type: object
          properties:
            content:
              type: string
              minLength: 1
            sounds_like:
              type: array
              items:
                type: string
                minLength: 1
              minItems: 1
          required:
            - content
        - type: string
          minLength: 1
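    # Illustrative only, not part of the specification: additional_vocab
    # accepts either plain strings or objects with pronunciation hints, so a
    # transcription_config might combine both forms (example values):
    #
    #   {
    #     "language": "en",
    #     "additional_vocab": [
    #       "Speechmatics",
    #       {"content": "gnocchi", "sounds_like": ["nyohki", "nokey"]}
    #     ],
    #     "diarization": "speaker",
    #     "max_delay": 4
    #   }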
    DiarizationConfig:
      type: string
      enum:
        - none
        - speaker
        - speaker_change
    SpeakerChangeSensitivity:
      type: number
      format: float
      minimum: 0
      maximum: 1
    SpeakerDiarizationConfig:
      type: object
      properties:
        max_speakers:
          type: number
          format: integer
          minimum: 2
          maximum: 100
    OutputLocale:
      type: string
      minLength: 1
    RecognitionMetadata:
      type: object
      properties:
        start_time:
          type: number
          format: float
        end_time:
          type: number
          format: float
        transcript:
          type: string
      required:
        - start_time
        - end_time
        - transcript
    RecognitionResult:
      type: object
      properties:
        type:
          type: string
          enum:
            - word
            - punctuation
            - speaker_change
        start_time:
          type: number
          format: float
        end_time:
          type: number
          format: float
        channel:
          type: string
        attaches_to:
          type: string
          enum:
            - next
            - previous
            - none
            - both
        is_eos:
          type: boolean
        alternatives:
          type: array
          items:
            $ref: "#/components/schemas/RecognitionAlternative"
        score:
          type: number
          format: float
          minimum: 0
          maximum: 1
      required:
        - type
        - start_time
        - end_time
    TranslatedSentence:
      type: object
      properties:
        content:
          type: string
        start_time:
          type: number
          format: float
        end_time:
          type: number
          format: float
        speaker:
          type: string
      required:
        - content
        - start_time
        - end_time
    RecognitionAlternative:
      type: object
      properties:
        content:
          type: string
        confidence:
          type: number
          format: float
        language:
          # Although language is technically optional, removing it breaks the
          # way adapters add spaces and language-specific punctuation. This
          # caused REQ-10454. The solution for Bellini was to add the language
          # field back in at the rt-worker level. Future work on this issue is
          # written up in REQ-10633.
          type: string
        display:
          $ref: "#/components/schemas/RecognitionDisplay"
        speaker:
          type: string
        tags:
          type: array
          items:
            type: string
      required:
        - content
        - confidence
    RecognitionDisplay:
      required:
        - direction
      properties:
        direction:
          type: string
          enum:
            - ltr
            - rtl
    MaxDelayModeConfig:
      type: string
      enum:
        - flexible
        - fixed
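# Illustrative only, not part of the specification: the typical message flow
# for a single session, as implied by the message summaries above.
#
#   client -> StartRecognition       (JSON text frame)
#   server -> RecognitionStarted     {"message": "RecognitionStarted", "id": "..."}
#   client -> AddAudio               (binary frames, repeated)
#   server -> AudioAdded             {"message": "AudioAdded", "seq_no": N}
#   server -> AddPartialTranscript / AddTranscript
#                                    (interleaved with AudioAdded while
#                                     audio is being processed)
#   client -> EndOfStream            {"message": "EndOfStream", "last_seq_no": N}
#   server -> remaining AddTranscript messages
#   server -> EndOfTranscript        (the session can now be closed)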