---
# OpenAPI 3.0 description of the Cartesia REST API: text-to-speech streaming
# (raw bytes and Server-Sent Events), voice CRUD, voice cloning, and a status
# endpoint. Shared parameters, request bodies, responses and schemas live under
# `components` and are referenced with `$ref`.
openapi: 3.0.0
info:
  title: Cartesia REST API
  version: "2024-06-10"
servers:
  # NOTE(review): the server URL is an empty string. Confirm whether the real
  # base URL was intentionally left out or should be filled in here.
  - url: ""
# Default security requirement for every operation; an operation opts out by
# declaring its own `security: []` (see the "/" status endpoint).
security:
  - APIKeyHeader: []
paths:
  "/tts/bytes":
    post:
      summary: Stream Speech (Bytes)
      description: >-
        Generate audio from a transcript using a given voice and model.
        The audio is streamed out as raw bytes.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      requestBody:
        $ref: "#/components/requestBodies/TTSRequest"
      responses:
        "200":
          description: A stream of audio data.
          content:
            application/octet-stream:
              schema:
                type: string
                format: binary
                description: >-
                  Binary audio data. The underlying bytes will be in the
                  requested output format.
        default:
          $ref: "#/components/responses/Error"
  "/tts/sse":
    post:
      summary: Stream Speech (Server-Sent Events)
      description: >-
        Generate audio from a transcript using a given voice and model.
        The audio is streamed out as Server-Sent Events.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      requestBody:
        $ref: "#/components/requestBodies/TTSRequest"
      responses:
        "200":
          description: A stream of audio data.
          content:
            text/event-stream:
              schema:
                # Each event is one of the three JSON response shapes below.
                oneOf:
                  - $ref: "#/components/schemas/JSONChunkResponse"
                  - $ref: "#/components/schemas/JSONDoneResponse"
                  - $ref: "#/components/schemas/JSONErrorResponse"
                # NOTE(review): `status_code` is an integer and no `mapping`
                # is given, so tooling may not be able to resolve this
                # discriminator to a schema. Confirm it works with the
                # generators in use.
                discriminator:
                  propertyName: status_code
        default:
          $ref: "#/components/responses/Error"
  "/voices":
    get:
      summary: List Voices
      description: >-
        List all voices available to the user, that is, public voices and
        the user's own voices.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      responses:
        "200":
          description: List of voices.
          content:
            application/json:
              schema:
                title: Voice List
                type: array
                items:
                  $ref: "#/components/schemas/Voice"
        # Same catch-all error response every other operation declares.
        default:
          $ref: "#/components/responses/Error"
    post:
      summary: Create Voice
      description: >-
        Create a new voice with a given name, description, and embedding.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              required:
                - name
                - description
                - embedding
              properties:
                name:
                  $ref: "#/components/schemas/VoiceName"
                description:
                  $ref: "#/components/schemas/VoiceDescription"
                embedding:
                  $ref: "#/components/schemas/VoiceEmbedding"
      responses:
        "201":
          description: The created voice.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Voice"
        default:
          $ref: "#/components/responses/Error"
  "/voices/{id}":
    get:
      summary: Get Voice
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
        - in: path
          name: id
          required: true
          schema:
            $ref: "#/components/schemas/VoiceId"
      responses:
        "200":
          description: The voice.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Voice"
        default:
          $ref: "#/components/responses/Error"
    delete:
      summary: Delete Voice
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
        - in: path
          name: id
          required: true
          schema:
            $ref: "#/components/schemas/VoiceId"
      responses:
        "204":
          description: Voice deleted.
        default:
          $ref: "#/components/responses/Error"
  "/voices/clone/clip":
    post:
      summary: Clone Voice (Clip)
      description: >-
        Clones a voice from an audio clip uploaded as a file. The clip is
        uploaded using multipart/form-data with a `clip` field containing
        the audio file.
      operationId: clone_voice_from_clip
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              # The description above says the audio is carried in `clip`,
              # so the field is marked required.
              required:
                - clip
              properties:
                clip:
                  type: string
                  format: binary
      responses:
        "200":
          $ref: "#/components/responses/VoiceEmbeddingResponse"
        default:
          $ref: "#/components/responses/Error"
  "/voices/clone/url":
    post:
      summary: Clone Voice (URL)
      description: >-
        Clone a voice from an online URL. Currently only supports YouTube.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
        - required: true
          schema:
            type: string
            title: Link
            description: The URL of the video to clone the voice from.
            example: ""
          name: link
          in: query
      responses:
        "200":
          $ref: "#/components/responses/VoiceEmbeddingResponse"
        default:
          $ref: "#/components/responses/Error"
  "/":
    get:
      # Empty requirement list: overrides the document-level API key
      # requirement for this operation.
      security: []
      summary: API Status and Version
      description: >-
        Returns the server's version and a status message which can be
        useful for sanity checking.
      responses:
        "200":
          description: Everything OK.
          content:
            application/json:
              schema:
                type: object
                properties:
                  version:
                    type: string
                    title: version
                  status:
                    type: boolean
                    title: ok
        default:
          $ref: "#/components/responses/Error"
components:
  securitySchemes:
    # API key sent in the `X-API-Key` request header.
    APIKeyHeader:
      type: apiKey
      in: header
      name: X-API-Key
  parameters:
    # Required version header referenced by every operation except "/".
    Cartesia-Version:
      in: header
      name: Cartesia-Version
      description: The version of the Cartesia API to use.
      required: true
      schema:
        type: string
        example: "2024-06-10"
  requestBodies:
    # Request body shared by both text-to-speech endpoints.
    TTSRequest:
      required: true
      content:
        application/json:
          schema:
            title: TTS Request
            type: object
            required:
              - model_id
              - transcript
              - voice
              - output_format
            properties:
              model_id:
                title: Model ID
                type: string
                example: sonic-english
              transcript:
                title: Transcript
                type: string
                example: "Hello, world! I'm generating audio on Cartesia."
              duration:
                title: Duration
                description: The maximum duration of the audio in seconds.
                type: integer
              voice:
                title: Voice
                description: >-
                  The voice to use for the speech. Can be either an ID or an
                  embedding, specified by the `mode` field.
                # The `&voiceID` and `&voiceEmbedding` anchors defined in the
                # examples below are reused as the examples of the `VoiceId`
                # and `VoiceEmbedding` schemas, so they must stay ahead of
                # `components.schemas` in the document.
                oneOf:
                  - type: object
                    # `mode` is the discriminator property, so it must be
                    # listed as required.
                    required:
                      - mode
                    properties:
                      mode:
                        type: string
                        enum: [id]
                      id:
                        $ref: "#/components/schemas/VoiceId"
                    example:
                      mode: "id"
                      id: &voiceID "a0e99841-438c-4a64-b679-ae501e7d6091"
                  - type: object
                    # `mode` is the discriminator property, so it must be
                    # listed as required.
                    required:
                      - mode
                    properties:
                      mode:
                        type: string
                        enum: [embedding]
                      embedding:
                        $ref: "#/components/schemas/VoiceEmbedding"
                    example:
                      mode: "embedding"
                      embedding: &voiceEmbedding [
                        -0.033633083, 0.072083704, -0.01807767, -0.083488315,
                        -0.04407617, 0.0022592682, 0.070505895, 0.023946615,
                        -0.04788024, -0.06388413, -0.0716355, -0.0022612812,
                        -0.0053448505, -0.07848381, 0.0348162, -0.053745482,
                        -0.092399485, -0.02950225, 0.028591828, -0.10556894,
                        0.023313355, 0.06224387, 0.0362463, 0.029258432,
                        0.10769641, 0.043595582, -0.058543224, -0.080402784,
                        -0.0953816, -0.008988032, -0.0028981369, -0.004752721,
                        -0.20742874, 0.058907595, 0.08813939, -0.06192675,
                        0.099082634, -0.09661578, -0.0077761724, -0.013982456,
                        -0.025798267, 0.04467142, 0.026222011, 0.023023574,
                        0.011227064, -0.17462021, -0.09880612, -0.1521035,
                        -0.060464993, -0.04735665, -0.09725187, -0.006127679,
                        0.15818526, -0.039493002, -0.067719474, 0.0066190436,
                        -0.10636633, 0.17073768, -0.051717706, 0.03186961,
                        -0.020547207, -0.02244247, 0.013196935, -0.06431055,
                        -0.115360335, 0.016918058, -0.033195216, 0.11255181,
                        0.020366343, -0.041032124, 0.08780918, -0.040567942,
                        0.057276532, 0.05848221, -0.077479474, -0.073524915,
                        -0.01913317, -0.029291833, 0.11210393, -0.09859328,
                        0.2152541, -0.022976823, 0.028627992, -0.039598297,
                        0.041829932, -0.05593181, -0.06444655, -0.018057477,
                        -0.008098263, 0.05994528, 0.10430693, -0.13121894,
                        -0.06512868, -0.026126215, 0.046727825, -0.17180993,
                        -0.10577226, -0.08610466, 0.008862588, 0.09547498,
                        -0.010965332, -0.061217085, -0.038954042, 0.019930292,
                        -0.017192135, 0.007296275, 0.03273872, 0.04389937,
                        -0.056483064, 0.003420891, -0.10319067, -0.015706042,
                        0.1308774, -0.0018035866, -0.03582506, 0.077131025,
                        0.013398928, 0.003188886, 0.12039741, -0.033974767,
                        0.06899378, -0.059775922, -0.026934423, 0.028482193,
                        0.100996524, 0.004498743, -0.02291186, 0.078752205,
                        -0.0063796206, 0.04206536, 0.05721349, 0.06290694,
                        0.06130212, 0.096969016, -0.057664312, -0.16727506,
                        -0.035220966, 0.090760484, 0.010039947, 0.06513242,
                        0.011055657, -0.004258431, -0.08316792, -0.15650468,
                        -0.076931365, 0.11385587, -0.038372636, 0.015648656,
                        -0.12029895, -0.06604956, 0.009441591, -0.11912808,
                        0.013378132, 0.029525978, -0.0056742397, -0.0075976513,
                        0.019999338, -0.05521377, -0.07650746, -0.017710293,
                        -0.033986397, -0.047768556, 0.13857274, 0.099290825,
                        0.11736938, 0.017834296, -0.07140237, -0.052047748,
                        -0.06398965, -0.037033975, -0.061061256, -0.03330076,
                        -0.024472248, -0.059656, 0.05359946, -0.043915518,
                        -0.086325996, 0.14189173, 0.021086395, 0.02945159,
                        0.1029604, 0.018490415, -0.028736332, -0.025272416,
                        -0.06082937, -0.031339463, -0.0007249595, 0.025595888,
                        0.007144545, -0.16938712, -0.1160664, -0.0654145,
                        ]
                discriminator:
                  propertyName: mode
              output_format:
                title: Output Format
                type: object
                required: [container, encoding, sample_rate]
                properties:
                  container:
                    type: string
                    enum: ["raw"]
                  encoding:
                    type: string
                    enum: ["pcm_s16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"]
                  sample_rate:
                    type: integer
                    enum: [8000, 16000, 22050, 24000, 44100]
              language:
                title: Language
                type: string
                enum: ["en", "es", "fr", "de", "pt", "zh", "ja"]
                example: "en"
  responses:
    # Plain-text error body used as the `default` response of the operations.
    Error:
      description: An error response.
      content:
        text/plain:
          schema:
            title: Error Description
            type: string
            example: error in voice
    # Returned by both voice-cloning endpoints.
    VoiceEmbeddingResponse:
      description: The voice embedding.
      content:
        application/json:
          schema:
            title: Voice Embedding
            properties:
              embedding:
                $ref: "#/components/schemas/VoiceEmbedding"
            type: object
            required:
              - embedding
  schemas:
    Timestamp:
      title: Timestamp
      type: string
    Voice:
      type: object
      properties:
        id:
          $ref: "#/components/schemas/VoiceId"
        user_id:
          type: string
        is_public:
          type: boolean
        name:
          $ref: "#/components/schemas/VoiceName"
        description:
          $ref: "#/components/schemas/VoiceDescription"
        created_at:
          $ref: "#/components/schemas/Timestamp"
        embedding:
          $ref: "#/components/schemas/VoiceEmbedding"
      required:
        - id
        - user_id
        - is_public
        - name
        - description
        - created_at
        - embedding
    VoiceDescription:
      title: Voice Description
      type: string
      example: "A deep, rich, male voice with an Indian accent."
    VoiceId:
      title: Voice ID
      type: string
      # Alias of the example ID anchored in the TTSRequest body.
      example: *voiceID
    VoiceEmbedding:
      title: Voice Embedding
      items:
        type: number
      type: array
      # Alias of the example embedding anchored in the TTSRequest body.
      example: *voiceEmbedding
    VoiceName:
      title: VoiceName
      type: string
      example: "Barbershop Man"
    # NOTE(review): not referenced anywhere in this document; the clip upload
    # endpoint inlines an equivalent schema. Confirm whether it is still needed.
    Clip:
      title: Clip
      type: string
      format: binary
    # Fields shared by every SSE JSON event. The `&statusCode` and `&done`
    # anchors are merged into the specialised responses below with the `<<`
    # merge key, which then narrow the allowed values.
    JSONBaseResponse:
      title: JSON Base Response
      type: object
      properties:
        status_code: &statusCode
          title: Status Code
          type: integer
          description: >-
            HTTP status code to allow for error handling. This is for errors
            encountered while streaming the response.
        done: &done
          title: Done
          type: boolean
          description: Whether the model has finished generating audio.
    # An audio chunk event: status 206, `done` is false.
    JSONChunkResponse:
      title: JSON Chunk Response
      type: object
      allOf:
        - $ref: "#/components/schemas/JSONBaseResponse"
        - type: object
          properties:
            status_code:
              <<: *statusCode
              enum: [206]
            done:
              <<: *done
              enum: [false]
            data:
              title: Data
              type: string
              format: byte
              description: >-
                Binary audio data encoded as base64. The underlying bytes
                will be in the requested output format.
            step_time:
              title: Step Time
              type: integer
              description: >-
                The time in milliseconds that the model took to generate
                this chunk of audio.
    # The final event of a successful stream: status 200, `done` is true.
    JSONDoneResponse:
      title: JSON Done Response
      type: object
      allOf:
        - $ref: "#/components/schemas/JSONBaseResponse"
        - type: object
          properties:
            status_code:
              <<: *statusCode
              enum: [200]
            done:
              <<: *done
              enum: [true]
    # An error event: status in the 400-599 range, `done` is true.
    JSONErrorResponse:
      title: JSON Error Response
      type: object
      allOf:
        - $ref: "#/components/schemas/JSONBaseResponse"
        - type: object
          properties:
            status_code:
              <<: *statusCode
              minimum: 400
              maximum: 599
            done:
              <<: *done
              enum: [true]
            error:
              title: Error
              type: string
              description: A human-readable error message.