# OpenAPI description of the Cartesia REST API.
openapi: "3.0.0"

info:
  title: Cartesia REST API
  # Quoted so YAML loaders keep the date-shaped version as a string.
  version: "2024-06-10"

# Base URL that every path below is resolved against.
servers:
  - url: "https://api.cartesia.ai"

# Document-wide default: every operation needs the API key header unless it
# overrides `security` itself.
security:
  - APIKeyHeader: []

paths:
  # Text-to-speech, returned as one raw byte stream.
  "/tts/bytes":
    post:
      summary: Stream Speech (Bytes)
      description: >-
        Generate audio from a transcript using a given voice and model.
        The audio is streamed out as raw bytes.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      requestBody:
        $ref: "#/components/requestBodies/TTSRequest"
      responses:
        "200":
          description: A stream of audio data.
          content:
            application/octet-stream:
              schema:
                description: >-
                  Binary audio data. The underlying bytes will be in the
                  requested output format.
                type: string
                format: binary
        # Any other status falls back to the shared error response.
        default:
          $ref: "#/components/responses/Error"
  # Text-to-speech, returned as a stream of Server-Sent Events.
  "/tts/sse":
    post:
      summary: Stream Speech (Server-Sent Events)
      description: |-
        Generate audio from a transcript using a given voice and model. The audio is streamed out as Server-Sent Events.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      requestBody:
        $ref: "#/components/requestBodies/TTSRequest"
      responses:
        "200":
          description: A stream of audio data.
          content:
            text/event-stream:
              schema:
                # Each event is one of the three payloads below. They are told
                # apart by validation alone: `status_code` is 206 for chunks,
                # 200 for the final "done" event, and 400-599 for errors.
                # No `discriminator` is declared: `status_code` is an integer
                # (and a range for errors), so its value can never match a
                # schema name or a discriminator mapping key.
                oneOf:
                  - $ref: "#/components/schemas/JSONChunkResponse"
                  - $ref: "#/components/schemas/JSONDoneResponse"
                  - $ref: "#/components/schemas/JSONErrorResponse"
        # Errors raised before streaming starts use the shared error response.
        default:
          $ref: "#/components/responses/Error"

  # Voice collection: list existing voices or create a new one.
  "/voices":
    get:
      summary: List Voices
      description: |-
        List all voices available to the user, that is, public voices and the user's own voices.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      responses:
        "200":
          description: List of voices.
          content:
            application/json:
              schema:
                title: Voice List
                type: array
                items:
                  $ref: "#/components/schemas/Voice"
        # Non-success outcomes use the shared error response, matching every
        # other operation in this document.
        default:
          $ref: "#/components/responses/Error"
    post:
      summary: Create Voice
      description: |-
        Create a new voice with a given name, description, and embedding.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              # All three fields must be supplied when creating a voice.
              required:
                - name
                - description
                - embedding
              properties:
                name:
                  $ref: "#/components/schemas/VoiceName"
                description:
                  $ref: "#/components/schemas/VoiceDescription"
                embedding:
                  $ref: "#/components/schemas/VoiceEmbedding"
      responses:
        "201":
          description: The created voice.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Voice"
        default:
          $ref: "#/components/responses/Error"

  # Operations on a single voice, addressed by the `id` path segment.
  "/voices/{id}":
    get:
      summary: Get Voice
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
        - name: id
          in: path
          required: true
          schema:
            $ref: "#/components/schemas/VoiceId"
      responses:
        "200":
          description: The voice.
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Voice"
        default:
          $ref: "#/components/responses/Error"
    delete:
      summary: Delete Voice
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
        - name: id
          in: path
          required: true
          schema:
            $ref: "#/components/schemas/VoiceId"
      responses:
        # Successful deletion returns no body.
        "204":
          description: Voice deleted.
        default:
          $ref: "#/components/responses/Error"

  # Voice cloning from an uploaded audio file.
  "/voices/clone/clip":
    post:
      summary: Clone Voice (Clip)
      description: |-
        Clones a voice from an audio clip uploaded as a file. The clip is uploaded using multipart/form-data with a `clip` field containing the audio file.
      operationId: clone_voice_from_clip
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
      requestBody:
        required: true
        content:
          multipart/form-data:
            schema:
              type: object
              # The form must carry the `clip` part; without this list an
              # empty form would satisfy the schema.
              required:
                - clip
              properties:
                clip:
                  type: string
                  format: binary
      responses:
        "200":
          $ref: "#/components/responses/VoiceEmbeddingResponse"
        default:
          $ref: "#/components/responses/Error"

  # Voice cloning from a remote video, passed as the `link` query parameter.
  "/voices/clone/url":
    post:
      summary: Clone Voice (URL)
      description: |-
        Clone a voice from an online URL. Currently only supports YouTube.
      parameters:
        - $ref: "#/components/parameters/Cartesia-Version"
        - name: link
          in: query
          required: true
          schema:
            title: Link
            type: string
            description: The URL of the video to clone the voice from.
            example: "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
      responses:
        "200":
          $ref: "#/components/responses/VoiceEmbeddingResponse"
        default:
          $ref: "#/components/responses/Error"

  # Root endpoint: version and status probe.
  "/":
    get:
      # The empty list overrides the document-level security requirement,
      # so this operation declares no API key requirement.
      security: []
      summary: API Status and Version
      description: >-
        Returns the server's version and a status message which can be
        useful for sanity checking.
      responses:
        "200":
          description: Everything OK.
          content:
            application/json:
              schema:
                type: object
                properties:
                  version:
                    title: version
                    type: string
                  status:
                    title: ok
                    type: boolean
        default:
          $ref: "#/components/responses/Error"

components:
  securitySchemes:
    # API key authentication: the key is sent in the X-API-Key request header.
    APIKeyHeader:
      type: apiKey
      name: X-API-Key
      in: header

  parameters:
    # Mandatory request header naming the API version the client targets.
    Cartesia-Version:
      name: Cartesia-Version
      in: header
      required: true
      description: The version of the Cartesia API to use.
      schema:
        type: string
      # Quoted so the date-shaped value stays a string.
      example: "2024-06-10"

  requestBodies:
    # JSON body shared by the text-to-speech endpoints that reference this
    # component: what to say, which voice and model to use, and the audio
    # format to return.
    TTSRequest:
      required: true
      content:
        application/json:
          schema:
            title: TTS Request
            type: object
            required:
              - model_id
              - transcript
              - voice
              - output_format
            properties:
              model_id:
                title: Model ID
                type: string
                example: sonic-english
              transcript:
                title: Transcript
                type: string
                example: "Hello, world! I'm generating audio on Cartesia."
              # Optional: not listed in `required` above.
              duration:
                title: Duration
                description: The maximum duration of the audio in seconds.
                type: integer
              voice:
                title: Voice
                description: The voice to use for the speech. Can be either an ID or an embedding, specified by the `mode` field.
                # Exactly one variant must match. Each variant requires `mode`
                # plus its own payload field, so an object without `mode`
                # (which would otherwise satisfy both branches) or one with
                # `mode` but no payload is rejected.
                oneOf:
                  - type: object
                    required:
                      - mode
                      - id
                    properties:
                      mode:
                        type: string
                        enum: [id]
                      id:
                        "$ref": "#/components/schemas/VoiceId"
                    example:
                      mode: "id"
                      # Anchored for reuse; check for `*voiceID` aliases
                      # elsewhere in the file before renaming or removing.
                      id: &voiceID "a0e99841-438c-4a64-b679-ae501e7d6091"
                  - type: object
                    required:
                      - mode
                      - embedding
                    properties:
                      mode:
                        type: string
                        enum: [embedding]
                      embedding:
                        "$ref": "#/components/schemas/VoiceEmbedding"
                    example:
                      mode: "embedding"
                      # Anchored for reuse; check for `*voiceEmbedding` aliases
                      # elsewhere in the file before renaming or removing.
                      embedding:
                        &voiceEmbedding [
                          -0.033633083,
                          0.072083704,
                          -0.01807767,
                          -0.083488315,
                          -0.04407617,
                          0.0022592682,
                          0.070505895,
                          0.023946615,
                          -0.04788024,
                          -0.06388413,
                          -0.0716355,
                          -0.0022612812,
                          -0.0053448505,
                          -0.07848381,
                          0.0348162,
                          -0.053745482,
                          -0.092399485,
                          -0.02950225,
                          0.028591828,
                          -0.10556894,
                          0.023313355,
                          0.06224387,
                          0.0362463,
                          0.029258432,
                          0.10769641,
                          0.043595582,
                          -0.058543224,
                          -0.080402784,
                          -0.0953816,
                          -0.008988032,
                          -0.0028981369,
                          -0.004752721,
                          -0.20742874,
                          0.058907595,
                          0.08813939,
                          -0.06192675,
                          0.099082634,
                          -0.09661578,
                          -0.0077761724,
                          -0.013982456,
                          -0.025798267,
                          0.04467142,
                          0.026222011,
                          0.023023574,
                          0.011227064,
                          -0.17462021,
                          -0.09880612,
                          -0.1521035,
                          -0.060464993,
                          -0.04735665,
                          -0.09725187,
                          -0.006127679,
                          0.15818526,
                          -0.039493002,
                          -0.067719474,
                          0.0066190436,
                          -0.10636633,
                          0.17073768,
                          -0.051717706,
                          0.03186961,
                          -0.020547207,
                          -0.02244247,
                          0.013196935,
                          -0.06431055,
                          -0.115360335,
                          0.016918058,
                          -0.033195216,
                          0.11255181,
                          0.020366343,
                          -0.041032124,
                          0.08780918,
                          -0.040567942,
                          0.057276532,
                          0.05848221,
                          -0.077479474,
                          -0.073524915,
                          -0.01913317,
                          -0.029291833,
                          0.11210393,
                          -0.09859328,
                          0.2152541,
                          -0.022976823,
                          0.028627992,
                          -0.039598297,
                          0.041829932,
                          -0.05593181,
                          -0.06444655,
                          -0.018057477,
                          -0.008098263,
                          0.05994528,
                          0.10430693,
                          -0.13121894,
                          -0.06512868,
                          -0.026126215,
                          0.046727825,
                          -0.17180993,
                          -0.10577226,
                          -0.08610466,
                          0.008862588,
                          0.09547498,
                          -0.010965332,
                          -0.061217085,
                          -0.038954042,
                          0.019930292,
                          -0.017192135,
                          0.007296275,
                          0.03273872,
                          0.04389937,
                          -0.056483064,
                          0.003420891,
                          -0.10319067,
                          -0.015706042,
                          0.1308774,
                          -0.0018035866,
                          -0.03582506,
                          0.077131025,
                          0.013398928,
                          0.003188886,
                          0.12039741,
                          -0.033974767,
                          0.06899378,
                          -0.059775922,
                          -0.026934423,
                          0.028482193,
                          0.100996524,
                          0.004498743,
                          -0.02291186,
                          0.078752205,
                          -0.0063796206,
                          0.04206536,
                          0.05721349,
                          0.06290694,
                          0.06130212,
                          0.096969016,
                          -0.057664312,
                          -0.16727506,
                          -0.035220966,
                          0.090760484,
                          0.010039947,
                          0.06513242,
                          0.011055657,
                          -0.004258431,
                          -0.08316792,
                          -0.15650468,
                          -0.076931365,
                          0.11385587,
                          -0.038372636,
                          0.015648656,
                          -0.12029895,
                          -0.06604956,
                          0.009441591,
                          -0.11912808,
                          0.013378132,
                          0.029525978,
                          -0.0056742397,
                          -0.0075976513,
                          0.019999338,
                          -0.05521377,
                          -0.07650746,
                          -0.017710293,
                          -0.033986397,
                          -0.047768556,
                          0.13857274,
                          0.099290825,
                          0.11736938,
                          0.017834296,
                          -0.07140237,
                          -0.052047748,
                          -0.06398965,
                          -0.037033975,
                          -0.061061256,
                          -0.03330076,
                          -0.024472248,
                          -0.059656,
                          0.05359946,
                          -0.043915518,
                          -0.086325996,
                          0.14189173,
                          0.021086395,
                          0.02945159,
                          0.1029604,
                          0.018490415,
                          -0.028736332,
                          -0.025272416,
                          -0.06082937,
                          -0.031339463,
                          -0.0007249595,
                          0.025595888,
                          0.007144545,
                          -0.16938712,
                          -0.1160664,
                          -0.0654145,
                        ]
                # NOTE(review): OpenAPI 3.0 states that inline schemas are not
                # considered by a discriminator, so tools may ignore this; the
                # `mode` enums above are what separate the variants during
                # validation. Confirm before relying on it for code generation.
                discriminator:
                  propertyName: mode
              output_format:
                title: Output Format
                type: object
                required: [container, encoding, sample_rate]
                properties:
                  container:
                    type: string
                    enum: ["raw"]
                  encoding:
                    type: string
                    enum: ["pcm_s16le", "pcm_f32le", "pcm_mulaw", "pcm_alaw"]
                  sample_rate:
                    type: integer
                    enum: [8000, 16000, 22050, 24000, 44100]
              # Optional: not listed in `required` above.
              language:
                title: Language
                type: string
                enum: ["en", "es", "fr", "de", "pt", "zh", "ja"]
                example: "en"

  responses:
    # Shared error response: a plain-text description of what went wrong.
    Error:
      description: An error response.
      content:
        text/plain:
          schema:
            type: string
            title: Error Description
            example: error in voice

    # Shared success response: a JSON object wrapping a single embedding.
    VoiceEmbeddingResponse:
      description: The voice embedding.
      content:
        application/json:
          schema:
            title: Voice Embedding
            type: object
            required:
              - embedding
            properties:
              embedding:
                $ref: "#/components/schemas/VoiceEmbedding"

  schemas:
    # Timestamp carried as a string; no `format` is declared here.
    Timestamp:
      type: string
      title: Timestamp

    # A stored voice. Every property below is required.
    Voice:
      type: object
      required:
        - id
        - user_id
        - is_public
        - name
        - description
        - created_at
        - embedding
      properties:
        id:
          $ref: "#/components/schemas/VoiceId"
        user_id:
          type: string
        is_public:
          type: boolean
        name:
          $ref: "#/components/schemas/VoiceName"
        description:
          $ref: "#/components/schemas/VoiceDescription"
        created_at:
          $ref: "#/components/schemas/Timestamp"
        embedding:
          $ref: "#/components/schemas/VoiceEmbedding"

    # Free-text description of a voice.
    VoiceDescription:
      type: string
      title: Voice Description
      example: "A deep, rich, male voice with an Indian accent."

    # Voice identifier. The example is a YAML alias of the `&voiceID` anchor
    # defined earlier in this file.
    VoiceId:
      type: string
      title: Voice ID
      example: *voiceID

    # Voice embedding as an array of numbers. The example is a YAML alias of
    # the `&voiceEmbedding` anchor defined earlier in this file.
    VoiceEmbedding:
      title: Voice Embedding
      type: array
      items:
        type: number
      example: *voiceEmbedding

    # Display name of a voice. The title is spaced like the sibling schema
    # titles ("Voice ID", "Voice Description", "Voice Embedding").
    VoiceName:
      title: Voice Name
      type: string
      example: "Barbershop Man"

    # Binary file payload.
    # NOTE(review): no `$ref` to this schema appears in the visible part of
    # the spec; confirm whether it is still needed.
    Clip:
      type: string
      format: binary
      title: Clip

    # Fields common to the JSON payloads of the streaming response schemas
    # that include this one through `allOf`.
    JSONBaseResponse:
      title: JSON Base Response
      type: object
      properties:
        # `&statusCode` and `&done` are YAML anchors; check for `*statusCode`
        # / `*done` aliases elsewhere in the file before renaming or removing.
        status_code: &statusCode
          title: Status Code
          type: integer
          description: HTTP status code to allow for error handling. This is for errors encountered while streaming the response.
        done: &done
          title: Done
          type: boolean
          description: Whether the model has finished generating audio.

    # Streaming payload carrying one chunk of audio: status 206, not done.
    JSONChunkResponse:
      title: JSON Chunk Response
      type: object
      allOf:
        - $ref: "#/components/schemas/JSONBaseResponse"
        - type: object
          properties:
            # title/type/description are written out in full (same values as
            # in JSONBaseResponse) instead of using a `<<` merge key, which is
            # a YAML 1.1 extension that strict YAML 1.2 loaders do not expand.
            status_code:
              title: Status Code
              type: integer
              description: HTTP status code to allow for error handling. This is for errors encountered while streaming the response.
              enum: [206]
            done:
              title: Done
              type: boolean
              description: Whether the model has finished generating audio.
              enum: [false]
            data:
              title: Data
              type: string
              format: byte
              description: Binary audio data encoded as base64. The underlying bytes will be in the requested output format.
            step_time:
              title: Step Time
              type: integer
              description: The time in milliseconds that the model took to generate this chunk of audio.

    # Streaming payload marking successful completion: status 200, done.
    JSONDoneResponse:
      title: JSON Done Response
      type: object
      allOf:
        - $ref: "#/components/schemas/JSONBaseResponse"
        - type: object
          properties:
            # title/type/description are written out in full (same values as
            # in JSONBaseResponse) instead of using a `<<` merge key, which is
            # a YAML 1.1 extension that strict YAML 1.2 loaders do not expand.
            status_code:
              title: Status Code
              type: integer
              description: HTTP status code to allow for error handling. This is for errors encountered while streaming the response.
              enum: [200]
            done:
              title: Done
              type: boolean
              description: Whether the model has finished generating audio.
              enum: [true]

    # Streaming payload reporting a failure: status 400-599, done, plus a
    # human-readable message.
    JSONErrorResponse:
      title: JSON Error Response
      type: object
      allOf:
        - $ref: "#/components/schemas/JSONBaseResponse"
        - type: object
          properties:
            # title/type/description are written out in full (same values as
            # in JSONBaseResponse) instead of using a `<<` merge key, which is
            # a YAML 1.1 extension that strict YAML 1.2 loaders do not expand.
            status_code:
              title: Status Code
              type: integer
              description: HTTP status code to allow for error handling. This is for errors encountered while streaming the response.
              minimum: 400
              maximum: 599
            done:
              title: Done
              type: boolean
              description: Whether the model has finished generating audio.
              enum: [true]
            error:
              title: Error
              type: string
              description: A human-readable error message.