# This is an example yaml template that can be used to define a service's SLIs and SLOs
# as code, which will be visible in Datadog.
# Once you update and complete this yaml template, please add it to your service's repository.
#
# Please be sure to edit/update each attribute to your application's specifications.
#
# If an attribute requires specific values, all possibilities will be listed as a comment
# above it.
# If you are unsure what metrics your service supports or should be logging, please refer to
# the README for instructions on how to generate a list of all supported metrics for your
# service.
#
# Mustache templating can be used anywhere. NOTE: If a template is not present on the CLI,
# it will default to an empty string.

business_unit: devops  # should match business_unit in the service's deployment.yaml

# Include any tags that would be valuable; tags here will be appended to all SLOs and SLIs
# below
# tags:
#   - tag1
#   - tag2

# First slo definition
slos:
  # short_name is the terraform resource name; must be unique within this file
  - short_name: test-slo-converter-slo-1
    name: test slo converter slo 1  # name of SLO within Datadog
    # Default threshold target is "99.5" when omitted; however acceptable values are
    # floats > 0 and < 100
    # threshold:
    #   target: 99.5

    # Monitor definition:
    monitors:
      - type: error_budget  # Query type
        window: 7_day  # The alerting window size [7_day, 30_day, 90_day]
        alert:
          # A string or a list of strings specifying slack channels to notify for
          # critical errors
          critical:
            - "@slack-temp-channel-1"
            - "{{ critical_alert }}"
          warn: "{{ warn_alert }}"
        threshold:  # Error threshold in percentage
          critical: 100
          warning: 50  # optional

    # First sli definition
    slis:
      # - short_name: test-slo-converter-sli-1  # terraform resource name; must be unique within this file
      #   # Available values are latency, error-rate, and external
      #   # external types accept an optional name and tags filter
      #   # External SLIs are references to existing SLIs that are managed in a separate service.
      #   type: external
      #   service: service-core-bank
      #   # name of SLI (monitor) within Datadog
      #   name_filter: devops-portal test slo converter sli 1 Latency

      # short_name is the terraform resource name; must be unique within this file
      - short_name: test-slo-converter-sli-1
        # SLI name in Datadog.
        # NOTE(review): the placeholders of the original note were lost (it read:
        # becomes " "; or " " when omitted) - presumably "<service> <name>", or
        # "<service> <short_name>" when omitted; confirm against the README.
        name: test slo converter sli 1
        # Update slack message content with additional details or insight to sli.
        # This becomes the monitor message body.
        # NOTE(review): the line breaks inside the message blocks of this file were
        # inferred from the text; confirm against the expected monitor message.
        message: |-
          This is the message text for this monitor.

          This is an example of line two with a blank line between.
          @slack-temp-slo-monitoring
        # Service name will become a tag on all SLIs and SLOs, but this field should match
        # the FigureDeployment service name so SLOs are included on the Datadog APM page.
        service: devops-portal
        # Available values are latency, error-rate, and external
        # latency types accept a percentile and metric field
        type: latency
        # Default percentile is p95 when omitted; however other options are
        # p50, p75, p90, p95, p99
        # percentile: p95
        # Source trace that contains the resources below; supported metrics include
        # trace.http.request.*, trace.servlet.request.*, trace.grpc.server.*,
        # and trace.kafka.consume.*
        metric: trace.http.request
        # Enter/Update resources with those generated using service-resource-generator.sh
        # described in README
        # Resources get joined with "OR"
        resources:
          - post_/api/v1/deployments
          - get_/api/v1/deployments
          - post_/api/v1/deployments/
        # Optional, resources get negated and joined with "AND"
        excluded_resources:
          - get_/health
        # Datadog monitor trigger conditions, in seconds
        threshold:  # in seconds
          warning: 0.8  # optional
          critical: 1
        # Include any tags that would be valuable for this SLI
        tags:
          - tag1
          - tag2

      # Second sli definition
      - short_name: test-slo-converter-sli-2
        # name: test slo converter sli 2
        # message: Here's another message for a second monitor. @slack-temp-slo-monitoring
        service: devops-portal
        # Available values are latency, error-rate, and external
        # error-rate types accept a metric block
        type: error-rate
        # Enter/Update resources with those generated using service-resource-generator.sh
        # described in README
        resources:
          - post_/api/v1/deployments
          - get_/api/v1/deployments
          - post_/api/v1/deployments/
        # Error rate formula is #errors / #total_requests,
        # i.e. 1 / 100 = 0.01, which is a 1% error rate
        metric:
          # Source traces that contain the resources below; supported metrics include
          # trace.http.request.*, trace.servlet.request.*, trace.grpc.server.*,
          # and trace.kafka.consume.*
          numerator: trace.http.request.errors
          denominator: trace.http.request.hits
        threshold:
          # warning: 0.8
          critical: 1
        # tags:
        #   - tag1
        #   - tag2

# First standalone sli definition - slis may exist outside the control of an slo.
# This would be useful for situations where a service contains a listing of slis and there's
# another service that will pull them into an slo as an external reference, like above.
slis:
  # short_name is the terraform resource name; must be unique within this file
  - short_name: test-standalone-slo-converter-sli-1
    # SLI name in Datadog.
    # NOTE(review): the placeholders of the original note were lost (it read:
    # becomes " "; or " " when omitted) - presumably "<service> <name>", or
    # "<service> <short_name>" when omitted; confirm against the README.
    name: test standalone slo converter sli 1
    # Update slack message content with additional details or insight to sli.
    # This becomes the monitor message body.
    message: |-
      This is the message text for this monitor.

      This is an example of line two with a blank line between.
      @slack-temp-slo-monitoring
    # Service name will become a tag on all SLIs and SLOs, but this field should match
    # the FigureDeployment service name so SLOs are included on the Datadog APM page.
    service: devops-portal
    # Available values are latency, error-rate, and external
    # latency types accept a percentile and metric field
    type: latency
    # Default percentile is p95 when omitted; however other options are
    # p50, p75, p90, p95, p99
    # percentile: p95
    # Source trace that contains the resources below; supported metrics include
    # trace.http.request.*, trace.servlet.request.*, trace.grpc.server.*,
    # and trace.kafka.consume.*
    metric: trace.http.request
    # Enter/Update resources with those generated using service-resource-generator.sh
    # described in README
    resources:
      - "!post_/api/v1/deployments"
      - "!get_/api/v1/deployments"
      - post_/api/v1/deployments/
    # Datadog monitor trigger conditions, in seconds
    threshold:
      # warning: 0.8
      critical: 1
    # Include any tags that would be valuable for this SLI
    tags:
      - standalonetag1
      - standalonetag2
---
business_unit: devops  # should match business_unit in the service's deployment.yaml

# Include any tags that would be valuable; tags here will be appended to all SLOs and SLIs
# below
tags:
  - tag1
  - tag2

# Second slo definition
slos:
  - short_name: test-slo-converter-slo-2
    name: test slo converter slo 2
    threshold:
      target: 99.9

    # Third sli definition
    slis:
      - short_name: test-slo-converter-sli-3
        name: test slo converter sli 3
        message: |-
          This is the message text for this monitor.

          This is an example of line two with a blank line between.
          @slack-temp-slo-monitoring
        service: devops-portal
        type: latency
        # percentile: p95
        resources:
          - post_/api/v1/deployments
          - get_/api/v1/deployments
          - post_/api/v1/deployments/
        metric: trace.http.request
        threshold:
          # warning: 0.8
          critical: 1

      # Fourth sli definition
      - short_name: test-slo-converter-sli-4
        name: test slo converter sli 4
        message: "Here's another message for a second monitor. @slack-temp-slo-monitoring"
        service: devops-portal
        type: error-rate
        resources:
          - post_/api/v1/deployments
          - get_/api/v1/deployments
          - post_/api/v1/deployments/
        metric:
          numerator: trace.http.request.errors
          denominator: trace.http.request.hits
        threshold:
          # warning: 0.8
          critical: 1