# clusterloader2/testing/load/modules/measurements.yaml
## The measurements module defines test-scoped measurements.
## Input params
# Valid actions: "start", "gather"
{{$action := .action}}
## Feature gates and tunables.
# Each variable below is filled in through DefaultParam: the first argument is
# the override key read from the template input, the second is the fallback
# literal. Some override keys (e.g. CUSTOM_API_CALL_THRESHOLDS,
# ENABLE_RESTART_COUNT_CHECK, PROMETHEUS_SCRAPE_*) have no CL2_ prefix; they
# are kept exactly as they are so existing overrides continue to apply.
# The variables are grouped by the measurement that consumes them in `steps`.

# --- APIResponsivenessPrometheus / APIResponsivenessPrometheusSimple ---
{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}}
{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS false}}
{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}}
{{$ALLOWED_SLOW_API_CALLS := DefaultParam .CL2_ALLOWED_SLOW_API_CALLS 0}}
{{$CUSTOM_API_CALL_THRESHOLDS := DefaultParam .CUSTOM_API_CALL_THRESHOLDS ""}}

# --- APIAvailability ---
{{$ENABLE_API_AVAILABILITY_MEASUREMENT := DefaultParam .CL2_ENABLE_API_AVAILABILITY_MEASUREMENT false}}
{{$API_AVAILABILITY_PERCENTAGE_THRESHOLD := DefaultParam .CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD 99.5}}

# --- InClusterNetworkLatency / NodeLocalDNSLatency / SLOMeasurement ---
{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}}
{{$ENABLE_NODE_LOCAL_DNS_LATENCY := DefaultParam .CL2_ENABLE_NODE_LOCAL_DNS_LATENCY false}}
{{$NODE_LOCAL_DNS_LATENCY_THRESHOLD := DefaultParam .CL2_NODE_LOCAL_DNS_LATENCY_THRESHOLD "5s"}}
{{$ENABLE_SLO_MEASUREMENT := DefaultParam .CL2_ENABLE_SLO_MEASUREMENT true}}
# Probe measurements shared parameter (passed as checkProbesReadyTimeout).
{{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT := DefaultParam .CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT "15m"}}

# --- Measurements gated on what Prometheus scrapes ---
# NetworkProgrammingLatency, KubeStateMetricsLatency and MetricsServerPrometheus
# are each emitted only when the matching flag below is true.
{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}}
{{$PROMETHEUS_SCRAPE_KUBE_STATE_METRICS := DefaultParam .PROMETHEUS_SCRAPE_KUBE_STATE_METRICS false}}
{{$PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS := DefaultParam .PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS false}}

# --- ContainerRestarts / ContainerCPU / ContainerMemory ---
{{$ENABLE_CONTAINER_RESTARTS_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESTARTS_MEASUREMENT false}}
{{$ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_ALLOWED_CONTAINER_RESTARTS 1}}
{{$CUSTOM_ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_CUSTOM_ALLOWED_CONTAINER_RESTARTS ""}}
{{$ENABLE_CONTAINER_RESOURCES_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESOURCES_MEASUREMENT false}}

# --- Terminated watches / watch cache initializations, quota usage ---
{{$ENABLE_TERMINATED_WATCHES_MEASUREMENT := DefaultParam .CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT false}}
{{$ENABLE_QUOTAS_USAGE_MEASUREMENT := DefaultParam .CL2_ENABLE_QUOTAS_USAGE_MEASUREMENT false}}

# --- CiliumEndpointPropagationDelay ---
{{$ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT := DefaultParam .CL2_ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT false}}
{{$CEP_PROPAGATION_DELAY_SLO_BUCKET := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_BUCKET 600}}
{{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_PERCENTILE 95.0}}

# --- TestMetrics ---
{{$ENABLE_SYSTEM_POD_METRICS := DefaultParam .ENABLE_SYSTEM_POD_METRICS true}}
{{$ENABLE_CLUSTER_OOMS_TRACKER := DefaultParam .CL2_ENABLE_CLUSTER_OOMS_TRACKER true}}
{{$CLUSTER_OOMS_IGNORED_PROCESSES := DefaultParam .CL2_CLUSTER_OOMS_IGNORED_PROCESSES ""}}
{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}}
{{$RESTART_COUNT_THRESHOLD_OVERRIDES := DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}}
steps:
# Step 1: a single step that passes the requested action ("start" or "gather")
# to every enabled measurement. The rendered step name reads
# "starting measurements" / "gathering measurements".
# Template control lines (if/end) are kept at column 0; they only decide
# whether the enclosed lines are emitted.
- name: "{{$action}}ing measurements"
  measurements:
  # API call latency. The violation settings and thresholds are only passed
  # when $USE_SIMPLE_LATENCY_QUERY is false; otherwise just the action is sent.
  - Identifier: APIResponsivenessPrometheus
    Method: APIResponsivenessPrometheus
    Params:
      action: {{$action}}
{{if not $USE_SIMPLE_LATENCY_QUERY}}
      enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS}}
      allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}}
      customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}}
{{end}}
  # Same method, always run with useSimpleLatencyQuery: true and reported under
  # its own summary name so it does not collide with the measurement above.
  - Identifier: APIResponsivenessPrometheusSimple
    Method: APIResponsivenessPrometheus
    Params:
      action: {{$action}}
      enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}}
      useSimpleLatencyQuery: true
      summaryName: APIResponsivenessPrometheus_simple
      allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}}
      customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}}
  # Pod startup latency for pods selected by "group = load". The threshold is
  # deliberately very loose (see the TODO on the threshold line).
  - Identifier: CreatePhasePodStartupLatency
    Method: PodStartupLatency
    Params:
      action: {{$action}}
      labelSelector: group = load
      threshold: 1h # TODO(https://github.com/kubernetes/perf-tests/issues/1024): Ideally, this should be 5s
  # Emitted only when $ENABLE_IN_CLUSTER_NETWORK_LATENCY is true.
  # replicasPerProbe is 2 plus the node count divided by 100.
{{if $ENABLE_IN_CLUSTER_NETWORK_LATENCY}}
  - Identifier: InClusterNetworkLatency
    Method: InClusterNetworkLatency
    Params:
      action: {{$action}}
      checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}}
      replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
{{end}}
  # Emitted only when $ENABLE_NODE_LOCAL_DNS_LATENCY is true; violations are
  # always enabled for it.
{{if $ENABLE_NODE_LOCAL_DNS_LATENCY}}
  - Identifier: NodeLocalDNSLatency
    Method: NodeLocalDNSLatencyPrometheus
    Params:
      action: {{$action}}
      enableViolations: true
      threshold: {{$NODE_LOCAL_DNS_LATENCY_THRESHOLD}}
{{end}}
  # Emitted only when $ENABLE_SLO_MEASUREMENT is true. Uses the same probe
  # parameters as InClusterNetworkLatency.
{{if $ENABLE_SLO_MEASUREMENT}}
  - Identifier: SLOMeasurement
    Method: SLOMeasurement
    Params:
      action: {{$action}}
      checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}}
      replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
{{end}}
  # The next three measurements are each emitted only when the matching
  # $PROMETHEUS_SCRAPE_* flag is true.
{{if $PROMETHEUS_SCRAPE_KUBE_PROXY}}
  - Identifier: NetworkProgrammingLatency
    Method: NetworkProgrammingLatency
    Params:
      action: {{$action}}
{{end}}
{{if $PROMETHEUS_SCRAPE_KUBE_STATE_METRICS}}
  - Identifier: KubeStateMetricsLatency
    Method: KubeStateMetricsLatency
    Params:
      action: {{$action}}
{{end}}
{{if $PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS}}
  - Identifier: MetricsServerPrometheus
    Method: MetricsServerPrometheus
    Params:
      action: {{$action}}
{{end}}
  # Emitted only when $ENABLE_API_AVAILABILITY_MEASUREMENT is true. Poll
  # frequency and host poll timeout are fixed here; only the percentage
  # threshold is configurable.
{{if $ENABLE_API_AVAILABILITY_MEASUREMENT}}
  - Identifier: APIAvailability
    Method: APIAvailability
    Params:
      action: {{$action}}
      pollFrequency: "5s"
      hostPollTimeoutSeconds: 5
      threshold: {{$API_AVAILABILITY_PERCENTAGE_THRESHOLD}}
{{end}}
  # Emitted only when $ENABLE_CONTAINER_RESTARTS_MEASUREMENT is true.
{{if $ENABLE_CONTAINER_RESTARTS_MEASUREMENT}}
  - Identifier: ContainerRestarts
    Method: ContainerRestarts
    Params:
      action: {{$action}}
      enableViolations: true
      defaultAllowedRestarts: {{$ALLOWED_CONTAINER_RESTARTS}}
      customAllowedRestarts: {{YamlQuote $CUSTOM_ALLOWED_CONTAINER_RESTARTS 4}}
{{end}}
  # Per-container CPU and memory, emitted only when
  # $ENABLE_CONTAINER_RESOURCES_MEASUREMENT is true. Each reports the 99th,
  # 90th and 50th percentile of a value summed by container.
  # NOTE(review): the %v placeholder in the queries is presumably filled in by
  # GenericPrometheusQuery (likely with the measured time window) - confirm.
{{if $ENABLE_CONTAINER_RESOURCES_MEASUREMENT}}
  - Identifier: ContainerCPU
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Container CPU
      metricVersion: v1
      unit: cores
      dimensions:
      - container
      queries:
      - name: Perc99
        query: quantile_over_time(0.99, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
      - name: Perc90
        query: quantile_over_time(0.90, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
      - name: Perc50
        query: quantile_over_time(0.50, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
  # Working-set bytes are divided by 1024 twice, hence the MiB unit.
  - Identifier: ContainerMemory
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Container Memory
      metricVersion: v1
      unit: MiB
      dimensions:
      - container
      queries:
      - name: Perc99
        query: quantile_over_time(0.99, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
      - name: Perc90
        query: quantile_over_time(0.90, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
      - name: Perc50
        query: quantile_over_time(0.50, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
{{end}}
  # Terminated watchers and watch cache initializations, both summed by
  # resource. Emitted only when $ENABLE_TERMINATED_WATCHES_MEASUREMENT is true.
{{if $ENABLE_TERMINATED_WATCHES_MEASUREMENT}}
  - Identifier: TerminatedWatchesMetrics
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Terminated Watches
      metricVersion: v1
      dimensions:
      - resource
      queries:
      - name: Terminated watches
        query: sum(increase(apiserver_terminated_watchers_total[%v:])) by (resource)
  - Identifier: WatchCacheInitializations
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Watch Cache Initializations
      metricVersion: v1
      dimensions:
      - resource
      queries:
      - name: Watch cache reinitializations
        query: sum(increase(apiserver_watch_cache_initializations_total[%v:])) by (resource)
{{end}}
  # Quota usage per quota_metric, emitted only when
  # $ENABLE_QUOTAS_USAGE_MEASUREMENT is true. It selects the "managed"
  # prometheusClient, and both queries multiply a per-second rate by 60 to
  # match the QPMs unit.
{{if $ENABLE_QUOTAS_USAGE_MEASUREMENT}}
  - Identifier: Quotas total usage
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Quota usage
      metricVersion: v1
      prometheusClient: managed
      unit: QPMs
      dimensions:
      - quota_metric
      queries:
      - name: perc99
        query: quantile_over_time(0.99, sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60
      - name: max
        query: max_over_time(sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60
{{end}}
  # Emitted only when $ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT is true;
  # violations are always enabled for it.
{{if $ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT}}
  - Identifier: CiliumEndpointPropagationDelay
    Method: CiliumEndpointPropagationDelay
    Params:
      action: {{$action}}
      bucketSLO: {{$CEP_PROPAGATION_DELAY_SLO_BUCKET}}
      percentileSLO: {{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE}}
      enableViolations: true
{{end}}
  # Always emitted. Carries the system pod metrics, cluster OOM tracker and
  # restart count check settings.
  - Identifier: TestMetrics
    Method: TestMetrics
    Params:
      action: {{$action}}
      systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}}
      clusterOOMsIgnoredProcesses: {{YamlQuote $CLUSTER_OOMS_IGNORED_PROCESSES 4}}
      clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}}
      restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}}
      enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}}
# Step 2: include the DNS performance metrics module, passing the same action.
- module:
    path: modules/dns-performance-metrics.yaml
    params:
      action: {{$action}}