clusterloader2/testing/load/modules/measurements.yaml (214 lines of code) (raw):

## Test-scoped measurement module.
##
## Emits one step holding every enabled measurement, followed by a nested
## module step for modules/dns-performance-metrics.yaml. The same action is
## forwarded to each measurement and to the nested module.

## Module input
# Valid actions: "start", "gather"
{{$action := .action}}

## Switches that include or exclude whole measurements
{{$ENABLE_API_AVAILABILITY_MEASUREMENT := DefaultParam .CL2_ENABLE_API_AVAILABILITY_MEASUREMENT false}}
{{$ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT := DefaultParam .CL2_ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT false}}
{{$ENABLE_CONTAINER_RESOURCES_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESOURCES_MEASUREMENT false}}
{{$ENABLE_CONTAINER_RESTARTS_MEASUREMENT := DefaultParam .CL2_ENABLE_CONTAINER_RESTARTS_MEASUREMENT false}}
{{$ENABLE_IN_CLUSTER_NETWORK_LATENCY := DefaultParam .CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY true}}
{{$ENABLE_NODE_LOCAL_DNS_LATENCY := DefaultParam .CL2_ENABLE_NODE_LOCAL_DNS_LATENCY false}}
{{$ENABLE_QUOTAS_USAGE_MEASUREMENT := DefaultParam .CL2_ENABLE_QUOTAS_USAGE_MEASUREMENT false}}
{{$ENABLE_SLO_MEASUREMENT := DefaultParam .CL2_ENABLE_SLO_MEASUREMENT true}}
{{$ENABLE_TERMINATED_WATCHES_MEASUREMENT := DefaultParam .CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT false}}
{{$PROMETHEUS_SCRAPE_KUBE_PROXY := DefaultParam .PROMETHEUS_SCRAPE_KUBE_PROXY true}}
{{$PROMETHEUS_SCRAPE_KUBE_STATE_METRICS := DefaultParam .PROMETHEUS_SCRAPE_KUBE_STATE_METRICS false}}
{{$PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS := DefaultParam .PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS false}}

## API responsiveness settings
{{$ALLOWED_SLOW_API_CALLS := DefaultParam .CL2_ALLOWED_SLOW_API_CALLS 0}}
{{$CUSTOM_API_CALL_THRESHOLDS := DefaultParam .CUSTOM_API_CALL_THRESHOLDS ""}}
{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS false}}
{{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE := DefaultParam .CL2_ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE true}}
{{$USE_SIMPLE_LATENCY_QUERY := DefaultParam .USE_SIMPLE_LATENCY_QUERY false}}

## Thresholds and tuning values for the optional measurements
{{$API_AVAILABILITY_PERCENTAGE_THRESHOLD := DefaultParam .CL2_API_AVAILABILITY_PERCENTAGE_THRESHOLD 99.5}}
{{$CEP_PROPAGATION_DELAY_SLO_BUCKET := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_BUCKET 600}}
{{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE := DefaultParam .CL2_CEP_PROPAGATION_DELAY_SLO_PERCENTILE 95.0}}
{{$ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_ALLOWED_CONTAINER_RESTARTS 1}}
{{$CUSTOM_ALLOWED_CONTAINER_RESTARTS := DefaultParam .CL2_CUSTOM_ALLOWED_CONTAINER_RESTARTS ""}}
{{$NODE_LOCAL_DNS_LATENCY_THRESHOLD := DefaultParam .CL2_NODE_LOCAL_DNS_LATENCY_THRESHOLD "5s"}}
# Probe measurements shared parameter
{{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT := DefaultParam .CL2_PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT "15m"}}

## Values handed to the TestMetrics measurement
{{$CLUSTER_OOMS_IGNORED_PROCESSES := DefaultParam .CL2_CLUSTER_OOMS_IGNORED_PROCESSES ""}}
{{$ENABLE_CLUSTER_OOMS_TRACKER := DefaultParam .CL2_ENABLE_CLUSTER_OOMS_TRACKER true}}
{{$ENABLE_RESTART_COUNT_CHECK := DefaultParam .ENABLE_RESTART_COUNT_CHECK true}}
{{$ENABLE_SYSTEM_POD_METRICS := DefaultParam .ENABLE_SYSTEM_POD_METRICS true}}
{{$RESTART_COUNT_THRESHOLD_OVERRIDES := DefaultParam .RESTART_COUNT_THRESHOLD_OVERRIDES ""}}

steps:
# The step name renders as "starting measurements" or "gathering measurements"
# for the two valid actions.
- name: "{{$action}}ing measurements"
  measurements:
  # Always emitted. The violation and threshold params are left out when the
  # simple latency query is selected.
  - Identifier: APIResponsivenessPrometheus
    Method: APIResponsivenessPrometheus
    Params:
      action: {{$action}}
  {{if not $USE_SIMPLE_LATENCY_QUERY}}
      enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS}}
      allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}}
      customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}}
  {{end}}
  # Always emitted. Same method as above, run with useSimpleLatencyQuery and
  # reported under its own summary name.
  - Identifier: APIResponsivenessPrometheusSimple
    Method: APIResponsivenessPrometheus
    Params:
      action: {{$action}}
      enableViolations: {{$ENABLE_VIOLATIONS_FOR_API_CALL_PROMETHEUS_SIMPLE}}
      useSimpleLatencyQuery: true
      summaryName: APIResponsivenessPrometheus_simple
      allowedSlowCalls: {{$ALLOWED_SLOW_API_CALLS}}
      customThresholds: {{YamlQuote $CUSTOM_API_CALL_THRESHOLDS 4}}
  - Identifier: CreatePhasePodStartupLatency
    Method: PodStartupLatency
    Params:
      action: {{$action}}
      labelSelector: group = load
      # TODO(https://github.com/kubernetes/perf-tests/issues/1024): Ideally, this should be 5s
      threshold: 1h
  # Enabled by default (CL2_ENABLE_IN_CLUSTER_NETWORK_LATENCY).
  {{if $ENABLE_IN_CLUSTER_NETWORK_LATENCY}}
  - Identifier: InClusterNetworkLatency
    Method: InClusterNetworkLatency
    Params:
      action: {{$action}}
      checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}}
      replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
  {{end}}
  # Disabled by default (CL2_ENABLE_NODE_LOCAL_DNS_LATENCY).
  {{if $ENABLE_NODE_LOCAL_DNS_LATENCY}}
  - Identifier: NodeLocalDNSLatency
    Method: NodeLocalDNSLatencyPrometheus
    Params:
      action: {{$action}}
      enableViolations: true
      threshold: {{$NODE_LOCAL_DNS_LATENCY_THRESHOLD}}
  {{end}}
  # Enabled by default (CL2_ENABLE_SLO_MEASUREMENT); takes the same probe
  # parameters as InClusterNetworkLatency.
  {{if $ENABLE_SLO_MEASUREMENT}}
  - Identifier: SLOMeasurement
    Method: SLOMeasurement
    Params:
      action: {{$action}}
      checkProbesReadyTimeout: {{$PROBE_MEASUREMENTS_CHECK_PROBES_READY_TIMEOUT}}
      replicasPerProbe: {{AddInt 2 (DivideInt .Nodes 100)}}
  {{end}}
  # The next three measurements are tied to the matching PROMETHEUS_SCRAPE_*
  # parameters.
  {{if $PROMETHEUS_SCRAPE_KUBE_PROXY}}
  - Identifier: NetworkProgrammingLatency
    Method: NetworkProgrammingLatency
    Params:
      action: {{$action}}
  {{end}}
  {{if $PROMETHEUS_SCRAPE_KUBE_STATE_METRICS}}
  - Identifier: KubeStateMetricsLatency
    Method: KubeStateMetricsLatency
    Params:
      action: {{$action}}
  {{end}}
  {{if $PROMETHEUS_SCRAPE_METRICS_SERVER_METRICS}}
  - Identifier: MetricsServerPrometheus
    Method: MetricsServerPrometheus
    Params:
      action: {{$action}}
  {{end}}
  # Disabled by default (CL2_ENABLE_API_AVAILABILITY_MEASUREMENT).
  {{if $ENABLE_API_AVAILABILITY_MEASUREMENT}}
  - Identifier: APIAvailability
    Method: APIAvailability
    Params:
      action: {{$action}}
      pollFrequency: "5s"
      hostPollTimeoutSeconds: 5
      threshold: {{$API_AVAILABILITY_PERCENTAGE_THRESHOLD}}
  {{end}}
  # Disabled by default (CL2_ENABLE_CONTAINER_RESTARTS_MEASUREMENT).
  {{if $ENABLE_CONTAINER_RESTARTS_MEASUREMENT}}
  - Identifier: ContainerRestarts
    Method: ContainerRestarts
    Params:
      action: {{$action}}
      enableViolations: true
      defaultAllowedRestarts: {{$ALLOWED_CONTAINER_RESTARTS}}
      customAllowedRestarts: {{YamlQuote $CUSTOM_ALLOWED_CONTAINER_RESTARTS 4}}
  {{end}}
  # Disabled by default (CL2_ENABLE_CONTAINER_RESOURCES_MEASUREMENT).
  # Per-container CPU and memory at the 99th, 90th and 50th percentiles.
  # NOTE(review): the %v placeholder in each query is presumably filled in by
  # GenericPrometheusQuery with the measured time window - confirm.
  {{if $ENABLE_CONTAINER_RESOURCES_MEASUREMENT}}
  - Identifier: ContainerCPU
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Container CPU
      metricVersion: v1
      unit: cores
      dimensions:
      - container
      queries:
      - name: Perc99
        query: quantile_over_time(0.99, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
      - name: Perc90
        query: quantile_over_time(0.90, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
      - name: Perc50
        query: quantile_over_time(0.50, sum by (container) (rate(container_cpu_usage_seconds_total[1m]))[%v:])
  - Identifier: ContainerMemory
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Container Memory
      metricVersion: v1
      unit: MiB
      dimensions:
      - container
      queries:
      - name: Perc99
        query: quantile_over_time(0.99, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
      - name: Perc90
        query: quantile_over_time(0.90, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
      - name: Perc50
        query: quantile_over_time(0.50, sum by (container) (container_memory_working_set_bytes / 1024 / 1024)[%v:])
  {{end}}
  # Disabled by default (CL2_ENABLE_TERMINATED_WATCHES_MEASUREMENT).
  # Both queries are summed per resource.
  {{if $ENABLE_TERMINATED_WATCHES_MEASUREMENT}}
  - Identifier: TerminatedWatchesMetrics
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Terminated Watches
      metricVersion: v1
      dimensions:
      - resource
      queries:
      - name: Terminated watches
        query: sum(increase(apiserver_terminated_watchers_total[%v:])) by (resource)
  - Identifier: WatchCacheInitializations
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Watch Cache Initializations
      metricVersion: v1
      dimensions:
      - resource
      queries:
      - name: Watch cache reinitializations
        query: sum(increase(apiserver_watch_cache_initializations_total[%v:])) by (resource)
  {{end}}
  # Disabled by default (CL2_ENABLE_QUOTAS_USAGE_MEASUREMENT).
  # The only measurement here that sets prometheusClient (to "managed").
  {{if $ENABLE_QUOTAS_USAGE_MEASUREMENT}}
  - Identifier: Quotas total usage
    Method: GenericPrometheusQuery
    Params:
      action: {{$action}}
      metricName: Quota usage
      metricVersion: v1
      prometheusClient: managed
      unit: QPMs
      dimensions:
      - quota_metric
      queries:
      - name: perc99
        query: quantile_over_time(0.99, sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60
      - name: max
        query: max_over_time(sum by (quota_metric) (irate(serviceruntime_googleapis_com:quota_rate_net_usage{monitored_resource="consumer_quota"}[1m]))[%v:]) * 60
  {{end}}
  # Disabled by default (CL2_ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT).
  {{if $ENABLE_CEP_PROPAGATION_DELAY_MEASUREMENT}}
  - Identifier: CiliumEndpointPropagationDelay
    Method: CiliumEndpointPropagationDelay
    Params:
      action: {{$action}}
      bucketSLO: {{$CEP_PROPAGATION_DELAY_SLO_BUCKET}}
      percentileSLO: {{$CEP_PROPAGATION_DELAY_SLO_PERCENTILE}}
      enableViolations: true
  {{end}}
  # Always emitted.
  - Identifier: TestMetrics
    Method: TestMetrics
    Params:
      action: {{$action}}
      systemPodMetricsEnabled: {{$ENABLE_SYSTEM_POD_METRICS}}
      clusterOOMsIgnoredProcesses: {{YamlQuote $CLUSTER_OOMS_IGNORED_PROCESSES 4}}
      clusterOOMsTrackerEnabled: {{$ENABLE_CLUSTER_OOMS_TRACKER}}
      restartCountThresholdOverrides: {{YamlQuote $RESTART_COUNT_THRESHOLD_OVERRIDES 4}}
      enableRestartCountCheck: {{$ENABLE_RESTART_COUNT_CHECK}}
# Nested module step; it receives the same action as the measurements above.
- module:
    path: modules/dns-performance-metrics.yaml
    params:
      action: {{$action}}