ai-ml/mix-train-and-inference/kueue/patch.yaml (52 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Overview: a ConfigMap named kueue-manager-config whose single data key,
# controller_manager_config.yaml, stores a Kueue Configuration document
# (config.kueue.x-k8s.io/v1beta1) as a literal block scalar. Everything
# indented under the `|` indicator, including the lines that begin with `#`,
# is part of the stored string rather than a comment of this manifest.
# The embedded configuration lists "pod" among the integration frameworks and
# sets podOptions so that the kube-system and kueue-system namespaces are
# excluded and only pods labelled kueue-job in ("true", "True", "yes") are
# selected.

# [START gke_ai_ml_mix_train_and_inference_kueue_patch]
apiVersion: v1
kind: ConfigMap
metadata:
  name: kueue-manager-config
data:
  controller_manager_config.yaml: |
    apiVersion: config.kueue.x-k8s.io/v1beta1
    kind: Configuration
    health:
      healthProbeBindAddress: :8081
    metrics:
      bindAddress: :8080
    # enableClusterQueueResources: true
    webhook:
      port: 9443
    leaderElection:
      leaderElect: true
      resourceName: c1f6bfd2.kueue.x-k8s.io
    controller:
      groupKindConcurrency:
        Job.batch: 5
        Pod: 5
        Workload.kueue.x-k8s.io: 5
        LocalQueue.kueue.x-k8s.io: 1
        ClusterQueue.kueue.x-k8s.io: 1
        ResourceFlavor.kueue.x-k8s.io: 1
    clientConnection:
      qps: 50
      burst: 100
    #pprofBindAddress: :8083
    #waitForPodsReady:
    #  enable: false
    #  timeout: 5m
    #  blockAdmission: false
    #  requeuingStrategy:
    #    timestamp: Eviction
    #    backoffLimitCount: null # null indicates infinite requeuing
    #    backoffBaseSeconds: 60
    #    backoffMaxSeconds: 3600
    #manageJobsWithoutQueueName: true
    #internalCertManagement:
    #  enable: false
    #  webhookServiceName: ""
    #  webhookSecretName: ""
    integrations:
      frameworks:
      - "batch/job"
      - "kubeflow.org/mpijob"
      - "ray.io/rayjob"
      - "ray.io/raycluster"
      - "jobset.x-k8s.io/jobset"
      - "kubeflow.org/mxjob"
      - "kubeflow.org/paddlejob"
      - "kubeflow.org/pytorchjob"
      - "kubeflow.org/tfjob"
      - "kubeflow.org/xgboostjob"
      - "pod"
    #  externalFrameworks:
    #  - "Foo.v1.example.com"
      podOptions:
        # You can change namespaceSelector to define in which
        # namespaces kueue will manage the pods.
        namespaceSelector:
          matchExpressions:
          - key: kubernetes.io/metadata.name
            operator: NotIn
            values: [ kube-system, kueue-system ]
        # Kueue uses podSelector to manage pods with particular
        # labels. The default podSelector will match all the pods.
        podSelector:
          matchExpressions:
          - key: kueue-job
            operator: In
            values: [ "true", "True", "yes" ]
# [END gke_ai_ml_mix_train_and_inference_kueue_patch]