# ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-singlehost.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# RayCluster "vllm-tpu": one CPU-only head pod plus one TPU worker group with
# a single replica on a single host (numOfHosts: 1).
#
# NOTE(review): $KSA_NAME, $VLLM_IMAGE and $GSBUCKET are placeholders, not
# valid values on their own; presumably they are substituted (e.g. with
# envsubst) before the manifest is applied -- confirm against the deploy steps.

# [START gke_ai_ml_gke_ray_rayserve_raycluster_tpu_v6e_singlehost]
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: vllm-tpu
spec:
  headGroupSpec:
    rayStartParams: {}
    template:
      metadata:
        annotations:
          # Annotations read by the GKE Cloud Storage FUSE CSI driver. Values
          # are quoted so they stay strings, as annotations require.
          # NOTE(review): "0" is understood to lift the corresponding sidecar
          # resource limit -- confirm against the GCS FUSE CSI driver docs.
          gke-gcsfuse/volumes: "true"
          gke-gcsfuse/cpu-limit: "0"
          gke-gcsfuse/memory-limit: "0"
          gke-gcsfuse/ephemeral-storage-limit: "0"
      spec:
        serviceAccountName: $KSA_NAME
        containers:
          - name: ray-head
            image: $VLLM_IMAGE
            imagePullPolicy: IfNotPresent
            resources:
              # Requests equal limits for every resource listed.
              limits:
                cpu: "2"
                memory: 8G
              requests:
                cpu: "2"
                memory: 8G
            env:
              # Token is read from the hf-secret Secret rather than inlined.
              - name: HUGGING_FACE_HUB_TOKEN
                valueFrom:
                  secretKeyRef:
                    name: hf-secret
                    key: hf_api_token
              # Same path as the bucket mount below (/data).
              - name: VLLM_XLA_CACHE_PATH
                value: "/data"
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: serve
              - containerPort: 8471
                name: slicebuilder
              - containerPort: 8081
                name: mxla
            volumeMounts:
              - name: gcs-fuse-csi-ephemeral
                mountPath: /data
              - name: dshm
                mountPath: /dev/shm
        volumes:
          # Not mounted by the container above.
          # NOTE(review): presumably picked up by name by the GCS FUSE sidecar
          # as its file-cache volume -- confirm against the CSI driver docs.
          - name: gke-gcsfuse-cache
            emptyDir:
              medium: Memory
          # RAM-backed emptyDir mounted at /dev/shm.
          - name: dshm
            emptyDir:
              medium: Memory
          # Bucket $GSBUCKET exposed through the GCS FUSE CSI driver.
          - name: gcs-fuse-csi-ephemeral
            csi:
              driver: gcsfuse.csi.storage.gke.io
              volumeAttributes:
                bucketName: $GSBUCKET
                mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
  workerGroupSpecs:
    - groupName: tpu-group
      # Fixed size: replicas, minReplicas and maxReplicas are all 1.
      replicas: 1
      minReplicas: 1
      maxReplicas: 1
      numOfHosts: 1
      rayStartParams: {}
      template:
        metadata:
          annotations:
            # Same GCS FUSE annotations as the head pod template.
            gke-gcsfuse/volumes: "true"
            gke-gcsfuse/cpu-limit: "0"
            gke-gcsfuse/memory-limit: "0"
            gke-gcsfuse/ephemeral-storage-limit: "0"
        spec:
          serviceAccountName: $KSA_NAME
          containers:
            - name: ray-worker
              image: $VLLM_IMAGE
              imagePullPolicy: IfNotPresent
              resources:
                # Requests equal limits. The google.com/tpu count of 8 matches
                # the 2x4 topology selected in nodeSelector below (2 * 4 = 8).
                limits:
                  cpu: "100"
                  google.com/tpu: "8"
                  ephemeral-storage: 40G
                  memory: 200G
                requests:
                  cpu: "100"
                  google.com/tpu: "8"
                  ephemeral-storage: 40G
                  memory: 200G
              env:
                - name: HUGGING_FACE_HUB_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-secret
                      key: hf_api_token
                # Same path as the bucket mount below (/data).
                - name: VLLM_XLA_CACHE_PATH
                  value: "/data"
              volumeMounts:
                - name: gcs-fuse-csi-ephemeral
                  mountPath: /data
                - name: dshm
                  mountPath: /dev/shm
          volumes:
            # Same three volumes as the head pod template.
            - name: gke-gcsfuse-cache
              emptyDir:
                medium: Memory
            - name: dshm
              emptyDir:
                medium: Memory
            - name: gcs-fuse-csi-ephemeral
              csi:
                driver: gcsfuse.csi.storage.gke.io
                volumeAttributes:
                  bucketName: $GSBUCKET
                  mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
          # Schedule the worker only onto nodes carrying these TPU labels.
          nodeSelector:
            cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
            cloud.google.com/gke-tpu-topology: 2x4
# [END gke_ai_ml_gke_ray_rayserve_raycluster_tpu_v6e_singlehost]

# ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-singlehost.yaml (126 lines of code) (raw)