ai-ml/gke-ray/rayserve/llm/tpu/ray-cluster.tpu-v6e-singlehost.yaml (126 lines of code) (raw):
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# [START gke_ai_ml_gke_ray_rayserve_raycluster_tpu_v6e_singlehost]
apiVersion: ray.io/v1
kind: RayCluster
metadata:
name: vllm-tpu
spec:
headGroupSpec:
rayStartParams: {}
template:
metadata:
annotations:
gke-gcsfuse/volumes: "true"
gke-gcsfuse/cpu-limit: "0"
gke-gcsfuse/memory-limit: "0"
gke-gcsfuse/ephemeral-storage-limit: "0"
spec:
serviceAccountName: $KSA_NAME
containers:
- name: ray-head
image: $VLLM_IMAGE
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: "2"
memory: 8G
requests:
cpu: "2"
memory: 8G
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-secret
key: hf_api_token
- name: VLLM_XLA_CACHE_PATH
value: "/data"
ports:
- containerPort: 6379
name: gcs
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: serve
- containerPort: 8471
name: slicebuilder
- containerPort: 8081
name: mxla
volumeMounts:
- name: gcs-fuse-csi-ephemeral
mountPath: /data
- name: dshm
mountPath: /dev/shm
volumes:
- name: gke-gcsfuse-cache
emptyDir:
medium: Memory
- name: dshm
emptyDir:
medium: Memory
- name: gcs-fuse-csi-ephemeral
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: $GSBUCKET
mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
workerGroupSpecs:
- groupName: tpu-group
replicas: 1
minReplicas: 1
maxReplicas: 1
numOfHosts: 1
rayStartParams: {}
template:
metadata:
annotations:
gke-gcsfuse/volumes: "true"
gke-gcsfuse/cpu-limit: "0"
gke-gcsfuse/memory-limit: "0"
gke-gcsfuse/ephemeral-storage-limit: "0"
spec:
serviceAccountName: $KSA_NAME
containers:
- name: ray-worker
image: $VLLM_IMAGE
imagePullPolicy: IfNotPresent
resources:
limits:
cpu: "100"
google.com/tpu: "8"
ephemeral-storage: 40G
memory: 200G
requests:
cpu: "100"
google.com/tpu: "8"
ephemeral-storage: 40G
memory: 200G
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-secret
key: hf_api_token
- name: VLLM_XLA_CACHE_PATH
value: "/data"
volumeMounts:
- name: gcs-fuse-csi-ephemeral
mountPath: /data
- name: dshm
mountPath: /dev/shm
volumes:
- name: gke-gcsfuse-cache
emptyDir:
medium: Memory
- name: dshm
emptyDir:
medium: Memory
- name: gcs-fuse-csi-ephemeral
csi:
driver: gcsfuse.csi.storage.gke.io
volumeAttributes:
bucketName: $GSBUCKET
mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
cloud.google.com/gke-tpu-topology: 2x4
# [END gke_ai_ml_gke_ray_rayserve_raycluster_tpu_v6e_singlehost]