# Source: ai-ml/llm-serving-tpus-jetstream/pytorch/jetstream-pytorch-gemma-7b-it-2x4.yaml
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# [START gke_ai_ml_llm_serving_tpus_jetstream_pytorch_gemma_7b_it_2x4]
# Deployment: two replicas of a pod that pairs the jetstream-pytorch-server
# container with a jetstream-http container. Pods are pinned to TPU nodes via
# the nodeSelector and read/write /models through a Cloud Storage FUSE volume.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jetstream-pytorch-server
spec:
  replicas: 2
  selector:
    matchLabels:
      app: jetstream-pytorch-server
  template:
    metadata:
      labels:
        # Must equal spec.selector.matchLabels above, otherwise the Deployment
        # is rejected; the jetstream-svc Service selects on the same label.
        app: jetstream-pytorch-server
      annotations:
        # Annotation values must be strings, hence the quotes. Per the GKE
        # Cloud Storage FUSE CSI driver docs this opts the pod in to the
        # gcsfuse sidecar that backs the csi volume declared below.
        gke-gcsfuse/volumes: "true"
    spec:
      # Schedule only onto nodes carrying both of these TPU labels.
      nodeSelector:
        cloud.google.com/gke-tpu-topology: 2x4
        cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
      containers:
        - name: jetstream-pytorch-server
          image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.4
          # Flags are passed verbatim to the image's entrypoint; their exact
          # semantics are defined by the jetstream-pytorch server, not here.
          args:
            - --model_id=google/gemma-7b-it
            - --override_batch_size=30
            # Lives under the /models mount, i.e. inside the GCS bucket.
            - --working_dir=/models/pytorch/
            - --enable_model_warmup=True
          volumeMounts:
            - name: gcs-fuse-checkpoint
              mountPath: /models
            - name: huggingface-credentials
              mountPath: /huggingface
              readOnly: true
          ports:
            - containerPort: 9000
          resources:
            # Requests and limits are kept equal for the TPU resource.
            # NOTE(review): 8 presumably corresponds to the 2x4 topology
            # selected above (2 x 4 = 8) -- confirm if either is changed.
            requests:
              google.com/tpu: 8
            limits:
              google.com/tpu: 8
          # All three probes target port 8000, which this container does not
          # declare (it declares 9000); 8000 is the containerPort of the
          # jetstream-http container below. Containers in a pod share one
          # network namespace, so the kubelet's request reaches that container.
          # NOTE(review): presumably its /healthcheck reflects the health of
          # this server -- confirm.
          #
          # Startup budget: 90s initial delay + up to 50 failures x 60s period
          # (roughly 51 minutes) before the container is killed.
          startupProbe:
            httpGet:
              path: /healthcheck
              port: 8000
              scheme: HTTP
            periodSeconds: 60
            initialDelaySeconds: 90
            failureThreshold: 50
          # Restart only after 30 consecutive failures at a 60s period
          # (about 30 minutes).
          livenessProbe:
            httpGet:
              path: /healthcheck
              port: 8000
              scheme: HTTP
            periodSeconds: 60
            failureThreshold: 30
          # Marked unready only after 30 consecutive failures at a 60s period
          # (about 30 minutes).
          readinessProbe:
            httpGet:
              path: /healthcheck
              port: 8000
              scheme: HTTP
            periodSeconds: 60
            failureThreshold: 30
        - name: jetstream-http
          image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.3
          ports:
            # Port targeted by the probes above and by the jetstream-svc Service.
            - containerPort: 8000
      volumes:
        - name: huggingface-credentials
          secret:
            # Octal under the YAML 1.1 rules used by kubectl (= decimal 256,
            # owner read-only). The field is an integer: do not quote it.
            defaultMode: 0400
            # This Secret must already exist in the Deployment's namespace.
            secretName: huggingface-secret
        # Not mounted by any container in this manifest. Per the GKE Cloud
        # Storage FUSE CSI driver docs, a volume with this exact name replaces
        # the gcsfuse sidecar's default cache volume; here it is RAM-backed.
        - name: gke-gcsfuse-cache
          emptyDir:
            medium: Memory
        - name: gcs-fuse-checkpoint
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              # Placeholder: replace with the name of your GCS bucket.
              bucketName: BUCKET_NAME
              # Comma-separated gcsfuse options; must stay on a single line
              # (folding it would insert spaces into the option list).
              mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
---
# Service exposing TCP port 8000 of the pods labelled
# app: jetstream-pytorch-server. No `type` is set, so Kubernetes applies its
# default Service type (ClusterIP, reachable only from inside the cluster).
apiVersion: v1
kind: Service
metadata:
  name: jetstream-svc
spec:
  selector:
    # Must match the pod template labels of the Deployment.
    app: jetstream-pytorch-server
  ports:
    - protocol: TCP
      name: jetstream-http
      port: 8000
      # 8000 is the containerPort declared by the jetstream-http container.
      targetPort: 8000
# [END gke_ai_ml_llm_serving_tpus_jetstream_pytorch_gemma_7b_it_2x4]