# ai-ml/llm-serving-tpus-jetstream/pytorch/jetstream-pytorch-gemma-7b-it-2x4.yaml

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START gke_ai_ml_llm_serving_tpus_jetstream_pytorch_gemma_7b_it_2x4]
# Deployment of the JetStream PyTorch server for google/gemma-7b-it.
# Each Pod runs two containers: the model server (containerPort 9000) and an
# HTTP front end (containerPort 8000). The Service at the bottom of this file
# exposes the HTTP front end on port 8000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jetstream-pytorch-server
spec:
  replicas: 2
  selector:
    matchLabels:
      app: jetstream-pytorch-server
  template:
    metadata:
      labels:
        app: jetstream-pytorch-server
      annotations:
        # Quoted so the annotation value stays a string, not a YAML boolean.
        gke-gcsfuse/volumes: "true"
    spec:
      # Schedule only onto nodes labelled as a 2x4 tpu-v5-lite-podslice.
      nodeSelector:
        cloud.google.com/gke-tpu-topology: 2x4
        cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
      containers:
        - name: jetstream-pytorch-server
          image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-pytorch-server:v0.2.4
          args:
            - --model_id=google/gemma-7b-it
            - --override_batch_size=30
            # Lives under the /models mount backed by the gcs-fuse-checkpoint
            # volume declared below.
            - --working_dir=/models/pytorch/
            - --enable_model_warmup=True
          volumeMounts:
            - name: gcs-fuse-checkpoint
              mountPath: /models
            - name: huggingface-credentials
              mountPath: /huggingface
              readOnly: true
          ports:
            - containerPort: 9000
          resources:
            # Requests and limits are kept equal at 8 google.com/tpu.
            requests:
              google.com/tpu: 8
            limits:
              google.com/tpu: 8
          # All three probes call /healthcheck on port 8000, which is the
          # containerPort declared by the jetstream-http container in this
          # Pod, not this container's own port 9000.
          startupProbe:
            httpGet:
              path: /healthcheck
              port: 8000
              scheme: HTTP
            periodSeconds: 60
            initialDelaySeconds: 90
            failureThreshold: 50
          livenessProbe:
            httpGet:
              path: /healthcheck
              port: 8000
              scheme: HTTP
            periodSeconds: 60
            failureThreshold: 30
          readinessProbe:
            httpGet:
              path: /healthcheck
              port: 8000
              scheme: HTTP
            periodSeconds: 60
            failureThreshold: 30
        - name: jetstream-http
          image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.3
          ports:
            - containerPort: 8000
      volumes:
        - name: huggingface-credentials
          secret:
            # 256 decimal == 0400 octal (owner read-only). Written in decimal
            # because a leading-zero literal is octal only under YAML 1.1 and
            # would be read as decimal 400 by a YAML 1.2 parser.
            defaultMode: 256
            secretName: huggingface-secret
        # NOTE(review): no container in this manifest mounts this volume;
        # presumably it is consumed by the GCS FUSE sidecar - confirm.
        - name: gke-gcsfuse-cache
          emptyDir:
            medium: Memory
        - name: gcs-fuse-checkpoint
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              # BUCKET_NAME looks like a placeholder to substitute before
              # applying - confirm against the accompanying instructions.
              bucketName: BUCKET_NAME
              mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1"
---
# Service that routes TCP port 8000 to port 8000 on Pods labelled
# app=jetstream-pytorch-server (the jetstream-http container's port).
apiVersion: v1
kind: Service
metadata:
  name: jetstream-svc
spec:
  selector:
    app: jetstream-pytorch-server
  ports:
    - protocol: TCP
      name: jetstream-http
      port: 8000
      targetPort: 8000
# [END gke_ai_ml_llm_serving_tpus_jetstream_pytorch_gemma_7b_it_2x4]

# ai-ml/llm-serving-tpus-jetstream/pytorch/jetstream-pytorch-gemma-7b-it-2x4.yaml (93 lines of code) (raw)