gpu-workload/t5/kubernetes/serving-gpu.yaml

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: t5-inference
  labels:
    model: t5
    version: v1.0
    machine: gpu
spec:
  replicas: 1
  selector:
    matchLabels:
      model: t5
      version: v1.0
      machine: gpu
  template:
    metadata:
      labels:
        model: t5
        version: v1.0
        machine: gpu
    spec:
      # Schedule onto GKE nodes equipped with an NVIDIA Tesla T4 accelerator.
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-t4
      # Run as a non-root user and group.
      securityContext:
        fsGroup: 1000
        runAsUser: 1000
        runAsGroup: 1000
      containers:
        - name: inference
          image: gcr.io/lustrous-baton-363720/models/t5-small:1.0-gpu
          # image: IMAGE_NAME
          imagePullPolicy: IfNotPresent
          args: ["torchserve", "--start", "--foreground"]
          # Requests equal limits, so the pod gets the Guaranteed QoS class.
          resources:
            limits:
              nvidia.com/gpu: "1"
              cpu: "3000m"
              memory: 16Gi
              ephemeral-storage: 10Gi
            requests:
              nvidia.com/gpu: "1"
              cpu: "3000m"
              memory: 16Gi
              ephemeral-storage: 10Gi
          # Default TorchServe ports: 8080 inference, 8081 management, 8082 metrics.
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 8081
              name: management
            - containerPort: 8082
              name: metrics
          # Readiness: TorchServe health check on the inference port.
          readinessProbe:
            httpGet:
              path: /ping
              port: http
            initialDelaySeconds: 120
            failureThreshold: 10
          # Liveness: query the model's status through the management API.
          livenessProbe:
            httpGet:
              path: /models/t5-small
              port: management
            initialDelaySeconds: 150
            periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: t5-inference
  labels:
    model: t5
    version: v1.0
    machine: gpu
spec:
  type: ClusterIP
  selector:
    model: t5
    version: v1.0
    machine: gpu
  ports:
    - port: 8080
      name: http
      targetPort: http
    - port: 8081
      name: management
      targetPort: management
    - port: 8082
      name: metrics
      targetPort: metrics