# gpu-workload/t5/kubernetes/serving-cpu.yaml (85 lines of code) (raw):
---
# Deployment: single-replica TorchServe instance serving the t5-small model on CPU.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: t5-inference
  labels:
    model: t5
    version: v1.0
    machine: cpu
spec:
  replicas: 1
  selector:
    matchLabels:
      model: t5
      version: v1.0
      machine: cpu
  template:
    metadata:
      labels:
        model: t5
        version: v1.0
        machine: cpu
    spec:
      # Run the pod unprivileged as uid/gid 1000.
      securityContext:
        fsGroup: 1000
        runAsUser: 1000
        runAsGroup: 1000
      containers:
        - name: inference
          image: gcr.io/lustrous-baton-363720/models/t5-small:1.0-cpu
          # image: IMAGE_NAME
          imagePullPolicy: IfNotPresent
          # Keep TorchServe in the foreground so the container stays alive.
          args: ["torchserve", "--start", "--foreground"]
          resources:
            # requests == limits (Guaranteed-style sizing for CPU/memory).
            limits:
              cpu: "3000m"
              memory: 16Gi
              ephemeral-storage: 10Gi
            requests:
              cpu: "3000m"
              memory: 16Gi
              ephemeral-storage: 10Gi
          ports:
            - containerPort: 8080
              name: http
            - containerPort: 8081
              name: management
            - containerPort: 8082
              name: metrics
          # Readiness: TorchServe ping endpoint on the inference (http) port.
          readinessProbe:
            httpGet:
              path: /ping
              port: http
            initialDelaySeconds: 120
            failureThreshold: 10
          # Liveness: query the registered model via the management port.
          livenessProbe:
            httpGet:
              path: /models/t5-small
              port: management
            initialDelaySeconds: 150
            periodSeconds: 5
---
# Service: cluster-internal access to the TorchServe pods selected by the
# same model/version/machine labels as the Deployment above.
apiVersion: v1
kind: Service
metadata:
  name: t5-inference
  labels:
    model: t5
    version: v1.0
    machine: cpu
spec:
  type: ClusterIP
  selector:
    model: t5
    version: v1.0
    machine: cpu
  ports:
    # targetPort references the named container ports, so the Service keeps
    # working even if the container port numbers change.
    - port: 8080
      name: http
      targetPort: http
    - port: 8081
      name: management
      targetPort: management
    - port: 8082
      name: metrics
      targetPort: metrics