# Source: example/integrations/mxnet/train/train-mnist-cpu.yaml
# Volcano Job that runs the volcanosh/mxnet-train-mnist-cpu image as three
# task groups: 2 workers, 2 servers and 1 scheduler. Every container gets the
# same DMLC_* environment except for DMLC_ROLE, which names the task's role.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: mxnet-job
spec:
  # Equals the total number of pods declared below (2 + 2 + 1). Keep it in
  # sync with the task replica counts if those are changed.
  minAvailable: 5
  schedulerName: volcano
  # Both pod eviction and pod failure map to the RestartJob action.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  plugins:
    # svc plugin enabled with no arguments.
    # NOTE(review): presumably this is what makes the host name used in
    # DMLC_PS_ROOT_URI resolvable; confirm against the Volcano plugin docs.
    svc: []
  tasks:
    - replicas: 2
      name: worker
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - image: volcanosh/mxnet-train-mnist-cpu:v1
              # Only the worker task passes this argument to the image.
              args:
                - --kv-store=dist_sync
              imagePullPolicy: IfNotPresent
              name: mxnet
              env:
                # Port and host are identical in all three tasks. The host
                # is built from the job name, the `scheduler` task name and
                # pod index 0.
                - name: DMLC_PS_ROOT_PORT
                  value: "9000"
                - name: DMLC_PS_ROOT_URI
                  value: mxnet-job-scheduler-0.mxnet-job
                # These two values match the replica counts of the `server`
                # and `worker` tasks; keep them in sync with those counts.
                - name: DMLC_NUM_SERVER
                  value: "2"
                - name: DMLC_NUM_WORKER
                  value: "2"
                - name: DMLC_ROLE
                  value: "worker"
                - name: DMLC_USE_KUBERNETES
                  value: "1"
          restartPolicy: OnFailure
    - replicas: 2
      name: server
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - image: volcanosh/mxnet-train-mnist-cpu:v1
              imagePullPolicy: IfNotPresent
              name: mxnet
              # Same DMLC_* settings as the worker task, except DMLC_ROLE.
              env:
                - name: DMLC_PS_ROOT_PORT
                  value: "9000"
                - name: DMLC_PS_ROOT_URI
                  value: mxnet-job-scheduler-0.mxnet-job
                - name: DMLC_NUM_SERVER
                  value: "2"
                - name: DMLC_NUM_WORKER
                  value: "2"
                - name: DMLC_ROLE
                  value: "server"
                - name: DMLC_USE_KUBERNETES
                  value: "1"
          restartPolicy: OnFailure
    # Single replica: DMLC_PS_ROOT_URI in every task refers to pod index 0
    # of this task.
    - replicas: 1
      name: scheduler
      template:
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            - image: volcanosh/mxnet-train-mnist-cpu:v1
              imagePullPolicy: IfNotPresent
              name: mxnet
              # Same DMLC_* settings as the worker task, except DMLC_ROLE.
              env:
                - name: DMLC_PS_ROOT_PORT
                  value: "9000"
                - name: DMLC_PS_ROOT_URI
                  value: mxnet-job-scheduler-0.mxnet-job
                - name: DMLC_NUM_SERVER
                  value: "2"
                - name: DMLC_NUM_WORKER
                  value: "2"
                - name: DMLC_ROLE
                  value: "scheduler"
                - name: DMLC_USE_KUBERNETES
                  value: "1"
          restartPolicy: OnFailure