# example/integrations/mxnet/train/train-mnist-cpu.yaml

---
# Volcano Job that launches the volcanosh/mxnet-train-mnist-cpu:v1 image in
# three roles: worker, server, and scheduler. Every task shares the same DMLC_*
# environment except for DMLC_ROLE.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: mxnet-job
spec:
  schedulerName: volcano
  # Equals the sum of the task replicas below
  # (2 workers + 2 servers + 1 scheduler).
  minAvailable: 5
  # Restart the whole job when any pod is evicted or fails.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  plugins:
    # svc plugin enabled with no arguments.
    # NOTE(review): the DMLC_PS_ROOT_URI host name used below presumably
    # depends on the DNS entries this plugin provides -- confirm.
    svc: []
  tasks:
    # Workers: the only task that passes container args.
    - name: worker
      replicas: 2
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: mxnet
              image: volcanosh/mxnet-train-mnist-cpu:v1
              imagePullPolicy: IfNotPresent
              args:
                - --kv-store=dist_sync
              env:
                - name: DMLC_PS_ROOT_PORT
                  value: "9000"
                - name: DMLC_PS_ROOT_URI
                  value: mxnet-job-scheduler-0.mxnet-job
                # Matches `replicas` of the server task.
                - name: DMLC_NUM_SERVER
                  value: "2"
                # Matches `replicas` of this worker task.
                - name: DMLC_NUM_WORKER
                  value: "2"
                - name: DMLC_ROLE
                  value: "worker"
                - name: DMLC_USE_KUBERNETES
                  value: "1"

    # Servers.
    - name: server
      replicas: 2
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: mxnet
              image: volcanosh/mxnet-train-mnist-cpu:v1
              imagePullPolicy: IfNotPresent
              env:
                - name: DMLC_PS_ROOT_PORT
                  value: "9000"
                - name: DMLC_PS_ROOT_URI
                  value: mxnet-job-scheduler-0.mxnet-job
                # Matches `replicas` of this server task.
                - name: DMLC_NUM_SERVER
                  value: "2"
                # Matches `replicas` of the worker task.
                - name: DMLC_NUM_WORKER
                  value: "2"
                - name: DMLC_ROLE
                  value: "server"
                - name: DMLC_USE_KUBERNETES
                  value: "1"

    # Scheduler: single replica; the other tasks point DMLC_PS_ROOT_URI at
    # its pod (mxnet-job-scheduler-0).
    - name: scheduler
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: mxnet
              image: volcanosh/mxnet-train-mnist-cpu:v1
              imagePullPolicy: IfNotPresent
              env:
                - name: DMLC_PS_ROOT_PORT
                  value: "9000"
                - name: DMLC_PS_ROOT_URI
                  value: mxnet-job-scheduler-0.mxnet-job
                - name: DMLC_NUM_SERVER
                  value: "2"
                - name: DMLC_NUM_WORKER
                  value: "2"
                - name: DMLC_ROLE
                  value: "scheduler"
                - name: DMLC_USE_KUBERNETES
                  value: "1"