# example/kubecon-2019-china/horovod-sample/lm-horovod-tf-mnist-v0.5.yaml (72 lines of code) (raw):

---
# Volcano Job that runs the Horovod TensorFlow MNIST sample with one MPI
# launcher pod ("master") and three sshd pods ("worker").
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: lm-horovod-job
  labels:
    volcano.sh/job-type: Horovod
spec:
  schedulerName: volcano
  # 4 = 1 master replica + 3 worker replicas declared under `tasks` below.
  minAvailable: 4
  # NOTE(review): the master reads /etc/volcano/worker.host and reaches the
  # workers over sshd; presumably these two plugins provide that host file and
  # the SSH setup -- confirm against the Volcano plugin documentation.
  plugins:
    ssh: []
    svc: []
  # If any pod is killed (evicted), restart the whole job.
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    # Launcher task: starts sshd, then drives the MPI run across the workers.
    - name: master
      replicas: 1
      # Mark the whole job complete once this task completes.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: master
              image: volcanosh/horovod-tf-mnist:0.5
              # Script steps:
              #   1. Join the lines of worker.host with commas into WORKER_HOST.
              #   2. Create sshd's runtime directory and start sshd.
              #   3. Launch 3 MPI processes (-np 3, matching the 3 worker
              #      replicas below) on the hosts listed in WORKER_HOST.
              command:
                - /bin/sh
                - -c
                - |
                  WORKER_HOST=`cat /etc/volcano/worker.host | tr "\n" ","`;
                  mkdir -p /var/run/sshd; /usr/sbin/sshd;
                  mpiexec --allow-run-as-root --host ${WORKER_HOST} -np 3 python tensorflow_mnist_lm.py;
              ports:
                - name: job-port
                  containerPort: 22
              # Requests equal limits for both CPU and memory.
              resources:
                requests:
                  cpu: "500m"
                  memory: "1024Mi"
                limits:
                  cpu: "500m"
                  memory: "1024Mi"
    # Worker task: each replica only runs sshd so the master can reach it.
    - name: worker
      replicas: 3
      template:
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: worker
              image: volcanosh/horovod-tf-mnist:0.5
              # `sshd -D` stays in the foreground, so it is the container's
              # long-running process.
              command:
                - /bin/sh
                - -c
                - |
                  mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
              ports:
                - name: job-port
                  containerPort: 22
              # Requests equal limits for both CPU and memory.
              resources:
                requests:
                  cpu: "1000m"
                  memory: "2048Mi"
                limits:
                  cpu: "1000m"
                  memory: "2048Mi"
---