# example/kubecon-2019-china/horovod-sample/lm-horovod-tf-mnist-v0.5.yaml
# Volcano Job that runs the Horovod TensorFlow MNIST sample with one
# "master" task (launches mpiexec) and three "worker" tasks (run sshd).
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: lm-horovod-job
  labels:
    "volcano.sh/job-type": Horovod
spec:
  # Equals the total replica count declared below (1 master + 3 workers).
  minAvailable: 4
  schedulerName: volcano
  # The master script reads /etc/volcano/worker.host and reaches the workers
  # over SSH (port 22); presumably these plugins provide the host file and
  # the SSH setup -- confirm against the Volcano plugin documentation.
  plugins:
    ssh: []
    svc: []
  # If any pod is killed (evicted), restart the whole job.
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    - replicas: 1
      name: master
      # The job is marked complete as soon as the master task completes.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          containers:
            # Script steps:
            #   1. Join the lines of worker.host into a comma-separated list.
            #   2. Start sshd in the background (no -D).
            #   3. Launch the training script through mpiexec with 3
            #      processes; keep "-np 3" in sync with the worker replicas.
            - command:
                - /bin/sh
                - -c
                - |
                  WORKER_HOST=`cat /etc/volcano/worker.host | tr "\n" ","`;
                  mkdir -p /var/run/sshd; /usr/sbin/sshd;
                  mpiexec --allow-run-as-root --host ${WORKER_HOST} -np 3 python tensorflow_mnist_lm.py;
              image: volcanosh/horovod-tf-mnist:0.5
              name: master
              ports:
                - containerPort: 22
                  name: job-port
              # Requests and limits are identical.
              resources:
                requests:
                  cpu: "500m"
                  memory: "1024Mi"
                limits:
                  cpu: "500m"
                  memory: "1024Mi"
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
    - replicas: 3
      name: worker
      template:
        spec:
          containers:
            # Workers only run sshd; -D keeps it in the foreground so the
            # container stays alive while the master drives the MPI ranks.
            - command:
                - /bin/sh
                - -c
                - |
                  mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
              image: volcanosh/horovod-tf-mnist:0.5
              name: worker
              ports:
                - containerPort: 22
                  name: job-port
              # Requests and limits are identical.
              resources:
                requests:
                  cpu: "1000m"
                  memory: "2048Mi"
                limits:
                  cpu: "1000m"
                  memory: "2048Mi"
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
---