example/kubecon-2019-china/mpi-sample/mpi-example.yaml (73 lines of code) (raw):

# Volcano Job that runs an MPI "hello world": one mpimaster task launches
# mpiexec against two mpiworker tasks that only run sshd.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: lm-mpi-job
  labels:
    # Set the job type according to business needs.
    "volcano.sh/job-type": "MPI"
spec:
  # Minimum number of replicas the job needs. It must not exceed the total
  # replica count across all tasks (here 1 mpimaster + 2 mpiworker = 3).
  minAvailable: 3
  schedulerName: volcano
  plugins:
    # Provides passwordless SSH authentication.
    ssh: []
    # Provides the network information needed to run the job
    # (hosts file, headless service, etc.).
    svc: []
  # If any pod is evicted, restart the whole job.
  policies:
    - event: PodEvicted
      action: RestartJob
  tasks:
    - replicas: 1
      name: mpimaster
      # When mpiexec finishes, consider the whole MPI job complete.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        spec:
          # Volcano places its information under /etc/volcano.
          containers:
            # The script joins the lines of /etc/volcano/mpiworker.host into a
            # comma-separated host list, starts sshd, then runs mpiexec.
            # "-np 2" equals the mpiworker replica count below; keep them in
            # sync when scaling the workers.
            - command:
                - /bin/sh
                - -c
                - |
                  MPI_HOST=`cat /etc/volcano/mpiworker.host | tr "\n" ","`;
                  mkdir -p /var/run/sshd;
                  /usr/sbin/sshd;
                  mpiexec --allow-run-as-root --host ${MPI_HOST} -np 2 mpi_hello_world;
              image: volcanosh/example-mpi:0.0.1
              name: mpimaster
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /home
              resources:
                requests:
                  cpu: "500m"
                  memory: "1024Mi"
                limits:
                  cpu: "500m"
                  memory: "1024Mi"
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
    - replicas: 2
      name: mpiworker
      template:
        spec:
          containers:
            # Workers only run sshd; "-D" keeps it in the foreground so it
            # stays the container's main process.
            - command:
                - /bin/sh
                - -c
                - |
                  mkdir -p /var/run/sshd;
                  /usr/sbin/sshd -D;
              image: volcanosh/example-mpi:0.0.1
              name: mpiworker
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /home
              resources:
                requests:
                  cpu: "1024m"
                  memory: "2048Mi"
                limits:
                  cpu: "1024m"
                  memory: "2048Mi"
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret