example/integrations/paddlepaddle/ctr-paddlepaddle-on-volcano.yaml (183 lines of code) (raw):
# Volcano Job "ctr-volcano": runs the volcanosh/edlctr:v1 image as two task
# groups, "pserver" (2 replicas) and "trainer" (2 replicas).
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: ctr-volcano
spec:
  # Equals the total replica count declared under tasks (2 pserver + 2 trainer).
  minAvailable: 4
  schedulerName: volcano
  # Job-level policies: restart the whole job when a pod is evicted or fails.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  tasks:
    # ---- pserver task -------------------------------------------------------
    - replicas: 2
      name: pserver
      template:
        metadata:
          labels:
            paddle-job-pserver: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            # Host directory /home/work/ is mounted in the container at
            # /mnt/seqdata (see volumeMounts below).
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            - image: volcanosh/edlctr:v1
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: pserver
              volumeMounts:
                - mountPath: /mnt/seqdata
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  memory: 100M
                  ephemeral-storage: 1Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                # The next four variables are filled from the pod's own
                # fields via fieldRef.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                # Both counts match the replicas values of the two tasks in
                # this Job.
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: ENTRY
                  value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
                - name: PADDLE_PORT
                  value: "30236"
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: PADDLE_TRAINING_ROLE
                  value: PSERVER
                - name: TRAINING_ROLE
                  value: PSERVER
          restartPolicy: OnFailure
    # ---- trainer task -------------------------------------------------------
    - replicas: 2
      # Task-level policy: once this task completes, complete the whole job.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      name: trainer
      template:
        metadata:
          labels:
            paddle-job: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            # Same host directory and mount point as the pserver task.
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            - image: volcanosh/edlctr:v1
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: trainer
              volumeMounts:
                - mountPath: /mnt/seqdata
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  memory: 100M
                  ephemeral-storage: 10Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                # Explicit empty string, matching the pserver task (an omitted
                # value defaults to "" as well).
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                - name: CPU_NUM
                  value: "2"
                # The next four variables are filled from the pod's own
                # fields via fieldRef.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: PADDLE_PORT
                  value: "30236"
                # Both counts match the replicas values of the two tasks in
                # this Job.
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_TRAINING_ROLE
                  value: TRAINER
                - name: TRAINING_ROLE
                  value: TRAINER
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: ENTRY
                  value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
          restartPolicy: OnFailure