# https://github.com/kubeflow/katib
# Raw File
# Tip revision: 9fccfae5b3bb59ce473aad4ff173cc790a117dfe authored by hougang liu on 18 December 2018, 11:02:16 UTC
# sync up worker status all the time
# Tip revision: 9fccfae
# workerConfigMap.yaml
---
# ConfigMap "worker-template" in the "kubeflow" namespace. Each entry under
# `data` holds the text of a batch/v1 Job manifest template, stored as a
# literal block scalar so the template actions inside are kept verbatim.
apiVersion: v1
kind: ConfigMap
metadata:
  name: worker-template
  namespace: kubeflow
data:
  # Job manifest template for CPU workers (no resource limits requested).
  # The double-curly-brace actions use Go text/template syntax; the renderer
  # is not part of this file, but it must supply .WorkerID and
  # .HyperParameters (a list of items with .Name and .Value).
  # Each hyperparameter is appended to the training command as a single
  # "<Name>=<Value>" argument after the fixed "--batch-size=64".
  # .WorkerID is quoted so that an ID which looks numeric once rendered
  # (all digits, or digits around an "e") is still read as a string when
  # the rendered text is parsed as YAML.
  cpuWorkerTemplate.yaml: |-
    apiVersion: batch/v1
    kind: Job
    metadata:
      name: "{{.WorkerID}}"
      namespace: kubeflow
    spec:
      template:
        spec:
          containers:
          - name: "{{.WorkerID}}"
            image: katib/mxnet-mnist-example
            command:
            - "python"
            - "/mxnet/example/image-classification/train_mnist.py"
            - "--batch-size=64"
            {{- with .HyperParameters}}
            {{- range .}}
            - "{{.Name}}={{.Value}}"
            {{- end}}
            {{- end}}
          restartPolicy: Never
  # Job manifest template for GPU workers: same command as the CPU template,
  # but uses the ":gpu" image tag and sets a limit of one nvidia.com/gpu.
  # The double-curly-brace actions use Go text/template syntax; the renderer
  # is not part of this file, but it must supply .WorkerID and
  # .HyperParameters (a list of items with .Name and .Value).
  # Each hyperparameter is appended to the training command as a single
  # "<Name>=<Value>" argument after the fixed "--batch-size=64".
  # .WorkerID is quoted so that an ID which looks numeric once rendered
  # (all digits, or digits around an "e") is still read as a string when
  # the rendered text is parsed as YAML.
  gpuWorkerTemplate.yaml: |-
    apiVersion: batch/v1
    kind: Job
    metadata:
      name: "{{.WorkerID}}"
      namespace: kubeflow
    spec:
      template:
        spec:
          containers:
          - name: "{{.WorkerID}}"
            image: katib/mxnet-mnist-example:gpu
            command:
            - "python"
            - "/mxnet/example/image-classification/train_mnist.py"
            - "--batch-size=64"
            {{- with .HyperParameters}}
            {{- range .}}
            - "{{.Name}}={{.Value}}"
            {{- end}}
            {{- end}}
            resources:
              limits:
                nvidia.com/gpu: 1
          restartPolicy: Never
# back to top