https://github.com/kubeflow/katib
Raw File
Tip revision: de54e9c094821e8bc91023666ed6bac2ee08fdaa authored by DeeperMind on 08 March 2019, 02:47:16 UTC
add fault tolerance for trial failure
Tip revision: de54e9c
random-example.yaml
# StudyJob custom resource for the "random-example" study.
apiVersion: kubeflow.org/v1alpha1
kind: StudyJob
metadata:
  name: random-example
  namespace: kubeflow
  labels:
    # Quoted so the value stays the string "1.0" instead of a float.
    controller-tools.k8s.io: "1.0"
spec:
  studyName: random-example
  owner: crd
  # Objective settings. Presumably: maximize "Validation-accuracy" with 0.99
  # as the target value -- confirm exact semantics against the StudyJob CRD.
  objectivevaluename: Validation-accuracy
  optimizationtype: maximize
  optimizationgoal: 0.99
  # NOTE(review): assumed to be metrics collected besides the objective.
  metricsnames:
    - accuracy
  requestcount: 4
  # Search space: one entry per hyperparameter. Each name is spelled as a
  # command-line flag because the worker template below appends every
  # parameter to the training command as "<name>=<value>".
  parameterconfigs:
    - name: --lr
      parametertype: double
      feasible:
        # Bounds are quoted, so they parse as strings rather than numbers.
        min: "0.01"
        max: "0.03"
    - name: --num-layers
      parametertype: int
      feasible:
        min: "2"
        max: "5"
    - name: --optimizer
      parametertype: categorical
      feasible:
        # Items are indented under the key, matching the other sequences in
        # this manifest (metricsnames, parameterconfigs).
        list:
          - sgd
          - adam
          - ftrl
  # Worker definition: a Go template that renders to a batch/v1 Job.
  # The template reads .WorkerID (used for both the Job and container name)
  # and .HyperParameters (each entry appended to the command as
  # "<Name>=<Value>").
  workerSpec:
    goTemplate:
      # "|-" keeps the template's line breaks and strips the final newline.
      # Templated scalars are quoted so the rendered YAML always yields a
      # string, whatever the substituted value looks like.
      rawTemplate: |-
        apiVersion: batch/v1
        kind: Job
        metadata:
          name: "{{.WorkerID}}"
          namespace: kubeflow
        spec:
          template:
            spec:
              containers:
              - name: "{{.WorkerID}}"
                image: katib/mxnet-mnist-example
                command:
                - "python"
                - "/mxnet/example/image-classification/train_mnist.py"
                - "--batch-size=64"
                {{- with .HyperParameters}}
                {{- range .}}
                - "{{.Name}}={{.Value}}"
                {{- end}}
                {{- end}}
              restartPolicy: Never
  # Suggestion settings: the "random" algorithm with requestNumber 3.
  suggestionSpec:
    requestNumber: 3
    suggestionAlgorithm: random
back to top