# Revision b3a80a856488550c2dd4b6231c7a40952ce29dbf authored by hougang liu on 13 December 2018, 23:05:39 UTC, committed by hougang liu on 13 December 2018, 23:05:39 UTC
# 1 parent 51af3d7
# Raw File
# pytorchjob-example.yaml
# StudyJob custom resource (kubeflow.org/v1alpha1) named "pytorchjob-example",
# created in the "kubeflow" namespace.
apiVersion: kubeflow.org/v1alpha1
kind: StudyJob
metadata:
  name: pytorchjob-example
  namespace: kubeflow
  labels:
    # Quoted so the label value stays the string "1.0" rather than a float.
    controller-tools.k8s.io: '1.0'
spec:
  studyName: pytorchjob-example
  owner: crd

  # Objective: maximize the "accuracy" metric, with a goal value of 0.99.
  objectivevaluename: accuracy
  optimizationtype: maximize
  optimizationgoal: 0.99
  metricsnames:
    - accuracy

  # NOTE(review): presumably the number of suggestion rounds requested for
  # the study -- confirm against the StudyJob API reference.
  requestcount: 4

  # Search space. Parameter names keep their leading dashes because the
  # worker template renders every hyperparameter as a "name=value" argument.
  # The feasible bounds are written as strings, not floats.
  parameterconfigs:
    - name: '--lr'
      parametertype: double
      feasible:
        min: '0.01'
        max: '0.05'
    - name: '--momentum'
      parametertype: double
      feasible:
        min: '0.5'
        max: '0.9'
  # Worker definition: a Go template that renders one PyTorchJob per worker.
  # The job is named after the rendered .WorkerID and runs a single Master
  # replica from the pytorch-mnist-with-summary image.
  workerSpec:
    # NOTE(review): presumably keeps the worker resources after completion --
    # confirm against the StudyJob API reference.
    retain: true
    goTemplate:
      # The "{{- with}}/{{- range}}" actions append one "name=value" entry to
      # the container command for every element of .HyperParameters; the
      # leading "-" in "{{-" trims the preceding newline and indentation, so
      # the action lines themselves leave no blank lines in the rendered list.
      # The templated name is quoted so the rendered value is always parsed
      # as a string.
      rawTemplate: |-
        apiVersion: "kubeflow.org/v1beta1"
        kind: PyTorchJob
        metadata:
          name: "{{.WorkerID}}"
          namespace: kubeflow
        spec:
          pytorchReplicaSpecs:
            Master:
              replicas: 1
              restartPolicy: Never
              template:
                spec:
                  containers:
                    - name: pytorch
                      image: gcr.io/kubeflow-ci/pytorch-mnist-with-summary:0.4
                      imagePullPolicy: Always
                      command:
                        - "python"
                        - "/opt/pytorch_dist_mnist/mnist_with_summary.py"
                        {{- with .HyperParameters}}
                        {{- range .}}
                        - "{{.Name}}={{.Value}}"
                        {{- end}}
                        {{- end}}
  # Metrics collector definition: a Go template that renders one CronJob per
  # worker. The CronJob is named after the rendered .WorkerID, is scheduled
  # with the cron expression "*/1 * * * *", and runs ./metricscollector with
  # the study, trial, worker, worker-kind and namespace values passed as
  # flags.
  metricsCollectorSpec:
    # NOTE(review): presumably keeps the collector resources after
    # completion -- confirm against the StudyJob API reference.
    retain: true
    goTemplate:
      # Every templated scalar is quoted so the rendered value is always
      # parsed as a string, even if it happens to look like a number.
      # NOTE(review): the collector image carries no tag, so it resolves to
      # the registry's default tag -- consider pinning a version.
      rawTemplate: |-
        apiVersion: batch/v1beta1
        kind: CronJob
        metadata:
          name: "{{.WorkerID}}"
          namespace: kubeflow
        spec:
          schedule: "*/1 * * * *"
          successfulJobsHistoryLimit: 15
          failedJobsHistoryLimit: 15
          jobTemplate:
            spec:
              template:
                spec:
                  serviceAccountName: metrics-collector
                  containers:
                    - name: "{{.WorkerID}}"
                      image: johnugeorge/metrics-collector
                      args:
                        - "./metricscollector"
                        - "-s"
                        - "{{.StudyID}}"
                        - "-t"
                        - "{{.TrialID}}"
                        - "-w"
                        - "{{.WorkerID}}"
                        - "-k"
                        - "{{.WorkerKind}}"
                        - "-n"
                        - "{{.NameSpace}}"
                  restartPolicy: Never

  # Suggestion settings: the "random" algorithm with a requestNumber of 3.
  # NOTE(review): presumably requestNumber is how many suggestions are asked
  # for per request -- confirm against the StudyJob API reference.
  suggestionSpec:
    requestNumber: 3
    suggestionAlgorithm: random
# back to top