# Revision b3a80a856488550c2dd4b6231c7a40952ce29dbf authored by hougang liu on 13 December 2018, 23:05:39 UTC, committed by hougang liu on 13 December 2018, 23:05:39 UTC
# 1 parent 51af3d7
# pytorchjob-example.yaml
---
# StudyJob that tunes the --lr and --momentum flags of a PyTorch MNIST
# training script. Each worker is launched as a PyTorchJob rendered from
# workerSpec, and a CronJob rendered from metricsCollectorSpec runs the
# metrics collector for that worker.
apiVersion: "kubeflow.org/v1alpha1"
kind: StudyJob
metadata:
  namespace: kubeflow
  labels:
    controller-tools.k8s.io: "1.0"
  name: pytorchjob-example

spec:
  studyName: pytorchjob-example
  owner: crd

  # Objective: maximize the metric named "accuracy", with 0.99 as the goal.
  optimizationtype: maximize
  objectivevaluename: accuracy
  optimizationgoal: 0.99
  # NOTE(review): presumably the number of suggestion rounds to run --
  # confirm against the StudyJob controller version in use.
  requestcount: 4
  metricsnames:
    - accuracy

  # Search space. The bounds are quoted, i.e. passed as strings rather than
  # YAML floats. The names are the literal command-line flags: the worker
  # template below emits each hyperparameter as "<Name>=<Value>".
  parameterconfigs:
    - name: --lr
      parametertype: double
      feasible:
        min: "0.01"
        max: "0.05"
    - name: --momentum
      parametertype: double
      feasible:
        min: "0.5"
        max: "0.9"

  # Worker manifest. rawTemplate is a Go text/template held in a literal
  # block scalar ("|-" keeps inner newlines and strips the trailing one).
  # The "{{-" form trims the whitespace and newline before each action, so
  # the with/range/end control lines leave no blank lines in the rendered
  # output and each hyperparameter lands as one more item of `command`.
  # {{.WorkerID}} is quoted so the rendered name is always a YAML string,
  # even if a generated ID happens to look numeric.
  workerSpec:
    retain: true
    goTemplate:
      rawTemplate: |-
        apiVersion: "kubeflow.org/v1beta1"
        kind: PyTorchJob
        metadata:
          name: "{{.WorkerID}}"
          namespace: kubeflow
        spec:
          pytorchReplicaSpecs:
            Master:
              replicas: 1
              restartPolicy: Never
              template:
                spec:
                  containers:
                    - name: pytorch
                      image: gcr.io/kubeflow-ci/pytorch-mnist-with-summary:0.4
                      imagePullPolicy: Always
                      command:
                        - "python"
                        - "/opt/pytorch_dist_mnist/mnist_with_summary.py"
                        {{- with .HyperParameters}}
                        {{- range .}}
                        - "{{.Name}}={{.Value}}"
                        {{- end}}
                        {{- end}}

  # Metrics collector manifest: a CronJob scheduled every minute that runs
  # ./metricscollector with the study, trial and worker IDs, the worker kind
  # and the namespace passed via -s / -t / -w / -k / -n. Up to 15 finished
  # jobs of each outcome (successful, failed) are kept.
  # NOTE(review): the collector image has no tag, so it is not pinned to a
  # specific build -- consider pinning a tag or digest.
  metricsCollectorSpec:
    retain: true
    goTemplate:
      rawTemplate: |-
        apiVersion: batch/v1beta1
        kind: CronJob
        metadata:
          name: "{{.WorkerID}}"
          namespace: kubeflow
        spec:
          schedule: "*/1 * * * *"
          successfulJobsHistoryLimit: 15
          failedJobsHistoryLimit: 15
          jobTemplate:
            spec:
              template:
                spec:
                  serviceAccountName: metrics-collector
                  containers:
                    - name: "{{.WorkerID}}"
                      image: johnugeorge/metrics-collector
                      args:
                        - "./metricscollector"
                        - "-s"
                        - "{{.StudyID}}"
                        - "-t"
                        - "{{.TrialID}}"
                        - "-w"
                        - "{{.WorkerKind}}"
                        - "-n"
                        - "{{.NameSpace}}"
                  restartPolicy: Never

  # Suggestion service settings: the "random" algorithm.
  # NOTE(review): requestNumber is presumably the number of suggestions
  # requested per round -- confirm against the suggestion service.
  suggestionSpec:
    suggestionAlgorithm: "random"
    requestNumber: 3
# Computing file changes ...