# tfjob-example.yaml
#
# Katib Experiment that tunes `learning_rate` and `batch_size` for an MNIST
# training run launched as a TFJob.
# NOTE(review): nesting below was reconstructed to follow the Katib v1beta1
# Experiment schema and the TFJob v1 schema — confirm against the cluster's
# installed CRDs before applying.
---
apiVersion: "kubeflow.org/v1beta1"
kind: Experiment
metadata:
  namespace: kubeflow
  name: tfjob-example
spec:
  # Trial budget for the experiment.
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3

  # Maximize the `accuracy_1` metric, with 0.99 as the goal value.
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: accuracy_1

  algorithm:
    algorithmName: random

  # Metrics are collected from TensorFlow event files under /train; the
  # training command below writes its summaries to /train/metrics.
  metricsCollectorSpec:
    source:
      fileSystemPath:
        path: /train
        kind: Directory
    collector:
      kind: TensorFlowEvent

  # Search space. Bounds are kept as quoted strings, exactly as authored.
  parameters:
    - name: learning_rate
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.05"
    - name: batch_size
      parameterType: int
      feasibleSpace:
        min: "100"
        max: "200"

  trialTemplate:
    # Each trial parameter references one entry of `parameters` above and is
    # consumed in `trialSpec` through a `${trialParameters.<name>}` placeholder.
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: learning_rate
      - name: batchSize
        description: Batch Size
        reference: batch_size

    # Job manifest used for every trial.
    trialSpec:
      apiVersion: "kubeflow.org/v1"
      kind: TFJob
      spec:
        tfReplicaSpecs:
          Worker:
            replicas: 2
            restartPolicy: OnFailure
            template:
              spec:
                containers:
                  - name: tensorflow
                    image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0
                    imagePullPolicy: Always
                    command:
                      - "python"
                      - "/var/tf_mnist/mnist_with_summaries.py"
                      # Must stay under metricsCollectorSpec's /train path.
                      - "--log_dir=/train/metrics"
                      - "--learning_rate=${trialParameters.learningRate}"
                      - "--batch_size=${trialParameters.batchSize}"