Raw File
tfjob-example.yaml
apiVersion: "kubeflow.org/v1beta1"
kind: Experiment
metadata:
  namespace: kubeflow
  name: tfjob-example
spec:
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: accuracy_1
  algorithm:
    algorithmName: random
  metricsCollectorSpec:
    source:
      fileSystemPath:
        path: /train
        kind: Directory
    collector:
      kind: TensorFlowEvent
  parameters:
    - name: learning_rate
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.05"
    - name: batch_size
      parameterType: int
      feasibleSpace:
        min: "100"
        max: "200"
  trialTemplate:
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: learning_rate
      - name: batchSize
        description: Batch Size
        reference: batch_size
    trialSpec:
      apiVersion: "kubeflow.org/v1"
      kind: TFJob
      spec:
        tfReplicaSpecs:
          Worker:
            replicas: 2
            restartPolicy: OnFailure
            template:
              spec:
                containers:
                  - name: tensorflow
                    image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0
                    imagePullPolicy: Always
                    command:
                      - "python"
                      - "/var/tf_mnist/mnist_with_summaries.py"
                      - "--log_dir=/train/metrics"
                      - "--learning_rate=${trialParameters.learningRate}"
                      - "--batch_size=${trialParameters.batchSize}"
back to top