#ifndef HALIDE_FUNC_H
#define HALIDE_FUNC_H

/** \file
 *
 * Defines Func - the front-end handle on a halide function, and related classes.
 */

#include "Argument.h"
#include "Expr.h"
#include "JITModule.h"
#include "Module.h"
#include "Param.h"
#include "Pipeline.h"
#include "RDom.h"
#include "Target.h"
#include "Tuple.h"
#include "Var.h"

#include <map>
#include <utility>

namespace Halide {

class OutputImageParam;
class ParamMap;

/** A class that can represent Vars or RVars. Used for scheduling calls
 * such as reorder, which can accept a mix of the two. */
struct VarOrRVar {
    VarOrRVar(const std::string &n, bool r)
        : var(n), rvar(n), is_rvar(r) {
    }
    VarOrRVar(const Var &v)
        : var(v), is_rvar(false) {
    }
    VarOrRVar(const RVar &r)
        : rvar(r), is_rvar(true) {
    }
    VarOrRVar(const RDom &r)
        : rvar(RVar(r)), is_rvar(true) {
    }
    template<int N>
    VarOrRVar(const ImplicitVar<N> &u)
        : var(u), is_rvar(false) {
    }

    const std::string &name() const {
        if (is_rvar) {
            return rvar.name();
        } else {
            return var.name();
        }
    }

    Var var;
    RVar rvar;
    bool is_rvar;
};

class ImageParam;

namespace Internal {
class Function;
struct Split;
struct StorageDim;
}  // namespace Internal

/** A single definition of a Func. May be a pure or update definition. */
class Stage {
    /** Reference to the Function this stage (or definition) belongs to. */
    Internal::Function function;
    Internal::Definition definition;
    /** Indicate which stage the definition belongs to (0 for initial
     * definition, 1 for first update, etc.). */
    size_t stage_index;
    /** Pure Vars of the Function (from the init definition). */
    std::vector<Var> dim_vars;

    void set_dim_type(const VarOrRVar &var, Internal::ForType t);
    void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
    void split(const std::string &old, const std::string &outer, const std::string &inner,
               const Expr &factor, bool exact, TailStrategy tail);
    void remove(const std::string &var);
    Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);

    const std::vector<Internal::StorageDim> &storage_dims() const {
        return function.schedule().storage_dims();
    }

    Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);

public:
    Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
        : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
        internal_assert(definition.defined());

        dim_vars.reserve(function.args().size());
        for (const auto &arg : function.args()) {
            dim_vars.emplace_back(arg);
        }
        internal_assert(definition.args().size() == dim_vars.size());
    }

    /** Return the current StageSchedule associated with this Stage. For
     * introspection only: to modify schedule, use the Func interface. */
    const Internal::StageSchedule &get_schedule() const {
        return definition.schedule();
    }

    /** Return a string describing the current var list taking into
     * account all the splits, reorders, and tiles. */
    std::string dump_argument_list() const;

    /** Return the name of this stage, e.g. "f.update(2)" */
    std::string name() const;

    /** Calling rfactor() on an associative update definition of a Func will
     * split the update into an intermediate which computes the partial results
     * and replaces the current update definition with a new definition which
     * merges those partial results. If called on an init/pure definition, this
     * will throw an error. rfactor() will automatically infer the associative
     * reduction operator and its identity. If it can't prove the operation
     * is associative, or if it cannot find an identity for that operator, this
     * will throw an error. In addition, commutativity of the operator is
     * required if rfactor() is applied to an inner dimension but not to the
     * outer dimensions.
     *
     * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
     * The rvars not listed in 'preserved' are removed from the original Func and
     * are lifted to the intermediate Func. The remaining rvars (the ones in
     * 'preserved') are made pure in the intermediate Func. The intermediate Func's
     * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
     * applied to the original Func's update definition. The loop order of the
     * intermediate Func's update definition is the same as the original, although
     * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
     * intermediate Func's init definition from innermost to outermost is the args'
     * order of the original Func's init definition followed by the new pure Vars.
     *
     * The intermediate Func also inherits storage order from the original Func
     * with the new pure Vars added to the outermost.
     *
     * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
     \code
     f(x, y) = 0;
     f(x, y) += g(r.x, r.y);
     \endcode
     * into a pipeline like this:
     \code
     f_intm(x, y, u) = 0;
     f_intm(x, y, u) += g(r.x, u);

     f(x, y) = 0;
     f(x, y) += f_intm(x, y, r.y);
     \endcode
     *
     * This has a variety of uses. You can use it to split computation of an associative reduction:
     \code
     f(x, y) = 10;
     RDom r(0, 96);
     f(x, y) = max(f(x, y), g(x, y, r.x));
     f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
     f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
     \endcode
     *
     * which is equivalent to:
     \code
     parallel for u = 0 to 11:
       for y:
         for x:
           f_intm(x, y, u) = -inf
     parallel for x:
       for y:
         parallel for u = 0 to 11:
           for rxi = 0 to 7:
             f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
     for y:
       for x:
         f(x, y) = 10
     parallel for x:
       for y:
         for rxo = 0 to 11:
           f(x, y) = max(f(x, y), f_intm(x, y, rxo))
     \endcode
     *
     */
    // @{
    Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
    Func rfactor(const RVar &r, const Var &v);
    // @}

    /** Schedule the iteration over this stage to be fused with another
     * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
     * be computed AFTER 's' in the innermost fused dimension. There should not
     * be any dependencies between those two fused stages. If either of the
     * stages being fused is a stage of an extern Func, this will throw an error.
     *
     * Note that the two stages that are fused together should have exactly
     * the same schedule from the outermost to the innermost fused dimension, and
     * the stage we are calling compute_with on should not have specializations,
     * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
     *
     * Also, if a producer is desired to be computed at the fused loop level,
     * the function passed to the compute_at() needs to be the "parent". Consider
     * the following code:
     \code
     input(x, y) = x + y;
     f(x, y) = input(x, y);
     f(x, y) += 5;
     g(x, y) = x - y;
     g(x, y) += 10;
     f.compute_with(g, y);
     f.update().compute_with(g.update(), y);
     \endcode
     *
     * To compute 'input' at the fused loop level at dimension y, we specify
     * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
     * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
     * is computed). On the other hand, to compute 'input' at the innermost
     * dimension of 'f', we specify input.compute_at(f, x) instead of
     * input.compute_at(g, x) since the x dimension of 'f' is not fused
     * (only the y dimension is).
     *
     * Given the constraints, this has a variety of uses. Consider the
     * following code:
     \code
     f(x, y) = x + y;
     g(x, y) = x - y;
     h(x, y) = f(x, y) + g(x, y);
     f.compute_root();
     g.compute_root();
     f.split(x, xo, xi, 8);
     g.split(x, xo, xi, 8);
     g.compute_with(f, xo);
     \endcode
     *
     * This is equivalent to:
     \code
     for y:
       for xo:
         for xi:
           f(8*xo + xi, y) = (8*xo + xi) + y
         for xi:
           g(8*xo + xi, y) = (8*xo + xi) - y
     for y:
       for x:
         h(x, y) = f(x, y) + g(x, y)
     \endcode
     *
     * The sizes of the dimensions of the stages being fused together do not
     * have to match. Consider the following code where 'g' is half the size of 'f':
     \code
     Buffer<int> f_im(size, size), g_im(size/2, size/2);
     input(x, y) = x + y;
     f(x, y) = input(x, y);
     g(x, y) = input(2*x, 2*y);
     g.compute_with(f, y);
     input.compute_at(f, y);
     Pipeline({f, g}).realize({f_im, g_im});
     \endcode
     *
     * This is equivalent to:
     \code
     for y = 0 to size-1:
       for x = 0 to size-1:
         input(x, y) = x + y;
       for x = 0 to size-1:
         f(x, y) = input(x, y)
       for x = 0 to size/2-1:
         if (y <= size/2-1):
           g(x, y) = input(2*x, 2*y)
     \endcode
     *
     * 'align' specifies how the loop iteration of each dimension of the
     * two stages being fused should be aligned in the fused loop nests
     * (see LoopAlignStrategy for options). Consider the following loop nests:
     \code
     for z = f_min_z to f_max_z:
       for y = f_min_y to f_max_y:
         for x = f_min_x to f_max_x:
           f(x, y, z) = x + y + z
     for z = g_min_z to g_max_z:
       for y = g_min_y to g_max_y:
         for x = g_min_x to g_max_x:
           g(x, y, z) = x - y - z
     \endcode
     *
     * If no alignment strategy is specified, the following loop nest will be
     * generated:
     \code
     for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
       for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
         for x = f_min_x to f_max_x:
           if (f_min_z <= z <= f_max_z):
             if (f_min_y <= y <= f_max_y):
               f(x, y, z) = x + y + z
         for x = g_min_x to g_max_x:
           if (g_min_z <= z <= g_max_z):
             if (g_min_y <= y <= g_max_y):
               g(x, y, z) = x - y - z
     \endcode
     *
     * Instead, these alignment strategies:
     \code
     g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
     \endcode
     * will produce the following loop nest:
     \code
     f_loop_min_z = f_min_z
     f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
     for z = f_min_z to f_loop_max_z:
       f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
       f_loop_max_y = f_max_y
       for y = f_loop_min_y to f_loop_max_y:
         for x = f_min_x to f_max_x:
           if (f_loop_min_z <= z <= f_loop_max_z):
             if (f_loop_min_y <= y <= f_loop_max_y):
               f(x, y, z) = x + y + z
         for x = g_min_x to g_max_x:
           g_shift_z = g_min_z - f_loop_min_z
           g_shift_y = g_max_y - f_loop_max_y
           if (g_min_z <= (z + g_shift_z) <= g_max_z):
             if (g_min_y <= (y + g_shift_y) <= g_max_y):
               g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
     \endcode
     *
     * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
     * of 'g' at dimension z so that its starting value matches that of 'f'.
     * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
     * iteration of 'g' at dimension y so that its end value matches that of 'f'.
     */
    // @{
    Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
    Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
    Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
    Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
    // @}

    /** Scheduling calls that control how the domain of this stage is
     * traversed. See the documentation for Func for the meanings. */
    // @{

    Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
    Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
    Stage &serial(const VarOrRVar &var);
    Stage &parallel(const VarOrRVar &var);
    Stage &vectorize(const VarOrRVar &var);
    Stage &unroll(const VarOrRVar &var);
    Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
    Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
    Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
    Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
                const VarOrRVar &xo, const VarOrRVar &yo,
                const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
                TailStrategy tail = TailStrategy::Auto);
    Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
                const VarOrRVar &xi, const VarOrRVar &yi,
                const Expr &xfactor, const Expr &yfactor,
                TailStrategy tail = TailStrategy::Auto);
    Stage &tile(const std::vector<VarOrRVar> &previous,
                const std::vector<VarOrRVar> &outers,
                const std::vector<VarOrRVar> &inners,
                const std::vector<Expr> &factors,
                const std::vector<TailStrategy> &tails);
    Stage &tile(const std::vector<VarOrRVar> &previous,
                const std::vector<VarOrRVar> &outers,
                const std::vector<VarOrRVar> &inners,
                const std::vector<Expr> &factors,
                TailStrategy tail = TailStrategy::Auto);
    Stage &tile(const std::vector<VarOrRVar> &previous,
                const std::vector<VarOrRVar> &inners,
                const std::vector<Expr> &factors,
                TailStrategy tail = TailStrategy::Auto);
    Stage &reorder(const std::vector<VarOrRVar> &vars);

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
        std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
        return reorder(collected_args);
    }

    Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
    Stage specialize(const Expr &condition);
    void specialize_fail(const std::string &message);

    Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
               const VarOrRVar &thread_x, const VarOrRVar &thread_y,
               DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
               const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
               DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
                    const VarOrRVar &bx, const VarOrRVar &by,
                    const VarOrRVar &tx, const VarOrRVar &ty,
                    const Expr &x_size, const Expr &y_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
                    const VarOrRVar &tx, const VarOrRVar &ty,
                    const Expr &x_size, const Expr &y_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
                    const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
                    const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
                    const Expr &x_size, const Expr &y_size, const Expr &z_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
                    const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
                    const Expr &x_size, const Expr &y_size, const Expr &z_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &allow_race_conditions();
    Stage &atomic(bool override_associativity_test = false);

    Stage &hexagon(const VarOrRVar &x = Var::outermost());

    Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                    PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
    Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                    PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
    template<typename T>
    Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                    PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
        return prefetch(image.parameter(), at, from, std::move(offset), strategy);
    }
    // @}
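    /* The calls above can be applied to update stages as well as pure
     * definitions. A minimal sketch (the Func, Var, and RVar names are
     * illustrative):
     \code
     Func f;
     Var x, y;
     RDom r(0, 100);
     f(x, y) = 0;
     f(x, y) += r.x;
     f.update(0).reorder(y, x).parallel(y);
     \endcode
     */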

    /** Attempt to get the source file and line where this stage was
     * defined by parsing the process's own debug symbols. Returns an
     * empty string if no debug symbols were found or the debug
     * symbols were not understood. Works on OS X and Linux only. */
    std::string source_location() const;

    /** Assert that this stage has intentionally been given no schedule, and
     * suppress the warning about unscheduled update definitions that would
     * otherwise fire. This counts as a schedule, so calling this twice on the
     * same Stage will fail the assertion. */
    void unscheduled();
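    /* A minimal sketch (names are illustrative) of marking an update stage as
     * intentionally unscheduled to suppress the warning:
     \code
     Func f;
     Var x;
     f(x) = 0;
     f(x) += 1;
     f.vectorize(x, 8);
     f.update(0).unscheduled();
     \endcode
     */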
};

// For backwards compatibility, keep the ScheduleHandle name.
typedef Stage ScheduleHandle;

class FuncTupleElementRef;

/** A fragment of front-end syntax of the form f(x, y, z), where x, y,
 * z are Vars or Exprs. It could be the left-hand side of a definition or
 * an update definition, or it could be a call to a function. We don't know
 * until we see how this object gets used.
 */
class FuncRef {
    Internal::Function func;
    int implicit_placeholder_pos;
    int implicit_count;
    std::vector<Expr> args;
    std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;

    /** Helper for function update by Tuple. If the function does not
     * already have a pure definition, init_val will be used as RHS of
     * each tuple element in the initial function definition. */
    template<typename BinaryOp>
    Stage func_ref_update(const Tuple &e, int init_val);

    /** Helper for function update by Expr. If the function does not
     * already have a pure definition, init_val will be used as RHS in
     * the initial function definition. */
    template<typename BinaryOp>
    Stage func_ref_update(Expr e, int init_val);

public:
    FuncRef(const Internal::Function &, const std::vector<Expr> &,
            int placeholder_pos = -1, int count = 0);
    FuncRef(Internal::Function, const std::vector<Var> &,
            int placeholder_pos = -1, int count = 0);

    /** Use this as the left-hand-side of a definition or an update definition
     * (see \ref RDom).
     */
    Stage operator=(const Expr &);

    /** Use this as the left-hand-side of a definition or an update definition
     * for a Func with multiple outputs. */
    Stage operator=(const Tuple &);

    /** Define a stage that adds the given expression to this Func. If the
     * expression refers to some RDom, this performs a sum reduction of the
     * expression over the domain. If the function does not already have a
     * pure definition, this sets it to zero.
     */
    // @{
    Stage operator+=(Expr);
    Stage operator+=(const Tuple &);
    Stage operator+=(const FuncRef &);
    // @}
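    /* A minimal sum-reduction sketch using the operators above (g and the
     * reduction bounds are illustrative); the pure definition of f is
     * implicitly set to zero:
     \code
     Func f, g;
     Var x;
     RDom r(0, 10);
     g(x) = x;
     f(x) += g(x + r.x);
     \endcode
     */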

    /** Define a stage that adds the negative of the given expression to this
     * Func. If the expression refers to some RDom, this performs a sum reduction
     * of the negative of the expression over the domain. If the function does
     * not already have a pure definition, this sets it to zero.
     */
    // @{
    Stage operator-=(Expr);
    Stage operator-=(const Tuple &);
    Stage operator-=(const FuncRef &);
    // @}

    /** Define a stage that multiplies this Func by the given expression. If the
     * expression refers to some RDom, this performs a product reduction of the
     * expression over the domain. If the function does not already have a pure
     * definition, this sets it to 1.
     */
    // @{
    Stage operator*=(Expr);
    Stage operator*=(const Tuple &);
    Stage operator*=(const FuncRef &);
    // @}

    /** Define a stage that divides this Func by the given expression.
     * If the expression refers to some RDom, this performs a product
     * reduction of the inverse of the expression over the domain. If the
     * function does not already have a pure definition, this sets it to 1.
     */
    // @{
    Stage operator/=(Expr);
    Stage operator/=(const Tuple &);
    Stage operator/=(const FuncRef &);
    // @}

    /* Override the usual assignment operator, so that
     * f(x, y) = g(x, y) defines f.
     */
    Stage operator=(const FuncRef &);

    /** Use this as a call to the function, and not the left-hand-side
     * of a definition. Only works for single-output Funcs. */
    operator Expr() const;

    /** When a FuncRef refers to a function that provides multiple
     * outputs, you can access each output as an Expr using
     * operator[].
     */
    FuncTupleElementRef operator[](int) const;

    /** How many outputs does the function this refers to produce? */
    size_t size() const;

    /** What function is this calling? */
    Internal::Function function() const {
        return func;
    }
};

/** Explicit overloads of min and max for FuncRef. These exist to
 * disambiguate calls to min on FuncRefs when a user has pulled both
 * Halide::min and std::min into their namespace. */
// @{
inline Expr min(const FuncRef &a, const FuncRef &b) {
    return min(Expr(a), Expr(b));
}
inline Expr max(const FuncRef &a, const FuncRef &b) {
    return max(Expr(a), Expr(b));
}
// @}

/** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
 * z are Vars or Exprs. It could be the left-hand side of an update
 * definition, or it could be a call to a function. We don't know
 * until we see how this object gets used.
 */
class FuncTupleElementRef {
    FuncRef func_ref;
    std::vector<Expr> args;  // args to the function
    int idx;                 // Index to function outputs

    /** Helper function that generates a Tuple where the element at 'idx' is set
     * to 'e' and the rest are undef. */
    Tuple values_with_undefs(const Expr &e) const;

public:
    FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);

    /** Use this as the left-hand-side of an update definition of Tuple
     * component 'idx' of a Func (see \ref RDom). The function must
     * already have an initial definition.
     */
    Stage operator=(const Expr &e);

    /** Define a stage that adds the given expression to Tuple component 'idx'
     * of this Func. The other Tuple components are unchanged. If the expression
     * refers to some RDom, this performs a sum reduction of the expression over
     * the domain. The function must already have an initial definition.
     */
    Stage operator+=(const Expr &e);
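    /* A minimal sketch (the reduction is illustrative) of updating only Tuple
     * component 0 of a Func, leaving component 1 untouched:
     \code
     Func f;
     Var x;
     RDom r(0, 100);
     f(x) = Tuple(0, 0);
     f(r.x % 10)[0] += 1;
     \endcode
     */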

    /** Define a stage that adds the negative of the given expression to Tuple
     * component 'idx' of this Func. The other Tuple components are unchanged.
     * If the expression refers to some RDom, this performs a sum reduction of
     * the negative of the expression over the domain. The function must already
     * have an initial definition.
     */
    Stage operator-=(const Expr &e);

    /** Define a stage that multiplies Tuple component 'idx' of this Func by
     * the given expression. The other Tuple components are unchanged. If the
     * expression refers to some RDom, this performs a product reduction of
     * the expression over the domain. The function must already have an
     * initial definition.
     */
    Stage operator*=(const Expr &e);

    /** Define a stage that divides Tuple component 'idx' of this Func by
     * the given expression. The other Tuple components are unchanged.
     * If the expression refers to some RDom, this performs a product
     * reduction of the inverse of the expression over the domain. The function
     * must already have an initial definition.
     */
    Stage operator/=(const Expr &e);

    /* Override the usual assignment operator, so that
     * f(x, y)[index] = g(x, y) defines f.
     */
    Stage operator=(const FuncRef &e);

    /** Use this as a call to Tuple component 'idx' of a Func, and not the
     * left-hand-side of a definition. */
    operator Expr() const;

    /** What function is this calling? */
    Internal::Function function() const {
        return func_ref.function();
    }

    /** Return the index into the function's outputs. */
    int index() const {
        return idx;
    }
};

namespace Internal {
class IRMutator;
}  // namespace Internal

/** Helper class for identifying the purpose of an Expr passed to memoize.
 */
class EvictionKey {
protected:
    Expr key;
    friend class Func;

public:
    explicit EvictionKey(const Expr &expr = Expr())
        : key(expr) {
    }
};

/** A halide function. This class represents one stage in a Halide
 * pipeline, and is the unit by which we schedule things. By default
 * Funcs are aggressively inlined, so you are encouraged to make lots
 * of little functions, rather than storing things in Exprs. */
class Func {

    /** A handle on the internal halide function that this
     * represents */
    Internal::Function func;

    /** When you make a reference to this function with fewer
     * arguments than it has dimensions, the argument list is bulked
     * up with 'implicit' vars with canonical names. This lets you
     * pass around partially applied Halide functions. */
    // @{
    std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
    std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
    // @}

    /** The imaging pipeline that outputs this Func alone. */
    Pipeline pipeline_;

    /** Get the imaging pipeline that outputs this Func alone,
     * creating it (and freezing the Func) if necessary. */
    Pipeline pipeline();

    // Helper function for recursive reordering support
    Func &reorder_storage(const std::vector<Var> &dims, size_t start);

    void invalidate_cache();

public:
    /** Declare a new undefined function with the given name */
    explicit Func(const std::string &name);

    /** Declare a new undefined function with the given name.
     * The function will be constrained to represent Exprs of required_type.
     * If required_dims is not AnyDims, the function will be constrained to exactly
     * that many dimensions. */
    explicit Func(const Type &required_type, int required_dims, const std::string &name);

    /** Declare a new undefined function with the given name.
     * If required_types is not empty, the function will be constrained to represent
     * Tuples of the same arity and types. (If required_types is empty, there is no constraint.)
     * If required_dims is not AnyDims, the function will be constrained to exactly
     * that many dimensions. */
    explicit Func(const std::vector<Type> &required_types, int required_dims, const std::string &name);
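    /* A minimal sketch of the constrained constructors (the name "f" is
     * illustrative): f may only be defined as a single Int(32) value over
     * exactly two dimensions.
     \code
     Func f(Int(32), 2, "f");
     Var x, y;
     f(x, y) = x + y;
     \endcode
     */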

    /** Declare a new undefined function with an
     * automatically-generated unique name */
    Func();

    /** Declare a new function with an automatically-generated unique
     * name, and define it to return the given expression (which may
     * not contain free variables). */
    explicit Func(const Expr &e);

    /** Construct a new Func to wrap an existing, already-defined
     * Function object. */
    explicit Func(Internal::Function f);

    /** Construct a new Func to wrap a Buffer. */
    template<typename T, int Dims>
    HALIDE_NO_USER_CODE_INLINE explicit Func(Buffer<T, Dims> &im)
        : Func() {
        (*this)(_) = im(_);
    }

    /** Evaluate this function over some rectangular domain and return
     * the resulting buffer or buffers. Performs compilation if the
     * Func has not previously been realized and compile_jit has not
     * been called. If the final stage of the pipeline is on the GPU,
     * data is copied back to the host before being returned. The
     * returned Realization should probably be instantly converted to
     * a Buffer class of the appropriate type. That is, do this:
     *
     \code
     f(x) = sin(x);
     Buffer<float> im = f.realize(...);
     \endcode
     *
     * If your Func has multiple values, because you defined it using
     * a Tuple, then casting the result of a realize call to a buffer
     * or image will produce a run-time error. Instead you should do the
     * following:
     *
     \code
     f(x) = Tuple(x, sin(x));
     Realization r = f.realize(...);
     Buffer<int> im0 = r[0];
     Buffer<float> im1 = r[1];
     \endcode
     *
     * In Halide, the formal arguments of a computation are specified using
     * Param<T> and ImageParam objects in the expressions defining the
     * computation. The param_map argument to realize allows
     * specifying a set of per-call parameters to be used for a
     * specific computation. This mechanism is thread-safe, whereas the
     * globals used by Param<T> and ImageParam are not. Any parameters
     * that are not in the param_map are taken from the global values,
     * so those can continue to be used if they do not change
     * per-thread.
     *
     * One can explicitly construct a ParamMap and
     * use its set method to insert Parameter to scalar or Buffer
     * value mappings:
     *
     \code
     Param<int32_t> p(42);
     ImageParam img(Int(32), 2);
     f(x, y) = img(x, y) + p;

     Buffer<int32_t> arg_img(10, 10);
     <fill in arg_img...>
     ParamMap params;
     params.set(p, 17);
     params.set(img, arg_img);

     Target t = get_jit_target_from_environment();
     Buffer<int32_t> result = f.realize({10, 10}, t, params);
     \endcode
     *
     * Alternatively, an initializer list can be used
     * directly in the realize call to pass this information:
     *
     \code
     Param<int32_t> p(42);
     ImageParam img(Int(32), 2);
     f(x, y) = img(x, y) + p;

     Buffer<int32_t> arg_img(10, 10);
     <fill in arg_img...>

     Target t = get_jit_target_from_environment();
     Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
     \endcode
     *
     * If the Func cannot be realized into a buffer of the given size
     * due to scheduling constraints on scattering update definitions,
     * it will be realized into a larger buffer of the minimum size
     * possible, and a cropped view at the requested size will be
     * returned. It is thus not safe to assume the returned buffers
     * are contiguous in memory. This behavior can be disabled with
     * the NoBoundsQuery target flag, in which case an error about
     * writing out of bounds on the output buffer will trigger
     * instead.
     *
     */
    Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target(),
                        const ParamMap &param_map = ParamMap::empty_map());

    /** Same as above, but takes a custom user-provided context to be
     * passed to runtime functions. This can be used to pass state to
     * runtime overrides in a thread-safe manner. A nullptr context is
     * legal, and is equivalent to calling the variant of realize
     * that does not take a context. */
    Realization realize(JITUserContext *context,
                        std::vector<int32_t> sizes = {},
                        const Target &target = Target(),
                        const ParamMap &param_map = ParamMap::empty_map());

    /** Evaluate this function into an existing allocated buffer or
     * buffers. If the buffer is also one of the arguments to the
     * function, strange things may happen, as the pipeline isn't
     * necessarily safe to run in-place. If you pass multiple buffers,
     * they must have matching sizes. This form of realize does *not*
     * automatically copy data back from the GPU. */
    void realize(Pipeline::RealizationArg outputs, const Target &target = Target(),
                 const ParamMap &param_map = ParamMap::empty_map());

    /** Same as above, but takes a custom user-provided context to be
     * passed to runtime functions. This can be used to pass state to
     * runtime overrides in a thread-safe manner. A nullptr context is
     * legal, and is equivalent to calling the variant of realize
     * that does not take a context. */
    void realize(JITUserContext *context,
                 Pipeline::RealizationArg outputs,
                 const Target &target = Target(),
                 const ParamMap &param_map = ParamMap::empty_map());

    /** For a given size of output, or a given output buffer,
     * determine the bounds required of all unbound ImageParams
     * referenced. Communicates the result by allocating new buffers
     * of the appropriate size and binding them to the unbound
     * ImageParams.
     *
     * See the documentation for Func::realize regarding the
     * ParamMap. One difference is that input Buffer<> arguments
     * that are being inferred are specified as a pointer to
     * the Buffer<> in the ParamMap. E.g.
     *
     \code
     Param<int32_t> p(42);
     ImageParam img(Int(32), 2);
     f(x, y) = img(x, y) + p;

     Target t = get_jit_target_from_environment();
     Buffer<> in;
     f.infer_input_bounds({10, 10}, t, { { img, &in } });
     \endcode
     * On return, in will be an allocated buffer of the correct size
     * to evaluate f over a 10x10 region.
     */
    // @{
    void infer_input_bounds(const std::vector<int32_t> &sizes,
                            const Target &target = get_jit_target_from_environment(),
                            const ParamMap &param_map = ParamMap::empty_map());
    void infer_input_bounds(Pipeline::RealizationArg outputs,
                            const Target &target = get_jit_target_from_environment(),
                            const ParamMap &param_map = ParamMap::empty_map());
    // @}

    /** Versions of infer_input_bounds that take a custom user context
     * to pass to runtime functions. */
    // @{
    void infer_input_bounds(JITUserContext *context,
                            const std::vector<int32_t> &sizes,
                            const Target &target = get_jit_target_from_environment(),
                            const ParamMap &param_map = ParamMap::empty_map());
    void infer_input_bounds(JITUserContext *context,
                            Pipeline::RealizationArg outputs,
                            const Target &target = get_jit_target_from_environment(),
                            const ParamMap &param_map = ParamMap::empty_map());
    // @}
    /** Statically compile this function to llvm bitcode, with the
     * given filename (which should probably end in .bc), type
     * signature, and C function name (which defaults to the same name
     * as this halide function). */
    //@{
    void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
                            const Target &target = get_target_from_environment());
    void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
                            const Target &target = get_target_from_environment());
    // @}

    /** Statically compile this function to llvm assembly, with the
     * given filename (which should probably end in .ll), type
     * signature, and C function name (which defaults to the same name
     * as this halide function). */
    //@{
    void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
                                  const Target &target = get_target_from_environment());
    void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
                                  const Target &target = get_target_from_environment());
    // @}

    /** Statically compile this function to an object file, with the
     * given filename (which should probably end in .o or .obj), type
     * signature, and C function name (which defaults to the same name
     * as this halide function). You probably don't want to use this
     * directly; call compile_to_static_library or compile_to_file instead. */
    //@{
    void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
                           const Target &target = get_target_from_environment());
    void compile_to_object(const std::string &filename, const std::vector<Argument> &,
                           const Target &target = get_target_from_environment());
    // @}

    /** Emit a header file with the given filename for this
     * function. The header will define a function with the type
     * signature given by the second argument, and a name given by the
     * third. The name defaults to the same name as this halide
     * function. You don't actually have to have defined this function
     * yet to call this. You probably don't want to use this directly;
     * call compile_to_static_library or compile_to_file instead. */
    void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
                           const Target &target = get_target_from_environment());

    /** Statically compile this function to text assembly equivalent
     * to the object file generated by compile_to_object. This is
     * useful for checking what Halide is producing without having to
     * disassemble anything, or if you need to feed the assembly into
     * some custom toolchain to produce an object file (e.g. iOS) */
    //@{
    void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
                             const Target &target = get_target_from_environment());
    void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
                             const Target &target = get_target_from_environment());
    // @}

    /** Statically compile this function to C source code. This is
     * useful for providing fallback code paths that will compile on
     * many platforms. Vectorization will fail, and parallelization
     * will produce serial code. */
    void compile_to_c(const std::string &filename,
                      const std::vector<Argument> &,
                      const std::string &fn_name = "",
                      const Target &target = get_target_from_environment());

    /** Write out an internal representation of lowered code. Useful
     * for analyzing and debugging scheduling. Can emit html or plain
     * text. */
    void compile_to_lowered_stmt(const std::string &filename,
                                 const std::vector<Argument> &args,
                                 StmtOutputFormat fmt = Text,
                                 const Target &target = get_target_from_environment());

    /** Write out the loop nests specified by the schedule for this
     * Function. Helpful for understanding what a schedule is
     * doing. */
    void print_loop_nest();

    /** Compile to object file and header pair, with the given
     * arguments. The name defaults to the same name as this halide
     * function.
     */
    void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
                         const std::string &fn_name = "",
                         const Target &target = get_target_from_environment());

    /** Compile to static-library file and header pair, with the given
     * arguments. The name defaults to the same name as this halide
     * function.
     */
    void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
                                   const std::string &fn_name = "",
                                   const Target &target = get_target_from_environment());
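    /* A minimal AOT sketch (the ImageParam and the output name "brighter" are
     * illustrative):
     \code
     ImageParam input(UInt(8), 2);
     Func f("brighter");
     Var x, y;
     f(x, y) = input(x, y) + 1;
     f.compile_to_static_library("brighter", {input}, "brighter");
     \endcode
     */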

    /** Compile to static-library file and header pair once for each target;
     * each resulting function will be considered (in order) via halide_can_use_target_features()
     * at runtime, with the first appropriate match being selected for subsequent use.
     * This is typically useful for specializations that may vary unpredictably by machine
     * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
     * All targets must have identical arch-os-bits.
     */
    void compile_to_multitarget_static_library(const std::string &filename_prefix,
                                               const std::vector<Argument> &args,
                                               const std::vector<Target> &targets);

    /** Like compile_to_multitarget_static_library(), except that the object files
     * are all output as object files (rather than bundled into a static library).
     *
     * `suffixes` is an optional list of strings to use as the suffix for each object
     * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
     * will be used for each suffix.)
     *
     * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
     * will be generated with the filename `${filename_prefix}_wrapper.o`
     *
     * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
     * will be generated with the filename `${filename_prefix}_runtime.o`
     */
    void compile_to_multitarget_object_files(const std::string &filename_prefix,
                                             const std::vector<Argument> &args,
                                             const std::vector<Target> &targets,
                                             const std::vector<std::string> &suffixes);

    /** Store an internal representation of lowered code as a self-contained
     * Module suitable for further compilation. */
    Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
                             const Target &target = get_target_from_environment());

    /** Compile and generate multiple target files with a single call.
     * Deduces target files based on filenames specified in the
     * output_files map.
     */
    void compile_to(const std::map<OutputFileType, std::string> &output_files,
                    const std::vector<Argument> &args,
                    const std::string &fn_name,
                    const Target &target = get_target_from_environment());
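    /* A minimal sketch of emitting an object file and a C header in one call
     * (filenames are illustrative, and 'args' is assumed to be the pipeline's
     * std::vector<Argument>):
     \code
     f.compile_to({{OutputFileType::object, "f.o"},
                   {OutputFileType::c_header, "f.h"}},
                  args, "f");
     \endcode
     */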

    /** Eagerly jit compile the function to machine code. This
     * normally happens on the first call to realize. If you're
     * running your halide pipeline inside time-sensitive code and
     * wish to avoid including the time taken to compile a pipeline,
     * then you can call this ahead of time. The default is to use the Target
     * returned from Halide::get_jit_target_from_environment().
     */
    void compile_jit(const Target &target = get_jit_target_from_environment());

    /** Get a struct containing the currently set custom functions
     * used by JIT. This can be mutated. Changes will take effect the
     * next time this Func is realized. */
    JITHandlers &jit_handlers();

    /** Eagerly jit compile the function to machine code and return a callable
     * struct that behaves like a function pointer. The calling convention
     * will exactly match that of an AOT-compiled version of this Func
     * with the same Argument list.
     */
    Callable compile_to_callable(const std::vector<Argument> &args,
                                 const Target &target = get_jit_target_from_environment());
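    /* A minimal JIT sketch (the Param and buffer size are illustrative); the
     * Callable is invoked with arguments in the order of the Argument list,
     * followed by the output buffer:
     \code
     Param<int> offset;
     Func f;
     Var x;
     f(x) = x + offset;
     Callable c = f.compile_to_callable({offset});
     Buffer<int> out(16);
     c(42, out);
     \endcode
     */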

    /** Add a custom pass to be used during lowering. It is run after
     * all other lowering passes. Can be used to verify properties of
     * the lowered Stmt, instrument it with extra code, or otherwise
     * modify it. The Func takes ownership of the pass, and will call
     * delete on it when the Func goes out of scope. So don't pass a
     * stack object, or share pass instances between multiple
     * Funcs. */
    template<typename T>
    void add_custom_lowering_pass(T *pass) {
        // Template instantiate a custom deleter for this type, then
        // wrap in a lambda. The custom deleter lives in user code, so
        // that deletion is on the same heap as construction (I hate Windows).
        add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
    }
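    /* A minimal sketch (the class name and the Func f are illustrative) of a
     * custom pass that leaves the Stmt unchanged; real passes override
     * IRMutator's visit methods to inspect or rewrite the lowered code:
     \code
     class MyPass : public Internal::IRMutator {
         // Override visit() methods here to inspect or rewrite the Stmt.
     };
     f.add_custom_lowering_pass(new MyPass);  // the Func takes ownership
     \endcode
     */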

    /** Add a custom pass to be used during lowering, with the
     * function that will be called to delete it also passed in. Set
     * it to nullptr if you wish to retain ownership of the object. */
    void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);

    /** Remove all previously-set custom lowering passes */
    void clear_custom_lowering_passes();

    /** Get the custom lowering passes. */
    const std::vector<CustomLoweringPass> &custom_lowering_passes();

    /** When this function is compiled, include code that dumps its
     * values to a file after it is realized, for the purpose of
     * debugging.
     *
     * If filename ends in ".tif" or ".tiff" (case insensitive), the file
     * is in TIFF format and can be read by standard tools. Otherwise, the
     * file format is as follows:
     *
     * All data is in the byte-order of the target platform.  First, a
     * 20-byte header containing four 32-bit ints, giving the extents
     * of the first four dimensions.  Dimensions beyond four are
     * folded into the fourth.  Then, a fifth 32-bit int giving the
     * data type of the function. The typecodes are given by: float =
     * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
     * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
     * data follows the header, as a densely packed array of the given
     * size and the given type. If given the extension .tmp, this file
     * format can be natively read by the program ImageStack. */
    void debug_to_file(const std::string &filename);
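    /* A minimal usage sketch (the filename is illustrative):
     \code
     Func f("f");
     Var x, y;
     f(x, y) = x + y;
     f.debug_to_file("f.tmp");
     f.realize({8, 8});
     \endcode
     */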

    /** The name of this function, either given during construction,
     * or automatically generated. */
    const std::string &name() const;

    /** Get the pure arguments. */
    std::vector<Var> args() const;

    /** The right-hand-side value of the pure definition of this
     * function. Causes an error if there's no pure definition, or if
     * the function is defined to return multiple values. */
    Expr value() const;

    /** The values returned by this function. An error if the function
     * has not been defined. Returns a Tuple with one element for
     * functions defined to return a single value. */
    Tuple values() const;

    /** Does this function have at least a pure definition? */
    bool defined() const;

    /** Get the left-hand-side of the update definition. An empty
     * vector if there's no update definition. If there are
     * multiple update definitions for this function, use the
     * argument to select which one you want. */
    const std::vector<Expr> &update_args(int idx = 0) const;

    /** Get the right-hand-side of an update definition. An error if
     * there's no update definition. If there are multiple
     * update definitions for this function, use the argument to
     * select which one you want. */
    Expr update_value(int idx = 0) const;

    /** Get the right-hand-side of an update definition for
     * functions that return multiple values. An error if there's no
     * update definition. Returns a Tuple with one element for
     * functions that return a single value. */
    Tuple update_values(int idx = 0) const;

    /** Get the RVars of the reduction domain for an update definition, if there is
     * one. */
    std::vector<RVar> rvars(int idx = 0) const;

    /** Does this function have at least one update definition? */
    bool has_update_definition() const;

    /** How many update definitions does this function have? */
    int num_update_definitions() const;

    /** Is this function an external stage? That is, was it defined
     * using define_extern? */
    bool is_extern() const;

    /** Add an extern definition for this Func. This lets you define a
     * Func that represents an external pipeline stage. You can, for
     * example, use it to wrap a call to an extern library such as
     * fftw. */
    // @{
    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &params, Type t,
                       int dimensionality,
                       NameMangling mangling = NameMangling::Default,
                       DeviceAPI device_api = DeviceAPI::Host) {
        define_extern(function_name, params, t,
                      Internal::make_argument_list(dimensionality), mangling,
                      device_api);
    }

    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &params,
                       const std::vector<Type> &types, int dimensionality,
                       NameMangling mangling) {
        define_extern(function_name, params, types,
                      Internal::make_argument_list(dimensionality), mangling);
    }

    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &params,
                       const std::vector<Type> &types, int dimensionality,
                       NameMangling mangling = NameMangling::Default,
                       DeviceAPI device_api = DeviceAPI::Host) {
        define_extern(function_name, params, types,
                      Internal::make_argument_list(dimensionality), mangling,
                      device_api);
    }

    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &params, Type t,
                       const std::vector<Var> &arguments,
                       NameMangling mangling = NameMangling::Default,
                       DeviceAPI device_api = DeviceAPI::Host) {
        define_extern(function_name, params, std::vector<Type>{t}, arguments,
                      mangling, device_api);
    }

    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &params,
                       const std::vector<Type> &types,
                       const std::vector<Var> &arguments,
                       NameMangling mangling = NameMangling::Default,
                       DeviceAPI device_api = DeviceAPI::Host);
    // @}
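    /* A minimal sketch (the symbol name "my_extern_stage" is illustrative; the
     * corresponding C function must follow the extern stage calling convention,
     * consuming and producing halide_buffer_t pointers):
     \code
     Func input, output;
     Var x;
     input(x) = x;
     input.compute_root();
     output.define_extern("my_extern_stage", {input}, Int(32), 1);
     \endcode
     */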

    /** Get the type(s) of the outputs of this Func.
     *
     * It is not legal to call type() unless the Func returns a single (non-Tuple) value.
     *
     * If the Func isn't yet defined, and was not specified with required types,
     * a runtime error will occur.
     *
     * If the Func isn't yet defined, but *was* specified with required types,
     * the requirements will be returned. */
    // @{
    const Type &type() const;
    const std::vector<Type> &types() const;
    // @}

    HALIDE_ATTRIBUTE_DEPRECATED("Func::output_type() is deprecated; use Func::type() instead.")
    const Type &output_type() const {
        return type();
    }
    HALIDE_ATTRIBUTE_DEPRECATED("Func::output_types() is deprecated; use Func::types() instead.")
    const std::vector<Type> &output_types() const {
        return types();
    }

    /** Get the number of outputs of this Func. Corresponds to the
     * size of the Tuple this Func was defined to return.
     * If the Func isn't yet defined, but was specified with required types,
     * the number of outputs specified in the requirements will be returned. */
    int outputs() const;

    /** Get the name of the extern function called for an extern
     * definition. */
    const std::string &extern_function_name() const;

    /** The dimensionality (number of arguments) of this function.
     * If the Func isn't yet defined, but was specified with required dimensionality,
     * the dimensionality specified in the requirements will be returned. */
    int dimensions() const;

    /** Construct either the left-hand-side of a definition, or a call
     * to a function that happens to only contain vars as
     * arguments. If the function has already been defined, and fewer
     * arguments are given than the function has dimensions, then
     * enough implicit vars are added to the end of the argument list
     * to make up the difference (see \ref Var::implicit) */
    // @{
    FuncRef operator()(std::vector<Var>) const;

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
    operator()(Args &&...args) const {
        std::vector<Var> collected_args{std::forward<Args>(args)...};
        return this->operator()(collected_args);
    }
    // @}
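    /* A minimal sketch of implicit arguments: the placeholder _ expands to
     * however many implicit Vars are needed, so g below takes on the same
     * dimensionality as f:
     \code
     Func f, g;
     Var x, y;
     f(x, y) = x + y;
     g(_) = f(_) * 2;
     \endcode
     */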

    /** Either calls to the function, or the left-hand-side of
     * an update definition (see \ref RDom). If the function has
     * already been defined, and fewer arguments are given than the
     * function has dimensions, then enough implicit vars are added to
     * the end of the argument list to make up the difference. (see
     * \ref Var::implicit)*/
    // @{
    FuncRef operator()(std::vector<Expr>) const;

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
    operator()(const Expr &x, Args &&...args) const {
        std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
        return (*this)(collected_args);
    }
    // @}

    /** Creates and returns a new identity Func that wraps this Func. During
     * compilation, Halide replaces all calls to this Func made by 'f'
     * with calls to the wrapper. If this Func is already wrapped for
     * use in 'f', this will return the existing wrapper.
     *
     * For example, g.in(f) would rewrite a pipeline like this:
     \code
     g(x, y) = ...
     f(x, y) = ... g(x, y) ...
     \endcode
     * into a pipeline like this:
     \code
     g(x, y) = ...
     g_wrap(x, y) = g(x, y)
     f(x, y) = ... g_wrap(x, y) ...
     \endcode
     *
     * This has a variety of uses. You can use it to schedule this
     * Func differently in the different places it is used:
     \code
     g(x, y) = ...
     f1(x, y) = ... g(x, y) ...
     f2(x, y) = ... g(x, y) ...
     g.in(f1).compute_at(f1, y).vectorize(x, 8);
     g.in(f2).compute_at(f2, x).unroll(x);
     \endcode
     *
     * You can also use it to stage loads from this Func via some
     * intermediate buffer (perhaps on the stack as in
     * test/performance/block_transpose.cpp, or in shared GPU memory
     * as in test/performance/wrap.cpp). In this case we compute the
     * wrapper at tiles of the consuming Funcs like so:
     \code
     g.compute_root()...
     g.in(f).compute_at(f, tiles)...
     \endcode
     *
     * Func::in() can also be used to compute pieces of a Func into a
     * smaller scratch buffer (perhaps on the GPU) and then copy them
     * into a larger output buffer one tile at a time. See
     * apps/interpolate/interpolate.cpp for an example of this. In
     * this case we compute the Func at tiles of its own wrapper:
     \code
     f.in(g).compute_root().gpu_tile(...)...
     f.compute_at(f.in(g), tiles)...
     \endcode
     *
     * A similar use of Func::in() is to wrap a Func that has multiple update
     * stages in a pure wrapper. The following code:
     \code
     f(x, y) = x + y;
     f(x, y) += 5;
     g(x, y) = f(x, y);
     f.compute_root();
     \endcode
     *
     * Is equivalent to:
     \code
     for y:
       for x:
         f(x, y) = x + y;
     for y:
       for x:
         f(x, y) += 5
     for y:
       for x:
         g(x, y) = f(x, y)
     \endcode
     * Using Func::in(), we can write:
     \code
     f(x, y) = x + y;
     f(x, y) += 5;
     g(x, y) = f(x, y);
     f.in(g).compute_root();
     \endcode
     * which instead produces:
     \code
     for y:
       for x:
         f(x, y) = x + y;
         f(x, y) += 5
         f_wrap(x, y) = f(x, y)
     for y:
       for x:
         g(x, y) = f_wrap(x, y)
     \endcode
     */
    Func in(const Func &f);

    /** Create and return an identity wrapper shared by all the Funcs in
     * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
     * this will throw an error. */
    Func in(const std::vector<Func> &fs);

    /** Create and return a global identity wrapper, which wraps all calls to
     * this Func by any other Func. If a global wrapper already exists,
     * returns it. The global identity wrapper is only used by callers for
     * which no custom wrapper has been specified.
     */
    Func in();

    /** Similar to \ref Func::in; however, instead of replacing the call to
     * this Func with an identity Func that refers to it, this replaces the
     * call with a clone of this Func.
     *
     * For example, f.clone_in(g) would rewrite a pipeline like this:
     \code
     f(x, y) = x + y;
     g(x, y) = f(x, y) + 2;
     h(x, y) = f(x, y) - 3;
     \endcode
     * into a pipeline like this:
     \code
     f(x, y) = x + y;
     f_clone(x, y) = x + y;
     g(x, y) = f_clone(x, y) + 2;
     h(x, y) = f(x, y) - 3;
     \endcode
     *
     */
    //@{
    Func clone_in(const Func &f);
    Func clone_in(const std::vector<Func> &fs);
    //@}

    /** Declare that this function should be implemented by a call to
     * halide_buffer_copy with the given target device API. Asserts
     * that the Func has a pure definition which is a simple call to a
     * single input, and no update definitions. The wrapper Funcs
     * returned by in() are suitable candidates. Consumes all pure
     * variables, and rewrites the Func to have an extern definition
     * that calls halide_buffer_copy. */
    Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);

    /** Declare that this function should be implemented by a call to
     * halide_buffer_copy with a NULL target device API. Equivalent to
     * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
     * pure definition which is a simple call to a single input, and
     * no update definitions. The wrapper Funcs returned by in() are
     * suitable candidates. Consumes all pure variables, and rewrites
     * the Func to have an extern definition that calls
     * halide_buffer_copy.
     *
     * Note that if the source Func is already valid in host memory,
     * this compiles to code that does the minimum number of calls to
     * memcpy.
     */
    Func copy_to_host();

    /** Split a dimension into inner and outer subdimensions with the
     * given names, where the inner dimension iterates from 0 to
     * factor-1. The inner and outer subdimensions can then be dealt
     * with using the other scheduling calls. It's ok to reuse the old
     * variable name as either the inner or outer variable. The final
     * argument specifies how the tail should be handled if the split
     * factor does not provably divide the extent. */
    Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
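    /* Illustrative sketch (not part of the header), assuming a hypothetical
     * Func f with pure var x: split x into an outer loop xo and an inner
     * loop xi of extent 8.
     \code
     Func f;
     Var x, xo, xi;
     f(x) = x * 2;
     f.split(x, xo, xi, 8, TailStrategy::GuardWithIf);
     \endcode
     */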

    /** Join two dimensions into a single fused dimension. The fused
     * dimension covers the product of the extents of the inner and
     * outer dimensions given. */
    Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
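    /* Illustrative sketch (hypothetical Func f): fuse x (inner) and y (outer)
     * into a single loop xy, which can then be parallelized as one dimension.
     \code
     Func f;
     Var x, y, xy;
     f(x, y) = x + y;
     f.fuse(x, y, xy).parallel(xy);
     \endcode
     */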

    /** Mark a dimension to be traversed serially. This is the default. */
    Func &serial(const VarOrRVar &var);

    /** Mark a dimension to be traversed in parallel */
    Func &parallel(const VarOrRVar &var);

    /** Split a dimension by the given task_size, and then parallelize the
     * outer dimension. This creates parallel tasks that have size
     * task_size. After this call, var refers to the outer dimension of
     * the split. The inner dimension has a new anonymous name. If you
     * wish to mutate it, or schedule with respect to it, do the split
     * manually. */
    Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
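    /* Illustrative sketch (hypothetical Func f): parallelize y in chunks of
     * 16 rows per task. After this call, y refers to the outer (parallel)
     * dimension of the implicit split.
     \code
     Func f;
     Var x, y;
     f(x, y) = x + y;
     f.parallel(y, 16);
     \endcode
     */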

    /** Mark a dimension to be computed all-at-once as a single
     * vector. The dimension should have constant extent -
     * e.g. because it is the inner dimension following a split by a
     * constant factor. For most uses of vectorize you want the two
     * argument form. The variable to be vectorized should be the
     * innermost one. */
    Func &vectorize(const VarOrRVar &var);

    /** Mark a dimension to be completely unrolled. The dimension
     * should have constant extent - e.g. because it is the inner
     * dimension following a split by a constant factor. For most uses
     * of unroll you want the two-argument form. */
    Func &unroll(const VarOrRVar &var);

    /** Split a dimension by the given factor, then vectorize the
     * inner dimension. This is how you vectorize a loop of unknown
     * size. The variable to be vectorized should be the innermost
     * one. After this call, var refers to the outer dimension of the
     * split. 'factor' must be an integer. */
    Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);

    /** Split a dimension by the given factor, then unroll the inner
     * dimension. This is how you unroll a loop of unknown size by
     * some constant factor. After this call, var refers to the outer
     * dimension of the split. 'factor' must be an integer. */
    Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
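    /* Illustrative sketch (hypothetical Func f): vectorize x in chunks of 8
     * lanes and unroll y by a factor of 2, regardless of the actual extents.
     \code
     Func f;
     Var x, y;
     f(x, y) = x + y;
     f.vectorize(x, 8).unroll(y, 2);
     \endcode
     */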

    /** Statically declare that the range over which a function should
     * be evaluated is given by the second and third arguments. This
     * can let Halide perform some optimizations. E.g. if you know
     * there are going to be 4 color channels, you can completely
     * vectorize the color channel dimension without the overhead of
     * splitting it up. If bounds inference decides that it requires
     * more of this function than the bounds you have stated, a
     * runtime error will occur when you try to run your pipeline. */
    Func &bound(const Var &var, Expr min, Expr extent);
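    /* Illustrative sketch (hypothetical Func f): promise that the channel
     * dimension c has min 0 and extent 3, so it can be fully unrolled
     * without a tail case.
     \code
     Func f;
     Var x, y, c;
     f(x, y, c) = x + y + c;
     f.bound(c, 0, 3).unroll(c);
     \endcode
     */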

    /** Statically declare the range over which the function will be
     * evaluated in the general case. This provides a basis for the auto
     * scheduler to make trade-offs and scheduling decisions. The auto
     * generated schedules might break when the sizes of the dimensions are
     * very different from the estimates specified. These estimates are used
     * only by the auto scheduler if the function is a pipeline output. */
    Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);

    /** Set (min, extent) estimates for all dimensions in the Func
     * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
     * repeatedly, but slightly terser. The size of the estimates vector
     * must match the dimensionality of the Func. */
    Func &set_estimates(const Region &estimates);
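    /* Illustrative sketch (hypothetical output Func f): give the auto-scheduler
     * (min, extent) estimates for a 1920x1080, 3-channel output.
     \code
     Func f;
     Var x, y, c;
     f(x, y, c) = x + y + c;
     f.set_estimates({{0, 1920}, {0, 1080}, {0, 3}});
     \endcode
     */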

    /** Expand the region computed so that the min coordinate is
     * congruent to 'remainder' modulo 'modulus', and the extent is a
     * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
     * the min and extent realized to be even, and calling
     * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
     * to be even. The region computed always contains the region that
     * would have been computed without this directive, so no
     * assertions are injected.
     */
    Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
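    /* Illustrative sketch (hypothetical Func f): round the realized region of
     * x out so that its min and extent are both multiples of 16.
     \code
     Func f;
     Var x;
     f(x) = x;
     f.compute_root().align_bounds(x, 16);
     \endcode
     */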

    /** Expand the region computed so that the extent is a
     * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
     * the extent realized to be even. The region computed always contains the
     * region that would have been computed without this directive, so no
     * assertions are injected. (This is essentially equivalent to align_bounds(),
     * but always leaving the min untouched.)
     */
    Func &align_extent(const Var &var, Expr modulus);

    /** Bound the extent of a Func's realization, but not its
     * min. This means the dimension can be unrolled or vectorized
     * even when its min is not fixed (for example because it is
     * compute_at tiles of another Func). This can also be useful for
     * forcing a function's allocation to be a fixed size, which often
     * means it can go on the stack. */
    Func &bound_extent(const Var &var, Expr extent);

    /** Split two dimensions at once by the given factors, and then
     * reorder the resulting dimensions to be xi, yi, xo, yo from
     * innermost outwards. This gives a tiled traversal. */
    Func &tile(const VarOrRVar &x, const VarOrRVar &y,
               const VarOrRVar &xo, const VarOrRVar &yo,
               const VarOrRVar &xi, const VarOrRVar &yi,
               const Expr &xfactor, const Expr &yfactor,
               TailStrategy tail = TailStrategy::Auto);

    /** A shorter form of tile, which reuses the old variable names as
     * the new outer dimensions */
    Func &tile(const VarOrRVar &x, const VarOrRVar &y,
               const VarOrRVar &xi, const VarOrRVar &yi,
               const Expr &xfactor, const Expr &yfactor,
               TailStrategy tail = TailStrategy::Auto);
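    /* Illustrative sketch (hypothetical Func f): traverse f in 64x64 tiles.
     \code
     Func f;
     Var x, y, xo, yo, xi, yi;
     f(x, y) = x + y;
     f.tile(x, y, xo, yo, xi, yi, 64, 64);
     \endcode
     */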

    /** A more general form of tile, which defines tiles of any dimensionality. */
    Func &tile(const std::vector<VarOrRVar> &previous,
               const std::vector<VarOrRVar> &outers,
               const std::vector<VarOrRVar> &inners,
               const std::vector<Expr> &factors,
               const std::vector<TailStrategy> &tails);

    /** The generalized tile, with a single tail strategy to apply to all vars. */
    Func &tile(const std::vector<VarOrRVar> &previous,
               const std::vector<VarOrRVar> &outers,
               const std::vector<VarOrRVar> &inners,
               const std::vector<Expr> &factors,
               TailStrategy tail = TailStrategy::Auto);

    /** Generalized tiling, reusing the previous names as the outer names. */
    Func &tile(const std::vector<VarOrRVar> &previous,
               const std::vector<VarOrRVar> &inners,
               const std::vector<Expr> &factors,
               TailStrategy tail = TailStrategy::Auto);

    /** Reorder variables to have the given nesting order, from
     * innermost out */
    Func &reorder(const std::vector<VarOrRVar> &vars);

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
        std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
        return reorder(collected_args);
    }
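    /* Illustrative sketch (hypothetical Func f): make the channel dimension c
     * the innermost loop, with y outermost.
     \code
     Func f;
     Var x, y, c;
     f(x, y, c) = x + y + c;
     f.reorder(c, x, y);
     \endcode
     */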

    /** Rename a dimension. Equivalent to split with an inner size of one. */
    Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);

    /** Specify that race conditions are permitted for this Func,
     * which enables parallelizing over RVars even when Halide cannot
     * prove that it is safe to do so. Use this with great caution,
     * and only if you can prove to yourself that this is safe, as it
     * may result in a non-deterministic routine that returns
     * different values at different times or on different machines. */
    Func &allow_race_conditions();

    /** Issue atomic updates for this Func. This allows parallelization
     * on associative RVars. The function throws a compile error when
     * Halide fails to prove associativity. Use override_associativity_test
     * to disable the associativity test if you believe the function is
     * associative or the order of reduction variable execution does not
     * matter.
     * Halide compiles this into hardware atomic operations whenever possible,
     * and falls back to a mutex lock per storage element if it is impossible
     * to atomically update.
     * There are three possible outcomes of the compiled code:
     * atomic add, compare-and-swap loop, and mutex lock.
     * For example:
     \code
     hist(x) = 0;
     hist(im(r)) += 1;
     hist.compute_root();
     hist.update().atomic().parallel();
     \endcode
     * will be compiled to atomic add operations.
     *
     \code
     hist(x) = 0;
     hist(im(r)) = min(hist(im(r)) + 1, 100);
     hist.compute_root();
     hist.update().atomic().parallel();
     \endcode
     * will be compiled to compare-and-swap loops.
     *
     \code
     arg_max() = {0, im(0)};
     Expr old_index = arg_max()[0];
     Expr old_max   = arg_max()[1];
     Expr new_index = select(old_max < im(r), r, old_index);
     Expr new_max   = max(im(r), old_max);
     arg_max() = {new_index, new_max};
     arg_max.compute_root();
     arg_max.update().atomic().parallel();
     \endcode
     * will be compiled to updates guarded by a mutex lock,
     * since it is impossible to atomically update two different locations.
     *
     * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
     * Compiling to other backends results in a compile error.
     * If an operation is compiled into a mutex lock, and is vectorized or is
     * compiled to CUDA or OpenCL, it also results in a compile error,
     * since a per-element mutex lock on a vectorized operation leads to
     * deadlock.
     * Vectorization of predicated RVars (through rdom.where()) on the CPU
     * is not yet supported (see https://github.com/halide/Halide/issues/4298).
     * 8-bit and 16-bit atomics on the GPU are also not supported. */
    Func &atomic(bool override_associativity_test = false);

    /** Specialize a Func. This creates a special-case version of the
     * Func where the given condition is true. The most effective
     * conditions are those of the form param == value, and boolean
     * Params. Consider a simple example:
     \code
     f(x) = x + select(cond, 0, 1);
     f.compute_root();
     \endcode
     * This is equivalent to:
     \code
     for (int x = 0; x < width; x++) {
       f[x] = x + (cond ? 0 : 1);
     }
     \endcode
     * Adding the scheduling directive:
     \code
     f.specialize(cond)
     \endcode
     * makes it equivalent to:
     \code
     if (cond) {
       for (int x = 0; x < width; x++) {
         f[x] = x;
       }
     } else {
       for (int x = 0; x < width; x++) {
         f[x] = x + 1;
       }
     }
     \endcode
     * Note that the inner loops have been simplified. In the first
     * path Halide knows that cond is true, and in the second path
     * Halide knows that it is false.
     *
     * The specialized version gets its own schedule, which inherits
     * every directive made about the parent Func's schedule so far
     * except for its specializations. This method returns a handle to
     * the new schedule. If you wish to retrieve the specialized
     * sub-schedule again later, you can call this method with the
     * same condition. Consider the following example of scheduling
     * the specialized version:
     *
     \code
     f(x) = x;
     f.compute_root();
     f.specialize(width > 1).unroll(x, 2);
     \endcode
     * Assuming for simplicity that width is even, this is equivalent to:
     \code
     if (width > 1) {
       for (int x = 0; x < width/2; x++) {
         f[2*x] = 2*x;
         f[2*x + 1] = 2*x + 1;
       }
     } else {
       for (int x = 0; x < width; x++) {
         f[x] = x;
       }
     }
     \endcode
     * For this case, it may be better to schedule the un-specialized
     * case instead:
     \code
     f(x) = x;
     f.compute_root();
     f.specialize(width == 1); // Creates a copy of the schedule so far.
     f.unroll(x, 2); // Only applies to the unspecialized case.
     \endcode
     * This is equivalent to:
     \code
     if (width == 1) {
       f[0] = 0;
     } else {
       for (int x = 0; x < width/2; x++) {
         f[2*x] = 2*x;
         f[2*x + 1] = 2*x + 1;
       }
     }
     \endcode
     * This can be a good way to write a pipeline that splits,
     * vectorizes, or tiles, but can still handle small inputs.
     *
     * If a Func has several specializations, the first matching one
     * will be used, so the order in which you define specializations
     * is significant. For example:
     *
     \code
     f(x) = x + select(cond1, a, b) - select(cond2, c, d);
     f.specialize(cond1);
     f.specialize(cond2);
     \endcode
     * is equivalent to:
     \code
     if (cond1) {
       for (int x = 0; x < width; x++) {
         f[x] = x + a - (cond2 ? c : d);
       }
     } else if (cond2) {
       for (int x = 0; x < width; x++) {
         f[x] = x + b - c;
       }
     } else {
       for (int x = 0; x < width; x++) {
         f[x] = x + b - d;
       }
     }
     \endcode
     *
     * Specializations may in turn be specialized, which creates a
     * nested if statement in the generated code.
     *
     \code
     f(x) = x + select(cond1, a, b) - select(cond2, c, d);
     f.specialize(cond1).specialize(cond2);
     \endcode
     * This is equivalent to:
     \code
     if (cond1) {
       if (cond2) {
         for (int x = 0; x < width; x++) {
           f[x] = x + a - c;
         }
       } else {
         for (int x = 0; x < width; x++) {
           f[x] = x + a - d;
         }
       }
     } else {
       for (int x = 0; x < width; x++) {
         f[x] = x + b - (cond2 ? c : d);
       }
     }
     \endcode
     * To create a 4-way if statement that simplifies away all of the
     * ternary operators above, you could say:
     \code
     f.specialize(cond1).specialize(cond2);
     f.specialize(cond2);
     \endcode
     * or
     \code
     f.specialize(cond1 && cond2);
     f.specialize(cond1);
     f.specialize(cond2);
     \endcode
     *
     * Any prior Func which is compute_at some variable of this Func
     * gets separately included in all paths of the generated if
     * statement. The Var in the compute_at call must exist in all
     * paths, but it may have been generated via a different path of
     * splits, fuses, and renames. This can be used somewhat
     * creatively. Consider the following code:
     \code
     g(x, y) = 8*x;
     f(x, y) = g(x, y) + 1;
     f.compute_root().specialize(cond);
     Var g_loop;
     f.specialize(cond).rename(y, g_loop);
     f.rename(x, g_loop);
     g.compute_at(f, g_loop);
     \endcode
     * When cond is true, this is equivalent to g.compute_at(f,y).
     * When it is false, this is equivalent to g.compute_at(f,x).
     */
    Stage specialize(const Expr &condition);

    /** Add a specialization to a Func that always terminates execution
     * with a call to halide_error(). By itself, this is of limited use,
     * but can be useful to terminate chains of specialize() calls where
     * no "default" case is expected (thus avoiding unnecessary code generation).
     *
     * For instance, say we want to optimize a pipeline to process images
     * in planar and interleaved format; we might typically do something like:
     \code
     ImageParam im(UInt(8), 3);
     Func f = do_something_with(im);
     f.specialize(im.dim(0).stride() == 1).vectorize(x, 8);  // planar
     f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c);  // interleaved
     \endcode
     * This code will vectorize along rows for the planar case, and across pixel
     * components for the interleaved case... but there is an implicit "else"
     * for the unhandled cases, which generates unoptimized code. If we never
     * anticipate passing any other sort of images to this, we can streamline
     * our code by adding specialize_fail():
     \code
     ImageParam im(UInt(8), 3);
     Func f = do_something_with(im);
     f.specialize(im.dim(0).stride() == 1).vectorize(x, 8);  // planar
     f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c);  // interleaved
     f.specialize_fail("Unhandled image format");
     \endcode
     * Conceptually, this produces code like:
     \code
     if (im.dim(0).stride() == 1) {
        do_something_planar();
     } else if (im.dim(2).stride() == 1) {
        do_something_interleaved();
     } else {
        halide_error("Unhandled image format");
     }
     \endcode
     *
     * Note that calling specialize_fail() terminates the specialization chain
     * for a given Func; you cannot create new specializations for the Func
     * afterwards (though you can retrieve handles to previous specializations).
     */
    void specialize_fail(const std::string &message);

    /** Tell Halide that the following dimensions correspond to GPU
     * thread indices. This is useful if you compute a producer
     * function within the block indices of a consumer function, and
     * want to control how that function's dimensions map to GPU
     * threads. If the selected target is not an appropriate GPU, this
     * just marks those dimensions as parallel. */
    // @{
    Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
    // @}

    /** The given dimension corresponds to the lanes in a GPU
     * warp. GPU warp lanes are distinguished from GPU threads by the
     * fact that all warp lanes run together in lockstep, which
     * permits lightweight communication of data from one lane to
     * another. */
    Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);

    /** Tell Halide to run this stage using a single gpu thread and
     * block. This is not an efficient use of your GPU, but it can be
     * useful to avoid copy-back for intermediate update stages that
     * touch a very small part of your Func. */
    Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);

    /** Tell Halide that the following dimensions correspond to GPU
     * block indices. This is useful for scheduling stages that will
     * run serially within each GPU block. If the selected target is
     * not ptx, this just marks those dimensions as parallel. */
    // @{
    Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
    // @}

    /** Tell Halide that the following dimensions correspond to GPU
     * block indices and thread indices. If the selected target is not
     * ptx, these just mark the given dimensions as parallel. The
     * dimensions are consumed by this call, so do all other
     * unrolling, reordering, etc first. */
    // @{
    Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
              const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
              const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
    // @}
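    /* Illustrative sketch (hypothetical Func f): tile into 16x16 tiles and
     * map the tile indices to GPU blocks and the within-tile coordinates to
     * GPU threads.
     \code
     Func f;
     Var x, y, bx, by, tx, ty;
     f(x, y) = x + y;
     f.compute_root()
      .tile(x, y, bx, by, tx, ty, 16, 16)
      .gpu(bx, by, tx, ty);
     \endcode
     */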

    /** Short-hand for tiling a domain and mapping the tile indices
     * to GPU block indices and the coordinates within each tile to
     * GPU thread indices. Consumes the variables given, so do all
     * other scheduling first. */
    // @{
    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);

    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
                   const VarOrRVar &bx, const VarOrRVar &by,
                   const VarOrRVar &tx, const VarOrRVar &ty,
                   const Expr &x_size, const Expr &y_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);

    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
                   const VarOrRVar &tx, const VarOrRVar &ty,
                   const Expr &x_size, const Expr &y_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);

    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
                   const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
                   const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
                   const Expr &x_size, const Expr &y_size, const Expr &z_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
                   const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
                   const Expr &x_size, const Expr &y_size, const Expr &z_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);
    // @}
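    /* Illustrative sketch (hypothetical Func f): the same effect as the
     * manual tile-then-gpu schedule sketched above, in a single call.
     \code
     Func f;
     Var x, y, bx, by, tx, ty;
     f(x, y) = x + y;
     f.compute_root().gpu_tile(x, y, bx, by, tx, ty, 16, 16);
     \endcode
     */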

    /** Schedule for execution on Hexagon. When a loop is marked with
     * Hexagon, that loop is executed on a Hexagon DSP. */
    Func &hexagon(const VarOrRVar &x = Var::outermost());

    /** Prefetch data written to or read from a Func or an ImageParam by a
     * subsequent loop iteration, at an optionally specified iteration offset.
     * You may specify different vars for the location of the prefetch() instruction
     * vs. the location that is being prefetched:
     *
     * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
     * - the second var specified, 'from', determines the var used to find the bounds to prefetch
     *   (in conjunction with 'offset')
     *
     * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at.'
     * Note that the value for 'offset' applies only to 'from', not 'at'.
     *
     * The final argument specifies how prefetch of region outside bounds
     * should be handled.
     *
     * For example, consider this pipeline:
     \code
     Func f, g, h;
     Var x, y, z;
     f(x, y) = x + y;
     g(x, y) = 2 * f(x, y);
     h(x, y) = 3 * f(x, y);
     \endcode
     *
     * The following schedule:
     \code
     f.compute_root();
     g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
     h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
     \endcode
     *
     * will inject prefetch call at the innermost loop of 'g' and 'h' and generate
     * the following loop nest:
     \code
     for y = ...
       for x = ...
         f(x, y) = x + y
     for y = ..
       for x = ...
         prefetch(&f[x + 2, y], 1, 16);
         g(x, y) = 2 * f(x, y)
     for y = ..
       for x = ...
         prefetch(&f[x, y + 2], 1, 16);
         h(x, y) = 3 * f(x, y)
     \endcode
     *
     * Note that the 'from' nesting level need not be adjacent to 'at':
     \code
     Func f, g;
     Var x, y, z, w;
     f(x, y, z, w) = x + y + z + w;
     g(x, y, z, w) = 2 * f(x, y, z, w);
     \endcode
     *
     * The following schedule:
     \code
     f.compute_root();
     g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
     \endcode
     *
     * will produce code that prefetches a tile of data:
     \code
     for w = ...
       for z = ...
         for y = ...
           for x = ...
             f(x, y, z, w) = x + y + z + w
     for w = ...
       for z = ...
         for y = ...
           for x0 = ...
              prefetch(&f[x0, y, z, w + 2], 1, 16);
           for x = ...
             g(x, y, z, w) = 2 * f(x, y, z, w)
     \endcode
     *
     * Note that calling prefetch() with the same var for both 'at' and 'from'
     * is equivalent to calling prefetch() with that var.
     */
    // @{
    Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                   PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
    Func &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                   PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
    template<typename T>
    Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                   PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
        return prefetch(image.parameter(), at, from, std::move(offset), strategy);
    }
    // @}

    /** Specify how the storage for the function is laid out. These
     * calls let you specify the nesting order of the dimensions. For
     * example, foo.reorder_storage(y, x) tells Halide to use
     * column-major storage for any realizations of foo, without
     * changing how you refer to foo in the code. You may want to do
     * this if you intend to vectorize across y. When representing
     * color images, foo.reorder_storage(c, x, y) specifies packed
     * storage (red, green, and blue values adjacent in memory), and
     * foo.reorder_storage(x, y, c) specifies planar storage (entire
     * red, green, and blue images one after the other in memory).
     *
     * If you leave out some dimensions, those remain in the same
     * positions in the nesting order while the specified variables
     * are reordered around them. */
    // @{
    Func &reorder_storage(const std::vector<Var> &dims);

    Func &reorder_storage(const Var &x, const Var &y);
    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
    reorder_storage(const Var &x, const Var &y, Args &&...args) {
        std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
        return reorder_storage(collected_args);
    }
    // @}
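    /* Illustrative sketch (hypothetical Func f): store f interleaved
     * (channel values adjacent in memory) while leaving the loop nest alone.
     \code
     Func f;
     Var x, y, c;
     f(x, y, c) = x + y + c;
     f.reorder_storage(c, x, y);
     \endcode
     */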

    /** Pad the storage extent of a particular dimension of
     * realizations of this function up to be a multiple of the
     * specified alignment. This guarantees that the strides for the
     * dimensions stored outside of dim will be multiples of the
     * specified alignment, where the strides and alignment are
     * measured in numbers of elements.
     *
     * For example, to guarantee that a function foo(x, y, c)
     * representing an image has scanlines starting on offsets
     * aligned to multiples of 16, use foo.align_storage(x, 16). */
    Func &align_storage(const Var &dim, const Expr &alignment);
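    /* Illustrative sketch (hypothetical Func f): pad the storage extent of x
     * so that each scanline starts at a multiple of 16 elements.
     \code
     Func f;
     Var x, y;
     f(x, y) = x + y;
     f.compute_root().align_storage(x, 16);
     \endcode
     */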

    /** Store realizations of this function in a circular buffer of a
     * given extent. This is more efficient when the extent of the
     * circular buffer is a power of 2. If the fold factor is too
     * small, or the dimension is not accessed monotonically, the
     * pipeline will generate an error at runtime.
     *
     * The fold_forward option indicates that the new values of the
     * producer are accessed by the consumer in a monotonically
     * increasing order. Folding storage of producers is also
     * supported if the new values are accessed in a monotonically
     * decreasing order by setting fold_forward to false.
     *
     * For example, consider the pipeline:
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x, y+1);
     \endcode
     *
     * If we schedule f like so:
     *
     \code
     g.compute_at(f, y).store_root().fold_storage(y, 2);
     \endcode
     *
     * Then g will be computed at each row of f and stored in a buffer
     * with an extent in y of 2, alternately storing each computed row
     * of g in row y=0 or y=1.
     */
    Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);

    /** Compute this function as needed for each unique value of the
     * given var for the given calling function f.
     *
     * For example, consider the simple pipeline:
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
     \endcode
     *
     * If we schedule f like so:
     *
     \code
     g.compute_at(f, x);
     \endcode
     *
     * Then the C code equivalent to this pipeline will look like this
     *
     \code

     int f[height][width];
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             int g[2][2];
             g[0][0] = x*y;
             g[0][1] = (x+1)*y;
             g[1][0] = x*(y+1);
             g[1][1] = (x+1)*(y+1);
             f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
         }
     }

     \endcode
     *
     * The allocation and computation of g is within f's loop over x,
     * and enough of g is computed to satisfy all that f will need for
     * that iteration. This has excellent locality - values of g are
     * used as soon as they are computed, but it does redundant
     * work. Each value of g ends up getting computed four times. If
     * we instead schedule f like so:
     *
     \code
     g.compute_at(f, y);
     \endcode
     *
     * The equivalent C code is:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         int g[2][width+1];
         for (int x = 0; x < width; x++) {
             g[0][x] = x*y;
             g[1][x] = x*(y+1);
         }
         for (int x = 0; x < width; x++) {
             f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
         }
     }
     \endcode
     *
     * The allocation and computation of g is within f's loop over y,
     * and enough of g is computed to satisfy all that f will need for
     * that iteration. This does less redundant work (each point in g
     * ends up being evaluated twice), but the locality is not quite
     * as good, and we have to allocate more temporary memory to store
     * g.
     */
    Func &compute_at(const Func &f, const Var &var);

    /** Schedule a function to be computed within the iteration over
     * some dimension of an update domain. Produces equivalent code
     * to the version of compute_at that takes a Var. */
    Func &compute_at(const Func &f, const RVar &var);

    /** Schedule a function to be computed within the iteration over
     * a given LoopLevel. */
    Func &compute_at(LoopLevel loop_level);

    /** Schedule the iteration over the initial definition of this function
     *  to be fused with another stage 's' from outermost loop to a
     * given LoopLevel. */
    // @{
    Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
    Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
    Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
    Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
    // @}
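    /* Illustrative sketch (hypothetical Funcs f, g, h): fuse the loop nests
     * of two root-level producers over y, so rows of f and g are computed
     * together. Exact legality depends on stage ordering in the pipeline;
     * this is only a sketch.
     \code
     Func f, g, h;
     Var x, y;
     f(x, y) = x + y;
     g(x, y) = x - y;
     h(x, y) = f(x, y) + g(x, y);
     f.compute_root();
     g.compute_root();
     g.compute_with(f, y);
     \endcode
     */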

    /** Compute all of this function once ahead of time. Reusing
     * the example in \ref Func::compute_at :
     *
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);

     g.compute_root();
     \endcode
     *
     * is equivalent to
     *
     \code
     int f[height][width];
     int g[height+1][width+1];
     for (int y = 0; y < height+1; y++) {
         for (int x = 0; x < width+1; x++) {
             g[y][x] = x*y;
         }
     }
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
         }
     }
     \endcode
     *
     * g is computed once ahead of time, and enough is computed to
     * satisfy all uses of it. This does no redundant work (each point
     * in g is evaluated once), but has poor locality (values of g are
     * probably not still in cache when they are used by f), and
     * allocates lots of temporary memory to store g.
     */
    Func &compute_root();

    /** Use the halide_memoization_cache_... interface to store a
     *  computed version of this function across invocations of the
     *  Func.
     *
     * If an eviction_key is provided, it must be constructed with an
     * Expr of integer or handle type. The key Expr will be promoted
     * to a uint64_t and can be used with halide_memoization_cache_evict
     * to remove memoized entries using this eviction key from the
     * cache. Memoized computations that do not provide an eviction
     * key will never be evicted by this mechanism.
     */
    Func &memoize(const EvictionKey &eviction_key = EvictionKey());

    /** Produce this Func asynchronously in a separate
     * thread. Consumers will be run by the task system when the
     * production is complete. If this Func's store level is different
     * to its compute level, consumers will be run concurrently,
     * blocking as necessary to prevent reading ahead of what the
     * producer has computed. If storage is folded, then the producer
     * will additionally not be permitted to run too far ahead of the
     * consumer, to avoid clobbering data that has not yet been
     * used.
     *
     * Take special care when combining this with custom thread pool
     * implementations, as avoiding deadlock with producer-consumer
     * parallelism requires a much more sophisticated parallel runtime
     * than with data parallelism alone. It is strongly recommended
     * you just use Halide's default thread pool, which guarantees no
     * deadlock and a bound on the number of threads launched.
     */
    Func &async();

    /** Bound the extent of a Func's storage, but not extent of its
     * compute. This can be useful for forcing a function's allocation
     * to be a fixed size, which often means it can go on the stack.
     * If bounds inference decides that it requires more storage for
     * this function than the allocation size you have stated, a runtime
     * error will occur when you try to run the pipeline. */
    Func &bound_storage(const Var &dim, const Expr &bound);
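    /* Illustrative sketch (hypothetical producer f and consumer g): f is
     * computed per 8-wide tile of g, so its footprint in x is at most 9
     * elements; bounding the storage to 9 makes the allocation a constant
     * size (and thus stack-friendly).
     \code
     Func f, g;
     Var x, xo, xi;
     f(x) = x;
     g(x) = f(x) + f(x + 1);
     g.split(x, xo, xi, 8);
     f.compute_at(g, xo).bound_storage(x, 9);
     \endcode
     */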

    /** Allocate storage for this function within f's loop over
     * var. Scheduling storage is optional, and can be used to
     * separate the loop level at which storage occurs from the loop
     * level at which computation occurs to trade off between locality
     * and redundant work. This can open the door for two types of
     * optimization.
     *
     * Consider again the pipeline from \ref Func::compute_at :
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
     \endcode
     *
     * If we schedule it like so:
     *
     \code
     g.compute_at(f, x).store_at(f, y);
     \endcode
     *
     * Then the computation of g takes place within the loop over x,
     * but the storage takes place within the loop over y:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         int g[2][width+1];
         for (int x = 0; x < width; x++) {
             g[0][x] = x*y;
             g[0][x+1] = (x+1)*y;
             g[1][x] = x*(y+1);
             g[1][x+1] = (x+1)*(y+1);
             f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
         }
     }
     \endcode
     *
     * Provided the for loop over x is serial, halide then
     * automatically performs the following sliding window
     * optimization:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         int g[2][width+1];
         for (int x = 0; x < width; x++) {
             if (x == 0) {
                 g[0][x] = x*y;
                 g[1][x] = x*(y+1);
             }
             g[0][x+1] = (x+1)*y;
             g[1][x+1] = (x+1)*(y+1);
             f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
         }
     }
     \endcode
     *
     * Two of the assignments to g only need to be done when x is
     * zero. The rest of the time, those sites have already been
     * filled in by a previous iteration. This version has the
     * locality of compute_at(f, x), but allocates more memory and
     * does much less redundant work.
     *
     * Halide then further optimizes this pipeline like so:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         int g[2][2];
         for (int x = 0; x < width; x++) {
             if (x == 0) {
                 g[0][0] = x*y;
                 g[1][0] = x*(y+1);
             }
             g[0][(x+1)%2] = (x+1)*y;
             g[1][(x+1)%2] = (x+1)*(y+1);
             f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
         }
     }
     \endcode
     *
     * Halide has detected that it's possible to use a circular buffer
     * to represent g, and has reduced all accesses to g modulo 2 in
     * the x dimension. This optimization only triggers if the for
     * loop over x is serial, and if halide can statically determine
     * some power of two large enough to cover the range needed. For
     * powers of two, the modulo operator compiles to more efficient
     * bit-masking. This optimization reduces memory usage, and also
     * improves locality by reusing recently-accessed memory instead
     * of pulling new memory into cache.
     *
     */
    Func &store_at(const Func &f, const Var &var);

    /** Equivalent to the version of store_at that takes a Var, but
     * schedules storage within the loop over a dimension of a
     * reduction domain */
    Func &store_at(const Func &f, const RVar &var);

    /** Equivalent to the version of store_at that takes a Var, but
     * schedules storage at a given LoopLevel. */
    Func &store_at(LoopLevel loop_level);

    /** Equivalent to \ref Func::store_at, but schedules storage
     * outside the outermost loop. */
    Func &store_root();

    /** Aggressively inline all uses of this function. This is the
     * default schedule, so you're unlikely to need to call this. For
     * a Func with an update definition, that means it gets computed
     * as close to the innermost loop as possible.
     *
     * Consider once more the pipeline from \ref Func::compute_at :
     *
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
     \endcode
     *
     * Leaving g as inline, this compiles to code equivalent to the following C:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
         }
     }
     \endcode
     */
    Func &compute_inline();

    /** Get a handle on an update step for the purposes of scheduling
     * it. */
    Stage update(int idx = 0);

    /** Set the type of memory this Func should be stored in. Controls
     * whether allocations go on the stack or the heap on the CPU, and
     * in global vs shared vs local on the GPU. See the documentation
     * on MemoryType for more detail. */
    Func &store_in(MemoryType memory_type);
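    /* Illustrative sketch (hypothetical producer f and consumer g): f has a
     * tiny, constant-sized footprint per iteration of g, so it can be asked
     * to live on the stack.
     \code
     Func f, g;
     Var x;
     f(x) = x * x;
     g(x) = f(x) + 1;
     f.compute_at(g, x).store_in(MemoryType::Stack);
     \endcode
     */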

    /** Trace all loads from this Func by emitting calls to
     * halide_trace. If the Func is inlined, this has no
     * effect. */
    Func &trace_loads();

    /** Trace all stores to the buffer backing this Func by emitting
     * calls to halide_trace. If the Func is inlined, this call
     * has no effect. */
    Func &trace_stores();

    /** Trace all realizations of this Func by emitting calls to
     * halide_trace. */
    Func &trace_realizations();

    /** Add a string of arbitrary text that will be passed through to trace
     * inspection code if the Func is realized in trace mode. (Funcs that are
     * inlined won't have their tags emitted.) Ignored entirely if
     * tracing is not enabled for the Func (or globally).
     */
    Func &add_trace_tag(const std::string &trace_tag);

    /** Get a handle on the internal halide function that this Func
     * represents. Useful if you want to do introspection on Halide
     * functions */
    Internal::Function function() const {
        return func;
    }

    /** You can cast a Func to its pure stage for the purposes of
     * scheduling it. */
    operator Stage() const;

    /** Get a handle on the output buffer for this Func. Only relevant
     * if this is the output Func in a pipeline. Useful for making
     * static promises about strides, mins, and extents. */
    // @{
    OutputImageParam output_buffer() const;
    std::vector<OutputImageParam> output_buffers() const;
    // @}

    /** Use a Func as an argument to an external stage. */
    operator ExternFuncArgument() const;

    /** Infer the arguments to the Func, sorted into a canonical order:
     * all buffers (sorted alphabetically by name), followed by all non-buffers
     * (sorted alphabetically by name).
     * This lets you write things like:
     \code
     func.compile_to_assembly("/dev/stdout", func.infer_arguments());
     \endcode
     */
    std::vector<Argument> infer_arguments() const;

    /** Get the source location of the pure definition of this
     * Func. See Stage::source_location() */
    std::string source_location() const;

    /** Return the current StageSchedule associated with this initial
     * Stage of this Func. For introspection only: to modify schedule,
     * use the Func interface. */
    const Internal::StageSchedule &get_schedule() const {
        return Stage(*this).get_schedule();
    }
};

namespace Internal {

template<typename Last>
inline void check_types(const Tuple &t, int idx) {
    using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
    user_assert(t[idx].type() == type_of<T>())
        << "Can't evaluate expression "
        << t[idx] << " of type " << t[idx].type()
        << " as a scalar of type " << type_of<T>() << "\n";
}

template<typename First, typename Second, typename... Rest>
inline void check_types(const Tuple &t, int idx) {
    check_types<First>(t, idx);
    check_types<Second, Rest...>(t, idx + 1);
}

template<typename Last>
inline void assign_results(Realization &r, int idx, Last last) {
    using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
    *last = Buffer<T>(r[idx])();
}

template<typename First, typename Second, typename... Rest>
inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
    assign_results<First>(r, idx, first);
    assign_results<Second, Rest...>(r, idx + 1, second, rest...);
}

}  // namespace Internal

/** JIT-Compile and run enough code to evaluate a Halide
 * expression. This can be thought of as a scalar version of
 * \ref Func::realize */
template<typename T>
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e) {
    user_assert(e.type() == type_of<T>())
        << "Can't evaluate expression "
        << e << " of type " << e.type()
        << " as a scalar of type " << type_of<T>() << "\n";
    Func f;
    f() = e;
    Buffer<T, 0> im = f.realize(ctx);
    return im();
}

/** evaluate with a default user context */
template<typename T>
HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
    return evaluate<T>(nullptr, e);
}
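/* Illustrative sketch: evaluate a scalar expression at JIT time. The value 16
 * is simply what this particular Expr works out to.
 \code
 Expr e = Expr(3) * 5 + 1;
 int result = evaluate<int>(e);  // result == 16
 \endcode
 */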

/** JIT-compile and run enough code to evaluate a Halide Tuple. */
template<typename First, typename... Rest>
HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
    Internal::check_types<First, Rest...>(t, 0);

    Func f;
    f() = t;
    Realization r = f.realize(ctx);
    Internal::assign_results(r, 0, first, rest...);
}

/** JIT-compile and run enough code to evaluate a Halide Tuple. */
template<typename First, typename... Rest>
HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
    evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest>(rest)...);
}

namespace Internal {

inline void schedule_scalar(Func f) {
    Target t = get_jit_target_from_environment();
    if (t.has_gpu_feature()) {
        f.gpu_single_thread();
    }
    if (t.has_feature(Target::HVX)) {
        f.hexagon();
    }
}

}  // namespace Internal

/** JIT-Compile and run enough code to evaluate a Halide
 * expression. This can be thought of as a scalar version of
 * \ref Func::realize. Can use GPU if jit target from environment
 * specifies one.
 */
template<typename T>
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
    user_assert(e.type() == type_of<T>())
        << "Can't evaluate expression "
        << e << " of type " << e.type()
        << " as a scalar of type " << type_of<T>() << "\n";
    Func f;
    f() = e;
    Internal::schedule_scalar(f);
    Buffer<T, 0> im = f.realize();
    return im();
}

/** JIT-compile and run enough code to evaluate a Halide Tuple. Can
 *  use GPU if jit target from environment specifies one. */
// @{
template<typename First, typename... Rest>
HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
    Internal::check_types<First, Rest...>(t, 0);

    Func f;
    f() = t;
    Internal::schedule_scalar(f);
    Realization r = f.realize();
    Internal::assign_results(r, 0, first, rest...);
}
// @}

}  // namespace Halide

#endif