Func.h
#ifndef HALIDE_FUNC_H
#define HALIDE_FUNC_H
/** \file
*
* Defines Func - the front-end handle on a halide function, and related classes.
*/
#include "Argument.h"
#include "Expr.h"
#include "JITModule.h"
#include "Module.h"
#include "Param.h"
#include "Pipeline.h"
#include "RDom.h"
#include "Target.h"
#include "Tuple.h"
#include "Var.h"
#include <map>
#include <utility>
namespace Halide {
class OutputImageParam;
class ParamMap;
/** A class that can represent Vars or RVars. Used for reorder calls
* which can accept a mix of either. */
struct VarOrRVar {
VarOrRVar(const std::string &n, bool r)
: var(n), rvar(n), is_rvar(r) {
}
VarOrRVar(const Var &v)
: var(v), is_rvar(false) {
}
VarOrRVar(const RVar &r)
: rvar(r), is_rvar(true) {
}
VarOrRVar(const RDom &r)
: rvar(RVar(r)), is_rvar(true) {
}
template<int N>
VarOrRVar(const ImplicitVar<N> &u)
: var(u), is_rvar(false) {
}
const std::string &name() const {
if (is_rvar) {
return rvar.name();
} else {
return var.name();
}
}
Var var;
RVar rvar;
bool is_rvar;
};
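/* An illustrative sketch (hypothetical Func/Var/RDom names): because VarOrRVar
* converts implicitly from both Var and RVar, a single reorder() call can mix
* pure and reduction variables:
\code
Func f;
Var x, y;
RDom r(0, 10);
f(x, y) = 0;
f(x, y) += r.x;
f.update(0).reorder(r.x, x, y);
\endcode
*/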
class ImageParam;
namespace Internal {
class Function;
struct Split;
struct StorageDim;
} // namespace Internal
/** A single definition of a Func. May be a pure or update definition. */
class Stage {
/** Reference to the Function this stage (or definition) belongs to. */
Internal::Function function;
Internal::Definition definition;
/** Indicate which stage the definition belongs to (0 for initial
* definition, 1 for first update, etc.). */
size_t stage_index;
/** Pure Vars of the Function (from the init definition). */
std::vector<Var> dim_vars;
void set_dim_type(const VarOrRVar &var, Internal::ForType t);
void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
void split(const std::string &old, const std::string &outer, const std::string &inner,
const Expr &factor, bool exact, TailStrategy tail);
void remove(const std::string &var);
Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
const std::vector<Internal::StorageDim> &storage_dims() const {
return function.schedule().storage_dims();
}
Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
public:
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
: function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
internal_assert(definition.defined());
dim_vars.reserve(function.args().size());
for (const auto &arg : function.args()) {
dim_vars.emplace_back(arg);
}
internal_assert(definition.args().size() == dim_vars.size());
}
/** Return the current StageSchedule associated with this Stage. For
* introspection only: to modify schedule, use the Func interface. */
const Internal::StageSchedule &get_schedule() const {
return definition.schedule();
}
/** Return a string describing the current var list taking into
* account all the splits, reorders, and tiles. */
std::string dump_argument_list() const;
/** Return the name of this stage, e.g. "f.update(2)" */
std::string name() const;
/** Calling rfactor() on an associative update definition of a Func will split
* the update into an intermediate which computes the partial results and
* replaces the current update definition with a new definition which merges
* the partial results. If called on an init/pure definition, this will
* throw an error. rfactor() will automatically infer the associative reduction
* operator and identity of the operator. If it can't prove the operation
* is associative or if it cannot find an identity for that operator, this
* will throw an error. In addition, commutativity of the operator is required
* if rfactor() is called on an inner dimension while excluding the outer
* dimensions.
*
* rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
* The rvars not listed in 'preserved' are removed from the original Func and
* are lifted to the intermediate Func. The remaining rvars (the ones in
* 'preserved') are made pure in the intermediate Func. The intermediate Func's
* update definition inherits all scheduling directives (e.g. split, fuse, etc.)
* applied to the original Func's update definition. The loop order of the
* intermediate Func's update definition is the same as the original, although
* the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
* intermediate Func's init definition from innermost to outermost is the args'
* order of the original Func's init definition followed by the new pure Vars.
*
* The intermediate Func also inherits storage order from the original Func
* with the new pure Vars added to the outermost.
*
* For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
\code
f(x, y) = 0;
f(x, y) += g(r.x, r.y);
\endcode
* into a pipeline like this:
\code
f_intm(x, y, u) = 0;
f_intm(x, y, u) += g(r.x, u);
f(x, y) = 0;
f(x, y) += f_intm(x, y, r.y);
\endcode
*
* This has a variety of uses. You can use it to split computation of an associative reduction:
\code
f(x, y) = 10;
RDom r(0, 96);
f(x, y) = max(f(x, y), g(x, y, r.x));
f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
\endcode
*
* which is equivalent to:
\code
parallel for u = 0 to 11:
for y:
for x:
f_intm(x, y, u) = -inf
parallel for x:
for y:
parallel for u = 0 to 11:
for rxi = 0 to 7:
f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
for y:
for x:
f(x, y) = 10
parallel for x:
for y:
for rxo = 0 to 11:
f(x, y) = max(f(x, y), f_intm(x, y, rxo))
\endcode
*
*/
// @{
Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
Func rfactor(const RVar &r, const Var &v);
// @}
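/* For a single preserved pair, the two-argument overload is shorthand for the
* vector form. Using the names from the example above, these are equivalent:
\code
f.update(0).rfactor({{r.y, u}});
f.update(0).rfactor(r.y, u);
\endcode
*/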
/** Schedule the iteration over this stage to be fused with another
* stage 's' from outermost loop to a given LoopLevel. 'this' stage will
* be computed AFTER 's' in the innermost fused dimension. There should not
* be any dependencies between those two fused stages. If either of the
* stages being fused is a stage of an extern Func, this will throw an error.
*
* Note that the two stages that are fused together should have the same
* exact schedule from the outermost to the innermost fused dimension, and
* the stage we are calling compute_with on should not have specializations,
* e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
*
* Also, if a producer is desired to be computed at the fused loop level,
* the function passed to the compute_at() needs to be the "parent". Consider
* the following code:
\code
input(x, y) = x + y;
f(x, y) = input(x, y);
f(x, y) += 5;
g(x, y) = x - y;
g(x, y) += 10;
f.compute_with(g, y);
f.update().compute_with(g.update(), y);
\endcode
*
* To compute 'input' at the fused loop level at dimension y, we specify
* input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
* the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
* is computed). On the other hand, to compute 'input' at the innermost
* dimension of 'f', we specify input.compute_at(f, x) instead of
* input.compute_at(g, x) since the x dimension of 'f' is not fused
* (only the y dimension is).
*
* Given the constraints, this has a variety of uses. Consider the
* following code:
\code
f(x, y) = x + y;
g(x, y) = x - y;
h(x, y) = f(x, y) + g(x, y);
f.compute_root();
g.compute_root();
f.split(x, xo, xi, 8);
g.split(x, xo, xi, 8);
g.compute_with(f, xo);
\endcode
*
* This is equivalent to:
\code
for y:
for xo:
for xi:
f(8*xo + xi) = (8*xo + xi) + y
for xi:
g(8*xo + xi) = (8*xo + xi) - y
for y:
for x:
h(x, y) = f(x, y) + g(x, y)
\endcode
*
* The sizes of the dimensions of the stages computed_with do not have
* to match. Consider the following code where 'g' is half the size of 'f':
\code
Image<int> f_im(size, size), g_im(size/2, size/2);
input(x, y) = x + y;
f(x, y) = input(x, y);
g(x, y) = input(2*x, 2*y);
g.compute_with(f, y);
input.compute_at(f, y);
Pipeline({f, g}).realize({f_im, g_im});
\endcode
*
* This is equivalent to:
\code
for y = 0 to size-1:
for x = 0 to size-1:
input(x, y) = x + y;
for x = 0 to size-1:
f(x, y) = input(x, y)
for x = 0 to size/2-1:
if (y < size/2):
g(x, y) = input(2*x, 2*y)
\endcode
*
* 'align' specifies how the loop iteration of each dimension of the
* two stages being fused should be aligned in the fused loop nests
* (see LoopAlignStrategy for options). Consider the following loop nests:
\code
for z = f_min_z to f_max_z:
for y = f_min_y to f_max_y:
for x = f_min_x to f_max_x:
f(x, y, z) = x + y + z
for z = g_min_z to g_max_z:
for y = g_min_y to g_max_y:
for x = g_min_x to g_max_x:
g(x, y, z) = x - y - z
\endcode
*
* If no alignment strategy is specified, the following loop nest will be
* generated:
\code
for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
for x = f_min_x to f_max_x:
if (f_min_z <= z <= f_max_z):
if (f_min_y <= y <= f_max_y):
f(x, y, z) = x + y + z
for x = g_min_x to g_max_x:
if (g_min_z <= z <= g_max_z):
if (g_min_y <= y <= g_max_y):
g(x, y, z) = x - y - z
\endcode
*
* Instead, these alignment strategies:
\code
g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
\endcode
* will produce the following loop nest:
\code
f_loop_min_z = f_min_z
f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
for z = f_min_z to f_loop_max_z:
f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
f_loop_max_y = f_max_y
for y = f_loop_min_y to f_loop_max_y:
for x = f_min_x to f_max_x:
if (f_loop_min_z <= z <= f_loop_max_z):
if (f_loop_min_y <= y <= f_loop_max_y):
f(x, y, z) = x + y + z
for x = g_min_x to g_max_x:
g_shift_z = g_min_z - f_loop_min_z
g_shift_y = g_max_y - f_loop_max_y
if (g_min_z <= (z + g_shift_z) <= g_max_z):
if (g_min_y <= (y + g_shift_y) <= g_max_y):
g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
\endcode
*
* LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
* of 'g' at dimension z so that its starting value matches that of 'f'.
* Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
* iteration of 'g' at dimension y so that its end value matches that of 'f'.
*/
// @{
Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
// @}
/** Scheduling calls that control how the domain of this stage is
* traversed. See the documentation for Func for the meanings. */
// @{
Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
Stage &serial(const VarOrRVar &var);
Stage &parallel(const VarOrRVar &var);
Stage &vectorize(const VarOrRVar &var);
Stage &unroll(const VarOrRVar &var);
Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xo, const VarOrRVar &yo,
const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
TailStrategy tail = TailStrategy::Auto);
Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xi, const VarOrRVar &yi,
const Expr &xfactor, const Expr &yfactor,
TailStrategy tail = TailStrategy::Auto);
Stage &tile(const std::vector<VarOrRVar> &previous,
const std::vector<VarOrRVar> &outers,
const std::vector<VarOrRVar> &inners,
const std::vector<Expr> &factors,
const std::vector<TailStrategy> &tails);
Stage &tile(const std::vector<VarOrRVar> &previous,
const std::vector<VarOrRVar> &outers,
const std::vector<VarOrRVar> &inners,
const std::vector<Expr> &factors,
TailStrategy tail = TailStrategy::Auto);
Stage &tile(const std::vector<VarOrRVar> &previous,
const std::vector<VarOrRVar> &inners,
const std::vector<Expr> &factors,
TailStrategy tail = TailStrategy::Auto);
Stage &reorder(const std::vector<VarOrRVar> &vars);
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
return reorder(collected_args);
}
Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
Stage specialize(const Expr &condition);
void specialize_fail(const std::string &message);
Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
const VarOrRVar &thread_x, const VarOrRVar &thread_y,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &bx, const VarOrRVar &by,
const VarOrRVar &tx, const VarOrRVar &ty,
const Expr &x_size, const Expr &y_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &tx, const VarOrRVar &ty,
const Expr &x_size, const Expr &y_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
const Expr &x_size, const Expr &y_size, const Expr &z_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
const Expr &x_size, const Expr &y_size, const Expr &z_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Stage &allow_race_conditions();
Stage &atomic(bool override_associativity_test = false);
Stage &hexagon(const VarOrRVar &x = Var::outermost());
Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
template<typename T>
Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
return prefetch(image.parameter(), at, from, std::move(offset), strategy);
}
// @}
/** Attempt to get the source file and line where this stage was
* defined by parsing the process's own debug symbols. Returns an
* empty string if no debug symbols were found or the debug
* symbols were not understood. Works on OS X and Linux only. */
std::string source_location() const;
/** Assert that this stage has intentionally been given no schedule, and
* suppress the warning about unscheduled update definitions that would
* otherwise fire. This counts as a schedule, so calling this twice on the
* same Stage will fail the assertion. */
void unscheduled();
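/* An illustrative sketch (hypothetical names): an update stage intentionally
* left with the default schedule, with the warning suppressed:
\code
f(x) = x;
f(x) += 1;
f.update(0).unscheduled();
\endcode
*/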
};
// For backwards compatibility, keep the ScheduleHandle name.
typedef Stage ScheduleHandle;
class FuncTupleElementRef;
/** A fragment of front-end syntax of the form f(x, y, z), where x, y,
* z are Vars or Exprs. It could be the left hand side of a definition or
* an update definition, or it could be a call to a function. We don't know
* until we see how this object gets used.
*/
class FuncRef {
Internal::Function func;
int implicit_placeholder_pos;
int implicit_count;
std::vector<Expr> args;
std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
/** Helper for function update by Tuple. If the function does not
* already have a pure definition, init_val will be used as RHS of
* each tuple element in the initial function definition. */
template<typename BinaryOp>
Stage func_ref_update(const Tuple &e, int init_val);
/** Helper for function update by Expr. If the function does not
* already have a pure definition, init_val will be used as RHS in
* the initial function definition. */
template<typename BinaryOp>
Stage func_ref_update(Expr e, int init_val);
public:
FuncRef(const Internal::Function &, const std::vector<Expr> &,
int placeholder_pos = -1, int count = 0);
FuncRef(Internal::Function, const std::vector<Var> &,
int placeholder_pos = -1, int count = 0);
/** Use this as the left-hand-side of a definition or an update definition
* (see \ref RDom).
*/
Stage operator=(const Expr &);
/** Use this as the left-hand-side of a definition or an update definition
* for a Func with multiple outputs. */
Stage operator=(const Tuple &);
/** Define a stage that adds the given expression to this Func. If the
* expression refers to some RDom, this performs a sum reduction of the
* expression over the domain. If the function does not already have a
* pure definition, this sets it to zero.
*/
// @{
Stage operator+=(Expr);
Stage operator+=(const Tuple &);
Stage operator+=(const FuncRef &);
// @}
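/* An illustrative sketch (hypothetical names): summing over an RDom with
* operator+=. The pure definition of f is implicitly initialized to zero:
\code
Func f, g;
Var x;
RDom r(0, 10);
g(x) = x * x;
f(x) += g(x + r.x);
\endcode
*/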
/** Define a stage that adds the negative of the given expression to this
* Func. If the expression refers to some RDom, this performs a sum reduction
* of the negative of the expression over the domain. If the function does
* not already have a pure definition, this sets it to zero.
*/
// @{
Stage operator-=(Expr);
Stage operator-=(const Tuple &);
Stage operator-=(const FuncRef &);
// @}
/** Define a stage that multiplies this Func by the given expression. If the
* expression refers to some RDom, this performs a product reduction of the
* expression over the domain. If the function does not already have a pure
* definition, this sets it to 1.
*/
// @{
Stage operator*=(Expr);
Stage operator*=(const Tuple &);
Stage operator*=(const FuncRef &);
// @}
/** Define a stage that divides this Func by the given expression.
* If the expression refers to some RDom, this performs a product
* reduction of the inverse of the expression over the domain. If the
* function does not already have a pure definition, this sets it to 1.
*/
// @{
Stage operator/=(Expr);
Stage operator/=(const Tuple &);
Stage operator/=(const FuncRef &);
// @}
/* Override the usual assignment operator, so that
* f(x, y) = g(x, y) defines f.
*/
Stage operator=(const FuncRef &);
/** Use this as a call to the function, and not the left-hand-side
* of a definition. Only works for single-output Funcs. */
operator Expr() const;
/** When a FuncRef refers to a function that provides multiple
* outputs, you can access each output as an Expr using
* operator[].
*/
FuncTupleElementRef operator[](int) const;
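/* An illustrative sketch (hypothetical names): reading the components of a
* Tuple-valued Func with operator[]:
\code
Func f;
Var x;
f(x) = Tuple(x, sin(x));
Expr first = f(x)[0];
Expr second = f(x)[1];
\endcode
*/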
/** How many outputs does the function this refers to produce. */
size_t size() const;
/** What function is this calling? */
Internal::Function function() const {
return func;
}
};
/** Explicit overloads of min and max for FuncRef. These exist to
* disambiguate calls to min on FuncRefs when a user has pulled both
* Halide::min and std::min into their namespace. */
// @{
inline Expr min(const FuncRef &a, const FuncRef &b) {
return min(Expr(a), Expr(b));
}
inline Expr max(const FuncRef &a, const FuncRef &b) {
return max(Expr(a), Expr(b));
}
// @}
/** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
* z are Vars or Exprs. It could be the left hand side of an update
* definition, or it could be a call to a function. We don't know
* until we see how this object gets used.
*/
class FuncTupleElementRef {
FuncRef func_ref;
std::vector<Expr> args; // args to the function
int idx; // Index to function outputs
/** Helper function that generates a Tuple where element at 'idx' is set
* to 'e' and the rest are undef. */
Tuple values_with_undefs(const Expr &e) const;
public:
FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
/** Use this as the left-hand-side of an update definition of Tuple
* component 'idx' of a Func (see \ref RDom). The function must
* already have an initial definition.
*/
Stage operator=(const Expr &e);
/** Define a stage that adds the given expression to Tuple component 'idx'
* of this Func. The other Tuple components are unchanged. If the expression
* refers to some RDom, this performs a sum reduction of the expression over
* the domain. The function must already have an initial definition.
*/
Stage operator+=(const Expr &e);
/** Define a stage that adds the negative of the given expression to Tuple
* component 'idx' of this Func. The other Tuple components are unchanged.
* If the expression refers to some RDom, this performs a sum reduction of
* the negative of the expression over the domain. The function must already
* have an initial definition.
*/
Stage operator-=(const Expr &e);
/** Define a stage that multiplies Tuple component 'idx' of this Func by
* the given expression. The other Tuple components are unchanged. If the
* expression refers to some RDom, this performs a product reduction of
* the expression over the domain. The function must already have an
* initial definition.
*/
Stage operator*=(const Expr &e);
/** Define a stage that divides Tuple component 'idx' of this Func by
* the given expression. The other Tuple components are unchanged.
* If the expression refers to some RDom, this performs a product
* reduction of the inverse of the expression over the domain. The function
* must already have an initial definition.
*/
Stage operator/=(const Expr &e);
/* Override the usual assignment operator, so that
* f(x, y)[index] = g(x, y) defines f.
*/
Stage operator=(const FuncRef &e);
/** Use this as a call to Tuple component 'idx' of a Func, and not the
* left-hand-side of a definition. */
operator Expr() const;
/** What function is this calling? */
Internal::Function function() const {
return func_ref.function();
}
/** Return index to the function outputs. */
int index() const {
return idx;
}
};
namespace Internal {
class IRMutator;
} // namespace Internal
/** Helper class for identifying the purpose of an Expr passed to memoize.
*/
class EvictionKey {
protected:
Expr key;
friend class Func;
public:
explicit EvictionKey(const Expr &expr = Expr())
: key(expr) {
}
};
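/* An illustrative sketch, assuming a Func::memoize overload that accepts an
* EvictionKey (hypothetical names): tag a memoized Func so its cached results
* can later be evicted as a group:
\code
Param<int> generation;
Func f;
Var x;
f(x) = expensive_producer(x);
f.compute_root().memoize(EvictionKey(generation));
\endcode
*/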
/** A halide function. This class represents one stage in a Halide
* pipeline, and is the unit by which we schedule things. By default
* they are aggressively inlined, so you are encouraged to make lots
* of little functions, rather than storing things in Exprs. */
class Func {
/** A handle on the internal halide function that this
* represents */
Internal::Function func;
/** When you make a reference to this function with fewer
* arguments than it has dimensions, the argument list is bulked
* up with 'implicit' vars with canonical names. This lets you
* pass around partially applied Halide functions. */
// @{
std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
// @}
/** The imaging pipeline that outputs this Func alone. */
Pipeline pipeline_;
/** Get the imaging pipeline that outputs this Func alone,
* creating it (and freezing the Func) if necessary. */
Pipeline pipeline();
// Helper function for recursive reordering support
Func &reorder_storage(const std::vector<Var> &dims, size_t start);
void invalidate_cache();
public:
/** Declare a new undefined function with the given name */
explicit Func(const std::string &name);
/** Declare a new undefined function with the given name.
* The function will be constrained to represent Exprs of required_type.
* If required_dims is not AnyDims, the function will be constrained to exactly
* that many dimensions. */
explicit Func(const Type &required_type, int required_dims, const std::string &name);
/** Declare a new undefined function with the given name.
* If required_types is not empty, the function will be constrained to represent
* Tuples of the same arity and types. (If required_types is empty, there is no constraint.)
* If required_dims is not AnyDims, the function will be constrained to exactly
* that many dimensions. */
explicit Func(const std::vector<Type> &required_types, int required_dims, const std::string &name);
/** Declare a new undefined function with an
* automatically-generated unique name */
Func();
/** Declare a new function with an automatically-generated unique
* name, and define it to return the given expression (which may
* not contain free variables). */
explicit Func(const Expr &e);
/** Construct a new Func to wrap an existing, already-defined
* Function object. */
explicit Func(Internal::Function f);
/** Construct a new Func to wrap a Buffer. */
template<typename T, int Dims>
HALIDE_NO_USER_CODE_INLINE explicit Func(Buffer<T, Dims> &im)
: Func() {
(*this)(_) = im(_);
}
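/* Illustrative sketches of the Expr and Buffer constructors (hypothetical
* names). The first defines a Func returning a constant; the second wraps an
* existing Buffer so that f(x, y) == im(x, y):
\code
Func three(Expr(3));
Buffer<float> im(100, 100);
Func f(im);
\endcode
*/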
/** Evaluate this function over some rectangular domain and return
* the resulting buffer or buffers. Performs compilation if the
* Func has not previously been realized and compile_jit has not
* been called. If the final stage of the pipeline is on the GPU,
* data is copied back to the host before being returned. The
* returned Realization should probably be instantly converted to
* a Buffer class of the appropriate type. That is, do this:
*
\code
f(x) = sin(x);
Buffer<float> im = f.realize(...);
\endcode
*
* If your Func has multiple values, because you defined it using
* a Tuple, then casting the result of a realize call to a buffer
* or image will produce a run-time error. Instead you should do the
* following:
*
\code
f(x) = Tuple(x, sin(x));
Realization r = f.realize(...);
Buffer<int> im0 = r[0];
Buffer<float> im1 = r[1];
\endcode
*
* In Halide formal arguments of a computation are specified using
* Param<T> and ImageParam objects in the expressions defining the
* computation. The param_map argument to realize allows
* specifying a set of per-call parameters to be used for a
* specific computation. This method is thread-safe where the
* globals used by Param<T> and ImageParam are not. Any parameters
* that are not in the param_map are taken from the global values,
* so those can continue to be used if they are not changing
* per-thread.
*
* One can explicitly construct a ParamMap and
* use its set method to insert Parameter to scalar or Buffer
* value mappings:
*
\code
Param<int32_t> p(42);
ImageParam img(Int(32), 2);
f(x, y) = img(x, y) + p;
Buffer<int32_t> arg_img(10, 10);
<fill in arg_img...>
ParamMap params;
params.set(p, 17);
params.set(img, arg_img);
Target t = get_jit_target_from_environment();
Buffer<int32_t> result = f.realize({10, 10}, t, params);
\endcode
*
* Alternatively, an initializer list can be used
* directly in the realize call to pass this information:
*
\code
Param<int32_t> p(42);
ImageParam img(Int(32), 2);
f(x, y) = img(x, y) + p;
Buffer<int32_t> arg_img(10, 10);
<fill in arg_img...>
Target t = get_jit_target_from_environment();
Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
\endcode
*
* If the Func cannot be realized into a buffer of the given size
* due to scheduling constraints on scattering update definitions,
* it will be realized into a larger buffer of the minimum size
* possible, and a cropped view at the requested size will be
* returned. It is thus not safe to assume the returned buffers
* are contiguous in memory. This behavior can be disabled with
* the NoBoundsQuery target flag, in which case an error about
* writing out of bounds on the output buffer will trigger
* instead.
*
*/
Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target(),
const ParamMap &param_map = ParamMap::empty_map());
/** Same as above, but takes a custom user-provided context to be
* passed to runtime functions. This can be used to pass state to
* runtime overrides in a thread-safe manner. A nullptr context is
* legal, and is equivalent to calling the variant of realize
* that does not take a context. */
Realization realize(JITUserContext *context,
std::vector<int32_t> sizes = {},
const Target &target = Target(),
const ParamMap &param_map = ParamMap::empty_map());
/** Evaluate this function into an existing allocated buffer or
* buffers. If the buffer is also one of the arguments to the
* function, strange things may happen, as the pipeline isn't
* necessarily safe to run in-place. If you pass multiple buffers,
* they must have matching sizes. This form of realize does *not*
* automatically copy data back from the GPU. */
void realize(Pipeline::RealizationArg outputs, const Target &target = Target(),
const ParamMap &param_map = ParamMap::empty_map());
/** Same as above, but takes a custom user-provided context to be
* passed to runtime functions. This can be used to pass state to
* runtime overrides in a thread-safe manner. A nullptr context is
* legal, and is equivalent to calling the variant of realize
* that does not take a context. */
void realize(JITUserContext *context,
Pipeline::RealizationArg outputs,
const Target &target = Target(),
const ParamMap &param_map = ParamMap::empty_map());
/** For a given size of output, or a given output buffer,
* determine the bounds required of all unbound ImageParams
* referenced. Communicates the result by allocating new buffers
* of the appropriate size and binding them to the unbound
* ImageParams.
*
* See the documentation for Func::realize regarding the
* ParamMap. There is one difference in that input Buffer<>
* arguments that are being inferred are specified as a pointer to
* the Buffer<> in the ParamMap. E.g.
*
\code
Param<int32_t> p(42);
ImageParam img(Int(32), 2);
f(x, y) = img(x, y) + p;
Target t = get_jit_target_from_environment();
Buffer<> in;
f.infer_input_bounds({10, 10}, t, { { img, &in } });
\endcode
* On return, in will be an allocated buffer of the correct size
* to evaluate f over a 10x10 region.
*/
// @{
void infer_input_bounds(const std::vector<int32_t> &sizes,
const Target &target = get_jit_target_from_environment(),
const ParamMap &param_map = ParamMap::empty_map());
void infer_input_bounds(Pipeline::RealizationArg outputs,
const Target &target = get_jit_target_from_environment(),
const ParamMap &param_map = ParamMap::empty_map());
// @}
/** Versions of infer_input_bounds that take a custom user context
* to pass to runtime functions. */
// @{
void infer_input_bounds(JITUserContext *context,
const std::vector<int32_t> &sizes,
const Target &target = get_jit_target_from_environment(),
const ParamMap &param_map = ParamMap::empty_map());
void infer_input_bounds(JITUserContext *context,
Pipeline::RealizationArg outputs,
const Target &target = get_jit_target_from_environment(),
const ParamMap &param_map = ParamMap::empty_map());
// @}
/** Statically compile this function to llvm bitcode, with the
* given filename (which should probably end in .bc), type
* signature, and C function name (which defaults to the same name
* as this halide function). */
//@{
void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
const Target &target = get_target_from_environment());
void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
const Target &target = get_target_from_environment());
// @}
/** Statically compile this function to llvm assembly, with the
* given filename (which should probably end in .ll), type
* signature, and C function name (which defaults to the same name
* as this halide function). */
//@{
void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
const Target &target = get_target_from_environment());
void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
const Target &target = get_target_from_environment());
// @}
/** Statically compile this function to an object file, with the
* given filename (which should probably end in .o or .obj), type
* signature, and C function name (which defaults to the same name
* as this halide function). You probably don't want to use this
* directly; call compile_to_static_library or compile_to_file instead. */
//@{
void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
const Target &target = get_target_from_environment());
void compile_to_object(const std::string &filename, const std::vector<Argument> &,
const Target &target = get_target_from_environment());
// @}
/** Emit a header file with the given filename for this
* function. The header will define a function with the type
* signature given by the second argument, and a name given by the
* third. The name defaults to the same name as this halide
* function. You don't actually have to have defined this function
* yet to call this. You probably don't want to use this directly;
* call compile_to_static_library or compile_to_file instead. */
void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
const Target &target = get_target_from_environment());
/** Statically compile this function to text assembly equivalent
* to the object file generated by compile_to_object. This is
* useful for checking what Halide is producing without having to
* disassemble anything, or if you need to feed the assembly into
* some custom toolchain to produce an object file (e.g. iOS) */
//@{
void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
const Target &target = get_target_from_environment());
void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
const Target &target = get_target_from_environment());
// @}
/** Statically compile this function to C source code. This is
* useful for providing fallback code paths that will compile on
* many platforms. Vectorization will fail, and parallelization
* will produce serial code. */
void compile_to_c(const std::string &filename,
const std::vector<Argument> &,
const std::string &fn_name = "",
const Target &target = get_target_from_environment());
/** Write out an internal representation of lowered code. Useful
* for analyzing and debugging scheduling. Can emit html or plain
* text. */
void compile_to_lowered_stmt(const std::string &filename,
const std::vector<Argument> &args,
StmtOutputFormat fmt = Text,
const Target &target = get_target_from_environment());
/** Write out the loop nests specified by the schedule for this
* Function. Helpful for understanding what a schedule is
* doing. */
void print_loop_nest();
/** Compile to object file and header pair, with the given
* arguments. The name defaults to the same name as this halide
* function.
*/
void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
const std::string &fn_name = "",
const Target &target = get_target_from_environment());
/** Compile to static-library file and header pair, with the given
* arguments. The name defaults to the same name as this halide
* function.
*/
void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
const std::string &fn_name = "",
const Target &target = get_target_from_environment());
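/* An illustrative ahead-of-time sketch (hypothetical names): emit brighten.a
* and brighten.h exposing a C-callable function named "brighten":
\code
ImageParam input(UInt(8), 2);
Param<uint8_t> offset;
Var x, y;
Func brighter;
brighter(x, y) = input(x, y) + offset;
brighter.compile_to_static_library("brighten", {input, offset}, "brighten");
\endcode
*/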
/** Compile to static-library file and header pair once for each target;
* each resulting function will be considered (in order) via halide_can_use_target_features()
* at runtime, with the first appropriate match being selected for subsequent use.
* This is typically useful for specializations that may vary unpredictably by machine
* (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
* All targets must have identical arch-os-bits.
*/
void compile_to_multitarget_static_library(const std::string &filename_prefix,
const std::vector<Argument> &args,
const std::vector<Target> &targets);
/** Like compile_to_multitarget_static_library(), except that the object files
* are all output as object files (rather than bundled into a static library).
*
* `suffixes` is an optional list of strings to use as the suffix for each object
* file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
* will be used for each suffix.)
*
* Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
* will be generated with the filename `${filename_prefix}_wrapper.o`
*
* Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
* will be generated with the filename `${filename_prefix}_runtime.o`
*/
void compile_to_multitarget_object_files(const std::string &filename_prefix,
const std::vector<Argument> &args,
const std::vector<Target> &targets,
const std::vector<std::string> &suffixes);
/** Store an internal representation of lowered code as a self
* contained Module suitable for further compilation. */
Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
const Target &target = get_target_from_environment());
/** Compile and generate multiple target files with a single call.
* Deduces target files based on filenames specified in
* output_files map.
*/
void compile_to(const std::map<OutputFileType, std::string> &output_files,
const std::vector<Argument> &args,
const std::string &fn_name,
const Target &target = get_target_from_environment());
/** Eagerly jit compile the function to machine code. This
* normally happens on the first call to realize. If you're
* running your halide pipeline inside time-sensitive code and
* wish to avoid including the time taken to compile a pipeline,
* then you can call this ahead of time. Default is to use the Target
* returned from Halide::get_jit_target_from_environment()
*/
void compile_jit(const Target &target = get_jit_target_from_environment());
/** Get a struct containing the currently set custom functions
* used by JIT. This can be mutated. Changes will take effect the
* next time this Func is realized. */
JITHandlers &jit_handlers();
/** Eagerly jit compile the function to machine code and return a callable
* struct that behaves like a function pointer. The calling convention
* will exactly match that of an AOT-compiled version of this Func
* with the same Argument list.
*/
Callable compile_to_callable(const std::vector<Argument> &args,
const Target &target = get_jit_target_from_environment());
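/* An illustrative sketch (hypothetical names): the resulting Callable takes
* the listed Arguments in order, followed by the output buffer(s):
\code
ImageParam input(Int(32), 1);
Param<int> offset;
Var x;
Func f;
f(x) = input(x) + offset;
Callable c = f.compile_to_callable({input, offset});
Buffer<int32_t> in(10), out(10);
c(in, 42, out);
\endcode
*/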
/** Add a custom pass to be used during lowering. It is run after
* all other lowering passes. Can be used to verify properties of
* the lowered Stmt, instrument it with extra code, or otherwise
* modify it. The Func takes ownership of the pass, and will call
* delete on it when the Func goes out of scope. So don't pass a
* stack object, or share pass instances between multiple
* Funcs. */
template<typename T>
void add_custom_lowering_pass(T *pass) {
// Template instantiate a custom deleter for this type, then
// wrap in a lambda. The custom deleter lives in user code, so
// that deletion is on the same heap as construction (I hate Windows).
add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
}
/** Add a custom pass to be used during lowering, with the
* function that will be called to delete it also passed in. Set
* it to nullptr if you wish to retain ownership of the object. */
void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
/** Remove all previously-set custom lowering passes */
void clear_custom_lowering_passes();
/** Get the custom lowering passes. */
const std::vector<CustomLoweringPass> &custom_lowering_passes();
/** When this function is compiled, include code that dumps its
* values to a file after it is realized, for the purpose of
* debugging.
*
* If filename ends in ".tif" or ".tiff" (case insensitive) the file
* is in TIFF format and can be read by standard tools. Otherwise, the
* file format is as follows:
*
* All data is in the byte-order of the target platform. First, a
* 20-byte header containing four 32-bit ints, giving the extents
* of the first four dimensions. Dimensions beyond four are
* folded into the fourth. Then, a fifth 32-bit int giving the
* data type of the function. The typecodes are given by: float =
* 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
* 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
* data follows the header, as a densely packed array of the given
* size and the given type. If given the extension .tmp, this file
* format can be natively read by the program ImageStack. */
void debug_to_file(const std::string &filename);
/** The name of this function, either given during construction,
* or automatically generated. */
const std::string &name() const;
/** Get the pure arguments. */
std::vector<Var> args() const;
/** The right-hand-side value of the pure definition of this
* function. Causes an error if there's no pure definition, or if
* the function is defined to return multiple values. */
Expr value() const;
/** The values returned by this function. An error if the function
* has not been defined. Returns a Tuple with one element for
* functions defined to return a single value. */
Tuple values() const;
/** Does this function have at least a pure definition? */
bool defined() const;
/** Get the left-hand-side of the update definition. An empty
* vector if there's no update definition. If there are
* multiple update definitions for this function, use the
* argument to select which one you want. */
const std::vector<Expr> &update_args(int idx = 0) const;
/** Get the right-hand-side of an update definition. An error if
* there's no update definition. If there are multiple
* update definitions for this function, use the argument to
* select which one you want. */
Expr update_value(int idx = 0) const;
/** Get the right-hand-side of an update definition for
* functions that return multiple values. An error if there's no
* update definition. Returns a Tuple with one element for
* functions that return a single value. */
Tuple update_values(int idx = 0) const;
/** Get the RVars of the reduction domain for an update definition, if there is
* one. */
std::vector<RVar> rvars(int idx = 0) const;
/** Does this function have at least one update definition? */
bool has_update_definition() const;
/** How many update definitions does this function have? */
int num_update_definitions() const;
/** Is this function an external stage? That is, was it defined
* using define_extern? */
bool is_extern() const;
/** Add an extern definition for this Func. This lets you define a
* Func that represents an external pipeline stage. You can, for
* example, use it to wrap a call to an extern library such as
* fftw. */
// @{
void define_extern(const std::string &function_name,
const std::vector<ExternFuncArgument> &params, Type t,
int dimensionality,
NameMangling mangling = NameMangling::Default,
DeviceAPI device_api = DeviceAPI::Host) {
define_extern(function_name, params, t,
Internal::make_argument_list(dimensionality), mangling,
device_api);
}
void define_extern(const std::string &function_name,
const std::vector<ExternFuncArgument> &params,
const std::vector<Type> &types, int dimensionality,
NameMangling mangling) {
define_extern(function_name, params, types,
Internal::make_argument_list(dimensionality), mangling);
}
void define_extern(const std::string &function_name,
const std::vector<ExternFuncArgument> &params,
const std::vector<Type> &types, int dimensionality,
NameMangling mangling = NameMangling::Default,
DeviceAPI device_api = DeviceAPI::Host) {
define_extern(function_name, params, types,
Internal::make_argument_list(dimensionality), mangling,
device_api);
}
void define_extern(const std::string &function_name,
const std::vector<ExternFuncArgument> &params, Type t,
const std::vector<Var> &arguments,
NameMangling mangling = NameMangling::Default,
DeviceAPI device_api = DeviceAPI::Host) {
define_extern(function_name, params, std::vector<Type>{t}, arguments,
mangling, device_api);
}
void define_extern(const std::string &function_name,
const std::vector<ExternFuncArgument> &params,
const std::vector<Type> &types,
const std::vector<Var> &arguments,
NameMangling mangling = NameMangling::Default,
DeviceAPI device_api = DeviceAPI::Host);
// @}
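/* An illustrative sketch (hypothetical names, including the extern symbol
* "my_blur"): declare a two-dimensional Int(32) stage implemented by an
* externally-compiled function using the default C ABI:
\code
ImageParam input(Int(32), 2);
Func blurred;
blurred.define_extern("my_blur", {input}, Int(32), 2);
\endcode
*/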
/** Get the type(s) of the outputs of this Func.
*
* It is not legal to call type() unless the Func returns a single (non-Tuple) value.
*
* If the Func isn't yet defined, and was not specified with required types,
* a runtime error will occur.
*
* If the Func isn't yet defined, but *was* specified with required types,
* the requirements will be returned. */
// @{
const Type &type() const;
const std::vector<Type> &types() const;
// @}
HALIDE_ATTRIBUTE_DEPRECATED("Func::output_type() is deprecated; use Func::type() instead.")
const Type &output_type() const {
return type();
}
HALIDE_ATTRIBUTE_DEPRECATED("Func::output_types() is deprecated; use Func::types() instead.")
const std::vector<Type> &output_types() const {
return types();
}
/** Get the number of outputs of this Func. Corresponds to the
* size of the Tuple this Func was defined to return.
* If the Func isn't yet defined, but was specified with required types,
* the number of outputs specified in the requirements will be returned. */
int outputs() const;
/** Get the name of the extern function called for an extern
* definition. */
const std::string &extern_function_name() const;
/** The dimensionality (number of arguments) of this function.
* If the Func isn't yet defined, but was specified with required dimensionality,
* the dimensionality specified in the requirements will be returned. */
int dimensions() const;
/** Construct either the left-hand-side of a definition, or a call
* to a function that happens to only contain vars as
* arguments. If the function has already been defined, and fewer
* arguments are given than the function has dimensions, then
* enough implicit vars are added to the end of the argument list
* to make up the difference (see \ref Var::implicit) */
// @{
FuncRef operator()(std::vector<Var>) const;
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
operator()(Args &&...args) const {
std::vector<Var> collected_args{std::forward<Args>(args)...};
return this->operator()(collected_args);
}
// @}
/** Either calls to the function, or the left-hand-side of
* an update definition (see \ref RDom). If the function has
* already been defined, and fewer arguments are given than the
* function has dimensions, then enough implicit vars are added to
* the end of the argument list to make up the difference. (see
* \ref Var::implicit)*/
// @{
FuncRef operator()(std::vector<Expr>) const;
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
operator()(const Expr &x, Args &&...args) const {
std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
return (*this)(collected_args);
}
// @}
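/* An illustrative sketch (hypothetical names) of implicit variables: giving
* fewer arguments than the Func has dimensions pads the argument list with
* the placeholder '_':
\code
Func f, g;
Var x, y;
f(x, y) = x + y;
g(_) = f(_) + 1;  // equivalent to g(x, y) = f(x, y) + 1
\endcode
*/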
/** Creates and returns a new identity Func that wraps this Func. During
* compilation, Halide replaces all calls to this Func done by 'f'
* with calls to the wrapper. If this Func is already wrapped for
* use in 'f', this will return the existing wrapper.
*
* For example, g.in(f) would rewrite a pipeline like this:
\code
g(x, y) = ...
f(x, y) = ... g(x, y) ...
\endcode
* into a pipeline like this:
\code
g(x, y) = ...
g_wrap(x, y) = g(x, y)
f(x, y) = ... g_wrap(x, y)
\endcode
*
* This has a variety of uses. You can use it to schedule this
* Func differently in the different places it is used:
\code
g(x, y) = ...
f1(x, y) = ... g(x, y) ...
f2(x, y) = ... g(x, y) ...
g.in(f1).compute_at(f1, y).vectorize(x, 8);
g.in(f2).compute_at(f2, x).unroll(x);
\endcode
*
* You can also use it to stage loads from this Func via some
* intermediate buffer (perhaps on the stack as in
* test/performance/block_transpose.cpp, or in shared GPU memory
* as in test/performance/wrap.cpp). In this case we compute the
* wrapper at tiles of the consuming Funcs like so:
\code
g.compute_root()...
g.in(f).compute_at(f, tiles)...
\endcode
*
* Func::in() can also be used to compute pieces of a Func into a
* smaller scratch buffer (perhaps on the GPU) and then copy them
* into a larger output buffer one tile at a time. See
* apps/interpolate/interpolate.cpp for an example of this. In
* this case we compute the Func at tiles of its own wrapper:
\code
f.in(g).compute_root().gpu_tile(...)...
f.compute_at(f.in(g), tiles)...
\endcode
*
* A similar use of Func::in() is to wrap Funcs that have multiple update
* stages in a pure wrapper. The following code:
\code
f(x, y) = x + y;
f(x, y) += 5;
g(x, y) = f(x, y);
f.compute_root();
\endcode
*
* is equivalent to:
\code
for y:
for x:
f(x, y) = x + y;
for y:
for x:
f(x, y) += 5
for y:
for x:
g(x, y) = f(x, y)
\endcode
* Using Func::in(), we can write:
\code
f(x, y) = x + y;
f(x, y) += 5;
g(x, y) = f(x, y);
f.in(g).compute_root();
\endcode
* which instead produces:
\code
for y:
for x:
f(x, y) = x + y;
f(x, y) += 5
f_wrap(x, y) = f(x, y)
for y:
for x:
g(x, y) = f_wrap(x, y)
\endcode
*/
Func in(const Func &f);
/** Create and return an identity wrapper shared by all the Funcs in
* 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
* this will throw an error. */
Func in(const std::vector<Func> &fs);
/** Create and return a global identity wrapper, which wraps all calls to
* this Func by any other Func. If a global wrapper already exists,
* returns it. The global identity wrapper is only used by callers for
* which no custom wrapper has been specified.
*/
Func in();
/** Similar to \ref Func::in; however, instead of replacing the call to
* this Func with an identity Func that refers to it, this replaces the
* call with a clone of this Func.
*
* For example, f.clone_in(g) would rewrite a pipeline like this:
\code
f(x, y) = x + y;
g(x, y) = f(x, y) + 2;
h(x, y) = f(x, y) - 3;
\endcode
* into a pipeline like this:
\code
f(x, y) = x + y;
f_clone(x, y) = x + y;
g(x, y) = f_clone(x, y) + 2;
h(x, y) = f(x, y) - 3;
\endcode
*
*/
//@{
Func clone_in(const Func &f);
Func clone_in(const std::vector<Func> &fs);
//@}
/** Declare that this function should be implemented by a call to
* halide_buffer_copy with the given target device API. Asserts
* that the Func has a pure definition which is a simple call to a
* single input, and no update definitions. The wrapper Funcs
* returned by in() are suitable candidates. Consumes all pure
* variables, and rewrites the Func to have an extern definition
* that calls halide_buffer_copy. */
Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);
/** Declare that this function should be implemented by a call to
* halide_buffer_copy with a NULL target device API. Equivalent to
* copy_to_device(DeviceAPI::Host). Asserts that the Func has a
* pure definition which is a simple call to a single input, and
* no update definitions. The wrapper Funcs returned by in() are
* suitable candidates. Consumes all pure variables, and rewrites
* the Func to have an extern definition that calls
* halide_buffer_copy.
*
* Note that if the source Func is already valid in host memory,
* this compiles to code that does the minimum number of calls to
* memcpy.
*/
Func copy_to_host();
/** Split a dimension into inner and outer subdimensions with the
* given names, where the inner dimension iterates from 0 to
* factor-1. The inner and outer subdimensions can then be dealt
* with using the other scheduling calls. It's ok to reuse the old
* variable name as either the inner or outer variable. The final
* argument specifies how the tail should be handled if the split
* factor does not provably divide the extent. */
Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
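/* An illustrative sketch (hypothetical names):
\code
f(x) = x;
f.split(x, xo, xi, 8);
\endcode
* produces a loop nest of the form:
\code
for xo:
  for xi = 0 to 7:
    f(8*xo + xi) = 8*xo + xi
\endcode
*/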
/** Join two dimensions into a single fused dimension. The fused
* dimension covers the product of the extents of the inner and
* outer dimensions given. */
Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
/** Mark a dimension to be traversed serially. This is the default. */
Func &serial(const VarOrRVar &var);
/** Mark a dimension to be traversed in parallel */
Func &parallel(const VarOrRVar &var);
/** Split a dimension by the given task_size, and then parallelize the
* outer dimension. This creates parallel tasks that have size
* task_size. After this call, var refers to the outer dimension of
* the split. The inner dimension has a new anonymous name. If you
* wish to mutate it, or schedule with respect to it, do the split
* manually. */
Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
/** Mark a dimension to be computed all-at-once as a single
* vector. The dimension should have constant extent -
* e.g. because it is the inner dimension following a split by a
* constant factor. For most uses of vectorize you want the two
* argument form. The variable to be vectorized should be the
* innermost one. */
Func &vectorize(const VarOrRVar &var);
/** Mark a dimension to be completely unrolled. The dimension
* should have constant extent - e.g. because it is the inner
* dimension following a split by a constant factor. For most uses
* of unroll you want the two-argument form. */
Func &unroll(const VarOrRVar &var);
/** Split a dimension by the given factor, then vectorize the
* inner dimension. This is how you vectorize a loop of unknown
* size. The variable to be vectorized should be the innermost
* one. After this call, var refers to the outer dimension of the
* split. 'factor' must be an integer. */
Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
/** Split a dimension by the given factor, then unroll the inner
* dimension. This is how you unroll a loop of unknown size by
* some constant factor. After this call, var refers to the outer
* dimension of the split. 'factor' must be an integer. */
Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
/** Statically declare that the range over which a function should
* be evaluated is given by the second and third arguments. This
* can let Halide perform some optimizations. E.g. if you know
* there are going to be 4 color channels, you can completely
* vectorize the color channel dimension without the overhead of
* splitting it up. If bounds inference decides that it requires
* more of this function than the bounds you have stated, a
* runtime error will occur when you try to run your pipeline. */
Func &bound(const Var &var, Expr min, Expr extent);
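/* An illustrative sketch (hypothetical names): promise that the channel
* dimension covers exactly [0, 4), then vectorize across it:
\code
f(x, y, c) = input(x, y, c) * 2;
f.bound(c, 0, 4).reorder(c, x, y).vectorize(c);
\endcode
*/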
/** Statically declare the range over which the function will be
* evaluated in the general case. This provides a basis for the auto
* scheduler to make trade-offs and scheduling decisions. The auto
* generated schedules might break when the sizes of the dimensions are
* very different from the estimates specified. These estimates are used
* only by the auto scheduler if the function is a pipeline output. */
Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
/** Set (min, extent) estimates for all dimensions in the Func
* at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
* repeatedly, but slightly terser. The size of the estimates vector
* must match the dimensionality of the Func.
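*
* For example, a minimal sketch for a 2D output (the sizes are
* illustrative):
\code
Func f;
Var x, y;
f(x, y) = x + y;
f.set_estimates({{0, 1920}, {0, 1080}});  // same as set_estimate on x and then y
\endcode
*/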
Func &set_estimates(const Region &estimates);
/** Expand the region computed so that the min coordinate is
* congruent to 'remainder' modulo 'modulus', and the extent is a
* multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
* the min and extent realized to be even, and calling
* f.align_bounds(x, 2, 1) forces the min to be odd and the extent
* to be even. The region computed always contains the region that
* would have been computed without this directive, so no
* assertions are injected.
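*
* A minimal sketch (the names are illustrative):
\code
Func f;
Var x;
f(x) = x;
f.compute_root().align_bounds(x, 32);  // realized min and extent become multiples of 32
\endcode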
*/
Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
/** Expand the region computed so that the extent is a
* multiple of 'modulus'. For example, f.align_extent(x, 2) forces
* the extent realized to be even. The region computed always contains the
* region that would have been computed without this directive, so no
* assertions are injected. (This is essentially equivalent to align_bounds(),
* but always leaving the min untouched.)
*/
Func &align_extent(const Var &var, Expr modulus);
/** Bound the extent of a Func's realization, but not its
* min. This means the dimension can be unrolled or vectorized
* even when its min is not fixed (for example because it is
* compute_at tiles of another Func). This can also be useful for
* forcing a function's allocation to be a fixed size, which often
* means it can go on the stack. */
Func &bound_extent(const Var &var, Expr extent);
/** Split two dimensions at once by the given factors, and then
* reorder the resulting dimensions to be xi, yi, xo, yo from
* innermost outwards. This gives a tiled traversal.
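*
* For example, a minimal sketch (the names are illustrative):
\code
Func f;
Var x, y, xo, yo, xi, yi;
f(x, y) = x + y;
f.tile(x, y, xo, yo, xi, yi, 64, 64);  // traverse in 64x64 tiles
\endcode
*/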
Func &tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xo, const VarOrRVar &yo,
const VarOrRVar &xi, const VarOrRVar &yi,
const Expr &xfactor, const Expr &yfactor,
TailStrategy tail = TailStrategy::Auto);
/** A shorter form of tile, which reuses the old variable names as
* the new outer dimensions */
Func &tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &xi, const VarOrRVar &yi,
const Expr &xfactor, const Expr &yfactor,
TailStrategy tail = TailStrategy::Auto);
/** A more general form of tile, which defines tiles of any dimensionality. */
Func &tile(const std::vector<VarOrRVar> &previous,
const std::vector<VarOrRVar> &outers,
const std::vector<VarOrRVar> &inners,
const std::vector<Expr> &factors,
const std::vector<TailStrategy> &tails);
/** The generalized tile, with a single tail strategy to apply to all vars. */
Func &tile(const std::vector<VarOrRVar> &previous,
const std::vector<VarOrRVar> &outers,
const std::vector<VarOrRVar> &inners,
const std::vector<Expr> &factors,
TailStrategy tail = TailStrategy::Auto);
/** Generalized tiling, reusing the previous names as the outer names. */
Func &tile(const std::vector<VarOrRVar> &previous,
const std::vector<VarOrRVar> &inners,
const std::vector<Expr> &factors,
TailStrategy tail = TailStrategy::Auto);
/** Reorder variables to have the given nesting order, from
* innermost out */
Func &reorder(const std::vector<VarOrRVar> &vars);
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
return reorder(collected_args);
}
/** Rename a dimension. Equivalent to split with an inner size of one. */
Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
/** Specify that race conditions are permitted for this Func,
* which enables parallelizing over RVars even when Halide cannot
* prove that it is safe to do so. Use this with great caution,
* and only if you can prove to yourself that this is safe, as it
* may result in a non-deterministic routine that returns
* different values at different times or on different machines. */
Func &allow_race_conditions();
/** Issue atomic updates for this Func. This allows parallelization
* on associative RVars. The function throws a compile error when
* Halide fails to prove associativity. Use override_associativity_test
* to disable the associativity test if you believe the function is
* associative or the order of reduction variable execution does not
* matter.
* Halide compiles this into hardware atomic operations whenever possible,
* and falls back to a mutex lock per storage element if it is impossible
* to atomically update.
* There are three possible outcomes of the compiled code:
* atomic add, compare-and-swap loop, and mutex lock.
* For example:
*
* hist(x) = 0;
* hist(im(r)) += 1;
* hist.compute_root();
* hist.update().atomic().parallel();
*
* will be compiled to atomic add operations.
*
* hist(x) = 0;
* hist(im(r)) = min(hist(im(r)) + 1, 100);
* hist.compute_root();
* hist.update().atomic().parallel();
*
* will be compiled to compare-and-swap loops.
*
* arg_max() = {0, im(0)};
* Expr old_index = arg_max()[0];
* Expr old_max = arg_max()[1];
* Expr new_index = select(old_max < im(r), r, old_index);
* Expr new_max = max(im(r), old_max);
* arg_max() = {new_index, new_max};
* arg_max.compute_root();
* arg_max.update().atomic().parallel();
*
* will be compiled to updates guarded by a mutex lock,
* since it is impossible to atomically update two different locations.
*
* Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
* Compiling to other backends results in a compile error.
* If an operation is compiled into a mutex lock, and is vectorized or is
* compiled to CUDA or OpenCL, it also results in a compile error,
* since a per-element mutex lock on a vectorized operation leads to
* deadlock.
* Vectorization of predicated RVars (through rdom.where()) on CPU
* is also not yet supported (see https://github.com/halide/Halide/issues/4298).
* 8-bit and 16-bit atomics on GPU are also not supported. */
Func &atomic(bool override_associativity_test = false);
/** Specialize a Func. This creates a special-case version of the
* Func where the given condition is true. The most effective
* conditions are those of the form param == value, and boolean
* Params. Consider a simple example:
\code
f(x) = x + select(cond, 0, 1);
f.compute_root();
\endcode
* This is equivalent to:
\code
for (int x = 0; x < width; x++) {
f[x] = x + (cond ? 0 : 1);
}
\endcode
* Adding the scheduling directive:
\code
f.specialize(cond)
\endcode
* makes it equivalent to:
\code
if (cond) {
for (int x = 0; x < width; x++) {
f[x] = x;
}
} else {
for (int x = 0; x < width; x++) {
f[x] = x + 1;
}
}
\endcode
* Note that the inner loops have been simplified. In the first
* path Halide knows that cond is true, and in the second path
* Halide knows that it is false.
*
* The specialized version gets its own schedule, which inherits
* every directive made about the parent Func's schedule so far
* except for its specializations. This method returns a handle to
* the new schedule. If you wish to retrieve the specialized
* sub-schedule again later, you can call this method with the
* same condition. Consider the following example of scheduling
* the specialized version:
*
\code
f(x) = x;
f.compute_root();
f.specialize(width > 1).unroll(x, 2);
\endcode
* Assuming for simplicity that width is even, this is equivalent to:
\code
if (width > 1) {
for (int x = 0; x < width/2; x++) {
f[2*x] = 2*x;
f[2*x + 1] = 2*x + 1;
}
} else {
for (int x = 0; x < width; x++) {
f[x] = x;
}
}
\endcode
* For this case, it may be better to schedule the un-specialized
* case instead:
\code
f(x) = x;
f.compute_root();
f.specialize(width == 1); // Creates a copy of the schedule so far.
f.unroll(x, 2); // Only applies to the unspecialized case.
\endcode
* This is equivalent to:
\code
if (width == 1) {
f[0] = 0;
} else {
for (int x = 0; x < width/2; x++) {
f[2*x] = 2*x;
f[2*x + 1] = 2*x + 1;
}
}
\endcode
* This can be a good way to write a pipeline that splits,
* vectorizes, or tiles, but can still handle small inputs.
*
* If a Func has several specializations, the first matching one
* will be used, so the order in which you define specializations
* is significant. For example:
*
\code
f(x) = x + select(cond1, a, b) - select(cond2, c, d);
f.specialize(cond1);
f.specialize(cond2);
\endcode
* is equivalent to:
\code
if (cond1) {
for (int x = 0; x < width; x++) {
f[x] = x + a - (cond2 ? c : d);
}
} else if (cond2) {
for (int x = 0; x < width; x++) {
f[x] = x + b - c;
}
} else {
for (int x = 0; x < width; x++) {
f[x] = x + b - d;
}
}
\endcode
*
* Specializations may in turn be specialized, which creates a
* nested if statement in the generated code.
*
\code
f(x) = x + select(cond1, a, b) - select(cond2, c, d);
f.specialize(cond1).specialize(cond2);
\endcode
* This is equivalent to:
\code
if (cond1) {
if (cond2) {
for (int x = 0; x < width; x++) {
f[x] = x + a - c;
}
} else {
for (int x = 0; x < width; x++) {
f[x] = x + a - d;
}
}
} else {
for (int x = 0; x < width; x++) {
f[x] = x + b - (cond2 ? c : d);
}
}
\endcode
* To create a 4-way if statement that simplifies away all of the
* ternary operators above, you could say:
\code
f.specialize(cond1).specialize(cond2);
f.specialize(cond2);
\endcode
* or
\code
f.specialize(cond1 && cond2);
f.specialize(cond1);
f.specialize(cond2);
\endcode
*
* Any prior Func which is compute_at some variable of this Func
* gets separately included in all paths of the generated if
* statement. The Var in the compute_at call must exist in all
* paths, but it may have been generated via a different path of
* splits, fuses, and renames. This can be used somewhat
* creatively. Consider the following code:
\code
g(x, y) = 8*x;
f(x, y) = g(x, y) + 1;
f.compute_root().specialize(cond);
Var g_loop;
f.specialize(cond).rename(y, g_loop);
f.rename(x, g_loop);
g.compute_at(f, g_loop);
\endcode
* When cond is true, this is equivalent to g.compute_at(f,y).
* When it is false, this is equivalent to g.compute_at(f,x).
*/
Stage specialize(const Expr &condition);
/** Add a specialization to a Func that always terminates execution
* with a call to halide_error(). By itself, this is of limited use,
* but can be useful to terminate chains of specialize() calls where
* no "default" case is expected (thus avoiding unnecessary code generation).
*
* For instance, say we want to optimize a pipeline to process images
* in planar and interleaved format; we might typically do something like:
\code
ImageParam im(UInt(8), 3);
Func f = do_something_with(im);
f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
\endcode
* This code will vectorize along rows for the planar case, and across pixel
* components for the interleaved case... but there is an implicit "else"
* for the unhandled cases, which generates unoptimized code. If we never
* anticipate passing any other sort of images to this, we can streamline
* our code by adding specialize_fail():
\code
ImageParam im(UInt(8), 3);
Func f = do_something_with(im);
f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
f.specialize_fail("Unhandled image format");
\endcode
* Conceptually, this produces code like:
\code
if (im.dim(0).stride() == 1) {
do_something_planar();
} else if (im.dim(2).stride() == 1) {
do_something_interleaved();
} else {
halide_error("Unhandled image format");
}
\endcode
*
* Note that calling specialize_fail() terminates the specialization chain
* for a given Func; you cannot create new specializations for the Func
* afterwards (though you can retrieve handles to previous specializations).
*/
void specialize_fail(const std::string &message);
/** Tell Halide that the following dimensions correspond to GPU
* thread indices. This is useful if you compute a producer
* function within the block indices of a consumer function, and
* want to control how that function's dimensions map to GPU
* threads. If the selected target is not an appropriate GPU, this
* just marks those dimensions as parallel. */
// @{
Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
// @}
/** The given dimension corresponds to the lanes in a GPU
* warp. GPU warp lanes are distinguished from GPU threads by the
* fact that all warp lanes run together in lockstep, which
* permits lightweight communication of data from one lane to
* another. */
Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
/** Tell Halide to run this stage using a single gpu thread and
* block. This is not an efficient use of your GPU, but it can be
* useful to avoid copy-back for intermediate update stages that
* touch a very small part of your Func. */
Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
/** Tell Halide that the following dimensions correspond to GPU
* block indices. This is useful for scheduling stages that will
* run serially within each GPU block. If the selected target is
* not ptx, this just marks those dimensions as parallel. */
// @{
Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
// @}
/** Tell Halide that the following dimensions correspond to GPU
* block indices and thread indices. If the selected target is not
* ptx, these just mark the given dimensions as parallel. The
* dimensions are consumed by this call, so do all other
* unrolling, reordering, etc first. */
// @{
Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
// @}
/** Short-hand for tiling a domain and mapping the tile indices
* to GPU block indices and the coordinates within each tile to
* GPU thread indices. Consumes the variables given, so do all
* other scheduling first.
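*
* For example, a minimal sketch (the names are illustrative):
\code
Func f;
Var x, y, bx, by, tx, ty;
f(x, y) = x + y;
f.gpu_tile(x, y, bx, by, tx, ty, 16, 16);  // 16x16 GPU threads per block
\endcode
*/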
// @{
Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &bx, const VarOrRVar &by,
const VarOrRVar &tx, const VarOrRVar &ty,
const Expr &x_size, const Expr &y_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
const VarOrRVar &tx, const VarOrRVar &ty,
const Expr &x_size, const Expr &y_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
const Expr &x_size, const Expr &y_size, const Expr &z_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
const Expr &x_size, const Expr &y_size, const Expr &z_size,
TailStrategy tail = TailStrategy::Auto,
DeviceAPI device_api = DeviceAPI::Default_GPU);
// @}
/** Schedule for execution on Hexagon. When a loop is marked with
* Hexagon, that loop is executed on a Hexagon DSP. */
Func &hexagon(const VarOrRVar &x = Var::outermost());
/** Prefetch data written to or read from a Func or an ImageParam by a
* subsequent loop iteration, at an optionally specified iteration offset. You may
* specify different vars for the location of the prefetch() instruction and for
* the location that is being prefetched:
*
* - the first var specified, 'at', indicates the loop in which the prefetch will be placed
* - the second var specified, 'from', determines the var used to find the bounds to prefetch
* (in conjunction with 'offset')
*
* If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at'.
* Note that the value for 'offset' applies only to 'from', not 'at'.
*
* The final argument specifies how a prefetch of a region outside the
* bounds should be handled.
*
* For example, consider this pipeline:
\code
Func f, g, h;
Var x, y;
f(x, y) = x + y;
g(x, y) = 2 * f(x, y);
h(x, y) = 3 * f(x, y);
\endcode
*
* The following schedule:
\code
f.compute_root();
g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
\endcode
*
* will inject prefetch calls into the innermost loops of 'g' and 'h' and generate
* the following loop nest:
\code
for y = ...
for x = ...
f(x, y) = x + y
for y = ..
for x = ...
prefetch(&f[x + 2, y], 1, 16);
g(x, y) = 2 * f(x, y)
for y = ..
for x = ...
prefetch(&f[x, y + 2], 1, 16);
h(x, y) = 3 * f(x, y)
\endcode
*
* Note that the 'from' nesting level need not be adjacent to 'at':
\code
Func f, g;
Var x, y, z, w;
f(x, y, z, w) = x + y + z + w;
g(x, y, z, w) = 2 * f(x, y, z, w);
\endcode
*
* The following schedule:
\code
f.compute_root();
g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
\endcode
*
* will produce code that prefetches a tile of data:
\code
for w = ...
for z = ...
for y = ...
for x = ...
f(x, y, z, w) = x + y + z + w
for w = ...
for z = ...
for y = ...
for x0 = ...
prefetch(&f[x0, y, z, w + 2], 1, 16);
for x = ...
g(x, y, z, w) = 2 * f(x, y, z, w)
\endcode
*
* Note that calling prefetch() with the same var for both 'at' and 'from'
* is equivalent to calling prefetch() with that var.
*/
// @{
Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
Func &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
template<typename T>
Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
return prefetch(image.parameter(), at, from, std::move(offset), strategy);
}
// @}
/** Specify how the storage for the function is laid out. These
* calls let you specify the nesting order of the dimensions. For
* example, foo.reorder_storage(y, x) tells Halide to use
* column-major storage for any realizations of foo, without
* changing how you refer to foo in the code. You may want to do
* this if you intend to vectorize across y. When representing
* color images, foo.reorder_storage(c, x, y) specifies packed
* storage (red, green, and blue values adjacent in memory), and
* foo.reorder_storage(x, y, c) specifies planar storage (entire
* red, green, and blue images one after the other in memory).
*
* If you leave out some dimensions, those remain in the same
* positions in the nesting order while the specified variables
* are reordered around them. */
// @{
Func &reorder_storage(const std::vector<Var> &dims);
Func &reorder_storage(const Var &x, const Var &y);
template<typename... Args>
HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
reorder_storage(const Var &x, const Var &y, Args &&...args) {
std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
return reorder_storage(collected_args);
}
// @}
/** Pad the storage extent of a particular dimension of
* realizations of this function up to be a multiple of the
* specified alignment. This guarantees that the strides for the
* dimensions stored outside of dim will be multiples of the
* specified alignment, where the strides and alignment are
* measured in numbers of elements.
*
* For example, to guarantee that a function foo(x, y, c)
* representing an image has scanlines starting on offsets
* aligned to multiples of 16, use foo.align_storage(x, 16). */
Func &align_storage(const Var &dim, const Expr &alignment);
/** Store realizations of this function in a circular buffer of a
* given extent. This is more efficient when the extent of the
* circular buffer is a power of 2. If the fold factor is too
* small, or the dimension is not accessed monotonically, the
* pipeline will generate an error at runtime.
*
* The fold_forward option indicates that the new values of the
* producer are accessed by the consumer in a monotonically
* increasing order. Folding storage of producers is also
* supported if the new values are accessed in a monotonically
* decreasing order by setting fold_forward to false.
*
* For example, consider the pipeline:
\code
Func f, g;
Var x, y;
g(x, y) = x*y;
f(x, y) = g(x, y) + g(x, y+1);
\endcode
*
* If we schedule f like so:
*
\code
g.compute_at(f, y).store_root().fold_storage(y, 2);
\endcode
*
* Then g will be computed at each row of f and stored in a buffer
* with an extent in y of 2, alternately storing each computed row
* of g in row y=0 or y=1.
*/
Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
/** Compute this function as needed for each unique value of the
* given var for the given calling function f.
*
* For example, consider the simple pipeline:
\code
Func f, g;
Var x, y;
g(x, y) = x*y;
f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
\endcode
*
* If we schedule f like so:
*
\code
g.compute_at(f, x);
\endcode
*
* Then the C code equivalent to this pipeline will look like this
*
\code
int f[height][width];
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
int g[2][2];
g[0][0] = x*y;
g[0][1] = (x+1)*y;
g[1][0] = x*(y+1);
g[1][1] = (x+1)*(y+1);
f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
}
}
\endcode
*
* The allocation and computation of g is within f's loop over x,
* and enough of g is computed to satisfy all that f will need for
* that iteration. This has excellent locality - values of g are
* used as soon as they are computed, but it does redundant
* work. Each value of g ends up getting computed four times. If
* we instead schedule f like so:
*
\code
g.compute_at(f, y);
\endcode
*
* The equivalent C code is:
*
\code
int f[height][width];
for (int y = 0; y < height; y++) {
int g[2][width+1];
for (int x = 0; x < width; x++) {
g[0][x] = x*y;
g[1][x] = x*(y+1);
}
for (int x = 0; x < width; x++) {
f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
}
}
\endcode
*
* The allocation and computation of g is within f's loop over y,
* and enough of g is computed to satisfy all that f will need for
* that iteration. This does less redundant work (each point in g
* ends up being evaluated twice), but the locality is not quite
* as good, and we have to allocate more temporary memory to store
* g.
*/
Func &compute_at(const Func &f, const Var &var);
/** Schedule a function to be computed within the iteration over
* some dimension of an update domain. Produces equivalent code
* to the version of compute_at that takes a Var. */
Func &compute_at(const Func &f, const RVar &var);
/** Schedule a function to be computed within the iteration over
* a given LoopLevel. */
Func &compute_at(LoopLevel loop_level);
/** Schedule the iteration over the initial definition of this function
* to be fused with another stage 's' from outermost loop to a
* given LoopLevel.
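*
* For example, a minimal sketch that fuses the loop nests of two
* root-level producers over y (the names are illustrative):
\code
Func f, g, h;
Var x, y;
f(x, y) = x + y;
g(x, y) = x - y;
h(x, y) = f(x, y) + g(x, y);
f.compute_root();
g.compute_root();
g.compute_with(f, y);  // f and g now share a single loop nest down to y
\endcode
*/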
// @{
Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
// @}
/** Compute all of this function once ahead of time. Reusing
* the example in \ref Func::compute_at :
*
\code
Func f, g;
Var x, y;
g(x, y) = x*y;
f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
g.compute_root();
\endcode
*
* is equivalent to
*
\code
int f[height][width];
int g[height+1][width+1];
for (int y = 0; y < height+1; y++) {
for (int x = 0; x < width+1; x++) {
g[y][x] = x*y;
}
}
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
}
}
\endcode
*
* g is computed once ahead of time, and enough is computed to
* satisfy all uses of it. This does no redundant work (each point
* in g is evaluated once), but has poor locality (values of g are
* probably not still in cache when they are used by f), and
* allocates lots of temporary memory to store g.
*/
Func &compute_root();
/** Use the halide_memoization_cache_... interface to store a
* computed version of this function across invocations of the
* Func.
*
* If an eviction_key is provided, it must be constructed with
* Expr of integer or handle type. The key Expr will be promoted
* to a uint64_t and can be used with halide_memoization_cache_evict
* to remove memoized entries using this eviction key from the
* cache. Memoized computations that do not provide an eviction
* key will never be evicted by this mechanism.
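*
* A minimal sketch (the Param name is illustrative):
\code
Func f;
Var x;
Param<int> p;
f(x) = x + p;
f.compute_root().memoize(EvictionKey(p));  // cached entries can later be evicted by this key
\endcode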
*/
Func &memoize(const EvictionKey &eviction_key = EvictionKey());
/** Produce this Func asynchronously in a separate
* thread. Consumers will be run by the task system when the
* production is complete. If this Func's store level is different
* to its compute level, consumers will be run concurrently,
* blocking as necessary to prevent reading ahead of what the
* producer has computed. If storage is folded, then the producer
* will additionally not be permitted to run too far ahead of the
* consumer, to avoid clobbering data that has not yet been
* used.
*
* Take special care when combining this with custom thread pool
* implementations, as avoiding deadlock with producer-consumer
* parallelism requires a much more sophisticated parallel runtime
* than with data parallelism alone. It is strongly recommended
* you just use Halide's default thread pool, which guarantees no
* deadlock and a bound on the number of threads launched.
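*
* A minimal sketch of a producer running ahead of its consumer in a
* separate thread (the names are illustrative):
\code
Func producer, consumer;
Var x, y;
producer(x, y) = x + y;
consumer(x, y) = producer(x, y) * 2;
producer.store_root().compute_at(consumer, y).async();
\endcode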
*/
Func &async();
/** Bound the extent of a Func's storage, but not extent of its
* compute. This can be useful for forcing a function's allocation
* to be a fixed size, which often means it can go on the stack.
* If bounds inference decides that it requires more storage for
* this function than the allocation size you have stated, a runtime
* error will occur when you try to run the pipeline. */
Func &bound_storage(const Var &dim, const Expr &bound);
/** Allocate storage for this function within f's loop over
* var. Scheduling storage is optional, and can be used to
* separate the loop level at which storage occurs from the loop
* level at which computation occurs to trade off between locality
* and redundant work. This can open the door for two types of
* optimization.
*
* Consider again the pipeline from \ref Func::compute_at :
\code
Func f, g;
Var x, y;
g(x, y) = x*y;
f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
\endcode
*
* If we schedule it like so:
*
\code
g.compute_at(f, x).store_at(f, y);
\endcode
*
* Then the computation of g takes place within the loop over x,
* but the storage takes place within the loop over y:
*
\code
int f[height][width];
for (int y = 0; y < height; y++) {
int g[2][width+1];
for (int x = 0; x < width; x++) {
g[0][x] = x*y;
g[0][x+1] = (x+1)*y;
g[1][x] = x*(y+1);
g[1][x+1] = (x+1)*(y+1);
f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
}
}
\endcode
*
* Provided the for loop over x is serial, Halide then
* automatically performs the following sliding window
* optimization:
*
\code
int f[height][width];
for (int y = 0; y < height; y++) {
int g[2][width+1];
for (int x = 0; x < width; x++) {
if (x == 0) {
g[0][x] = x*y;
g[1][x] = x*(y+1);
}
g[0][x+1] = (x+1)*y;
g[1][x+1] = (x+1)*(y+1);
f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
}
}
\endcode
*
* Two of the assignments to g only need to be done when x is
* zero. The rest of the time, those sites have already been
* filled in by a previous iteration. This version has the
* locality of compute_at(f, x), but allocates more memory and
* does much less redundant work.
*
* Halide then further optimizes this pipeline like so:
*
\code
int f[height][width];
for (int y = 0; y < height; y++) {
int g[2][2];
for (int x = 0; x < width; x++) {
if (x == 0) {
g[0][0] = x*y;
g[1][0] = x*(y+1);
}
g[0][(x+1)%2] = (x+1)*y;
g[1][(x+1)%2] = (x+1)*(y+1);
f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
}
}
\endcode
*
* Halide has detected that it's possible to use a circular buffer
* to represent g, and has reduced all accesses to g modulo 2 in
* the x dimension. This optimization only triggers if the for
* loop over x is serial, and if Halide can statically determine
* some power of two large enough to cover the range needed. For
* powers of two, the modulo operator compiles to more efficient
* bit-masking. This optimization reduces memory usage, and also
* improves locality by reusing recently-accessed memory instead
* of pulling new memory into cache.
*
*/
Func &store_at(const Func &f, const Var &var);
/** Equivalent to the version of store_at that takes a Var, but
* schedules storage within the loop over a dimension of a
* reduction domain */
Func &store_at(const Func &f, const RVar &var);
/** Equivalent to the version of store_at that takes a Var, but
* schedules storage at a given LoopLevel. */
Func &store_at(LoopLevel loop_level);
/** Equivalent to \ref Func::store_at, but schedules storage
* outside the outermost loop. */
Func &store_root();
/** Aggressively inline all uses of this function. This is the
* default schedule, so you're unlikely to need to call this. For
* a Func with an update definition, that means it gets computed
* as close to the innermost loop as possible.
*
* Consider once more the pipeline from \ref Func::compute_at :
*
\code
Func f, g;
Var x, y;
g(x, y) = x*y;
f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
\endcode
*
* Leaving g as inline, this compiles to code equivalent to the following C:
*
\code
int f[height][width];
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
}
}
\endcode
*/
Func &compute_inline();
/** Get a handle on an update step for the purposes of scheduling
* it. */
Stage update(int idx = 0);
/** Set the type of memory this Func should be stored in. Controls
* whether allocations go on the stack or the heap on the CPU, and
* in global vs shared vs local on the GPU. See the documentation
* on MemoryType for more detail.
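*
* A minimal sketch (the names are illustrative):
\code
Func f, g;
Var x;
f(x) = x;
g(x) = f(x) + f(x + 1);
f.compute_at(g, x).store_in(MemoryType::Stack);  // keep f's small allocation on the stack
\endcode
*/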
Func &store_in(MemoryType memory_type);
/** Trace all loads from this Func by emitting calls to
* halide_trace. If the Func is inlined, this has no
* effect. */
Func &trace_loads();
/** Trace all stores to the buffer backing this Func by emitting
* calls to halide_trace. If the Func is inlined, this call
* has no effect. */
Func &trace_stores();
/** Trace all realizations of this Func by emitting calls to
* halide_trace. */
Func &trace_realizations();
/** Add a string of arbitrary text that will be passed through to trace
* inspection code if the Func is realized in trace mode. (Funcs that are
* inlined won't have their tags emitted.) Ignored entirely if
* tracing is not enabled for the Func (or globally).
*/
Func &add_trace_tag(const std::string &trace_tag);
/** Get a handle on the internal halide function that this Func
* represents. Useful if you want to do introspection on Halide
* functions */
Internal::Function function() const {
return func;
}
/** You can cast a Func to its pure stage for the purposes of
* scheduling it. */
operator Stage() const;
/** Get a handle on the output buffer for this Func. Only relevant
* if this is the output Func in a pipeline. Useful for making
* static promises about strides, mins, and extents. */
// @{
OutputImageParam output_buffer() const;
std::vector<OutputImageParam> output_buffers() const;
// @}
/** Use a Func as an argument to an external stage. */
operator ExternFuncArgument() const;
/** Infer the arguments to the Func, sorted into a canonical order:
* all buffers (sorted alphabetically by name), followed by all non-buffers
* (sorted alphabetically by name).
This lets you write things like:
\code
func.compile_to_assembly("/dev/stdout", func.infer_arguments());
\endcode
*/
std::vector<Argument> infer_arguments() const;
/** Get the source location of the pure definition of this
* Func. See Stage::source_location() */
std::string source_location() const;
/** Return the current StageSchedule associated with this initial
* Stage of this Func. For introspection only: to modify schedule,
* use the Func interface. */
const Internal::StageSchedule &get_schedule() const {
return Stage(*this).get_schedule();
}
};
namespace Internal {
template<typename Last>
inline void check_types(const Tuple &t, int idx) {
using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
user_assert(t[idx].type() == type_of<T>())
<< "Can't evaluate expression "
<< t[idx] << " of type " << t[idx].type()
<< " as a scalar of type " << type_of<T>() << "\n";
}
template<typename First, typename Second, typename... Rest>
inline void check_types(const Tuple &t, int idx) {
check_types<First>(t, idx);
check_types<Second, Rest...>(t, idx + 1);
}
template<typename Last>
inline void assign_results(Realization &r, int idx, Last last) {
using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
*last = Buffer<T>(r[idx])();
}
template<typename First, typename Second, typename... Rest>
inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
assign_results<First>(r, idx, first);
assign_results<Second, Rest...>(r, idx + 1, second, rest...);
}
} // namespace Internal
/** JIT-Compile and run enough code to evaluate a Halide
* expression. This can be thought of as a scalar version of
* \ref Func::realize */
template<typename T>
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e) {
user_assert(e.type() == type_of<T>())
<< "Can't evaluate expression "
<< e << " of type " << e.type()
<< " as a scalar of type " << type_of<T>() << "\n";
Func f;
f() = e;
Buffer<T, 0> im = f.realize(ctx);
return im();
}
/** evaluate with a default user context */
template<typename T>
HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
return evaluate<T>(nullptr, e);
}
/** JIT-compile and run enough code to evaluate a Halide Tuple. */
template<typename First, typename... Rest>
HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
Internal::check_types<First, Rest...>(t, 0);
Func f;
f() = t;
Realization r = f.realize(ctx);
Internal::assign_results(r, 0, first, rest...);
}
/** JIT-compile and run enough code to evaluate a Halide Tuple. */
template<typename First, typename... Rest>
HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest>(rest)...);
}
namespace Internal {
inline void schedule_scalar(Func f) {
Target t = get_jit_target_from_environment();
if (t.has_gpu_feature()) {
f.gpu_single_thread();
}
if (t.has_feature(Target::HVX)) {
f.hexagon();
}
}
} // namespace Internal
/** JIT-Compile and run enough code to evaluate a Halide
* expression. This can be thought of as a scalar version of
* \ref Func::realize. Can use GPU if jit target from environment
* specifies one.
*/
template<typename T>
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
user_assert(e.type() == type_of<T>())
<< "Can't evaluate expression "
<< e << " of type " << e.type()
<< " as a scalar of type " << type_of<T>() << "\n";
Func f;
f() = e;
Internal::schedule_scalar(f);
Buffer<T, 0> im = f.realize();
return im();
}
/** JIT-compile and run enough code to evaluate a Halide Tuple. Can
* use GPU if jit target from environment specifies one. */
// @{
template<typename First, typename... Rest>
HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
Internal::check_types<First, Rest...>(t, 0);
Func f;
f() = t;
Internal::schedule_scalar(f);
Realization r = f.realize();
Internal::assign_results(r, 0, first, rest...);
}
// @}
} // namespace Halide
#endif