#include "HalidePlugin.h" #include #include #include #include #include #include "Halide.h" namespace Halide { namespace Internal { using std::make_pair; using std::map; using std::pair; using std::set; using std::string; using std::vector; namespace { // Substitute parameter estimates into the exprs describing the box bounds. void substitute_estimates_box(Box &box) { box.used = substitute_var_estimates(box.used); for (auto &b : box.bounds) { b.min = substitute_var_estimates(b.min); b.max = substitute_var_estimates(b.max); } } // Substitute parameter estimates into the boxes in 'region'. void substitute_estimates_region(map ®ion) { for (auto &iter : region) { substitute_estimates_box(iter.second); } } // Return true if any of the box dimension is unbounded. bool is_box_unbounded(const Box &b) { for (size_t i = 0; i < b.size(); i++) { if (!b[i].is_bounded()) { return true; } } return false; } // Helper function to simplify the upper and lower bounds of each dimension of a box. void simplify_box(Box &b) { for (size_t i = 0; i < b.size(); i++) { b[i].min = simplify(b[i].min); b[i].max = simplify(b[i].max); } } // Helper function to merge the partial region map into the result region map. void merge_regions(map &result, const map &partial) { // Merge regions from 'partial' with an existing region in 'result'. for (const auto ® : partial) { auto iter = result.find(reg.first); if (iter == result.end()) { result.emplace(reg.first, reg.second); } else { merge_boxes(iter->second, reg.second); } } } // Replace all occurrences of non-alphanumeric chars in 'name' with '_'. string get_sanitized_name(string name) { if (isdigit(name[0])) { name = "_" + name; } for (size_t i = 0; i < name.size(); ++i) { if (!isalnum(name[i])) { name[i] = '_'; } } return name; } // Representation of a function stage in the pipeline. struct FStage { Function func; uint32_t stage_num; FStage(Function func, uint32_t stage_num) : func(std::move(func)), stage_num(stage_num) { } bool operator==(const FStage &other_stage) const { return (func.name() == other_stage.func.name()) && (stage_num == other_stage.stage_num); } bool operator<(const FStage &other_stage) const { return func.name() < other_stage.func.name() || ((func.name() == other_stage.func.name()) && (stage_num < other_stage.stage_num)); } friend std::ostream &operator<<(std::ostream &stream, const FStage &s) { if (s.stage_num == 0) { stream << s.func.name(); } else { stream << s.func.name() << ".update(" << (s.stage_num - 1) << ")"; } return stream; } }; // Check if all the pipeline outputs have estimates specified // on each of their dimensions; otherwise, throw an assertion. void check_estimates_on_outputs(const vector &outputs) { for (const auto &out : outputs) { const vector &estimates = out.schedule().estimates(); // Check if the estimate for each dimension of the output is available // and is an integer. If there are duplicates for the estimate of a // dimension, we only check the last defined estimate (which min and // extent values are defined) since it is the one that would be // eventually used. 
// Check if all the pipeline outputs have estimates specified
// on each of their dimensions; otherwise, throw an assertion.
void check_estimates_on_outputs(const vector<Function> &outputs) {
    for (const auto &out : outputs) {
        const vector<Bound> &estimates = out.schedule().estimates();
        // Check if the estimate for each dimension of the output is available
        // and is an integer. If there are duplicates for the estimate of a
        // dimension, we only check the last defined estimate (for which the min
        // and extent values are defined) since it is the one that would be
        // eventually used.
        Bound est;
        for (const auto &arg : out.args()) {
            bool found = false;
            for (int i = (int)estimates.size() - 1; i >= 0; --i) {
                if ((estimates[i].var == arg) && estimates[i].min.defined() &&
                    estimates[i].extent.defined()) {
                    found = true;
                    est = estimates[i];
                    break;
                }
            }
            user_assert(found && est.min.type().is_int() && est.extent.type().is_int())
                << "Please provide a valid estimate for dimension "
                << arg << " of output \"" << out.name() << "\"\n";
        }
    }
}

struct DependenceAnalysis {
    // Map containing all the functions in the pipeline.
    map<string, Function> env;
    vector<string> order;
    FuncValueBounds func_val_bounds;

    struct RegionsRequiredQuery {
        string f;
        int stage;
        set<string> prods;
        bool only_regions_computed;

        RegionsRequiredQuery(const string &f, int stage, const set<string> &prods,
                             bool only_regions_computed)
            : f(f), stage(stage), prods(prods),
              only_regions_computed(only_regions_computed) {
        }

        bool operator==(const RegionsRequiredQuery &other) const {
            return (f == other.f) && (stage == other.stage) && (prods == other.prods) &&
                   (only_regions_computed == other.only_regions_computed);
        }
        bool operator<(const RegionsRequiredQuery &other) const {
            if (f < other.f) {
                return true;
            } else if (f > other.f) {
                return false;
            }
            if (stage < other.stage) {
                return true;
            } else if (stage > other.stage) {
                return false;
            }
            if (only_regions_computed < other.only_regions_computed) {
                return true;
            } else if (only_regions_computed > other.only_regions_computed) {
                return false;
            }
            return prods < other.prods;
        }
    };
    struct RegionsRequired {
        DimBounds bounds;
        // Regions required to compute 'bounds' given a particular
        // RegionsRequiredQuery.
        map<string, Box> regions;
        RegionsRequired(const DimBounds &b, const map<string, Box> &r)
            : bounds(b), regions(r) {
        }
    };
    // Cache for bounds queries (bound queries with the same parameters are
    // common during the grouping process).
    map<RegionsRequiredQuery, vector<RegionsRequired>> regions_required_cache;

    DependenceAnalysis(const map<string, Function> &env, const vector<string> &order,
                       const FuncValueBounds &func_val_bounds)
        : env(env), order(order), func_val_bounds(func_val_bounds) {
    }

    // Return the regions of the producers ('prods') required to compute the region
    // of the function stage ('f', 'stage_num') specified by 'bounds'. When
    // 'only_regions_computed' is set to true, this only returns the computed
    // regions and not the total allocated regions.
    map<string, Box> regions_required(const Function &f, int stage_num,
                                      const DimBounds &bounds,
                                      const set<string> &prods,
                                      bool only_regions_computed,
                                      const Scope<Interval> *input_estimates);

    // Return the regions of the producers ('prods') required to compute the region
    // of the function specified by 'pure_bounds'. When 'only_regions_computed'
    // is set to true, this only returns the computed regions and not the total
    // allocated regions.
    map<string, Box> regions_required(const Function &f,
                                      const DimBounds &pure_bounds,
                                      const set<string> &prods,
                                      bool only_regions_computed,
                                      const Scope<Interval> *input_estimates);

    // Return redundantly computed regions of producers ('prods') while computing
    // a region of the function stage ('f', 'stage_num') specified by 'bounds'.
    // 'var' is the dimension along which redundant computation is accounted for.
    // When 'only_regions_computed' is set to true, this only returns the computed
    // regions and not the total allocated regions.
    map<string, Box> redundant_regions(const Function &f, int stage_num,
                                       const string &var,
                                       const DimBounds &bounds,
                                       const set<string> &prods,
                                       bool only_regions_computed,
                                       const Scope<Interval> *input_estimates);

    // Return overlapping regions of producers ('prods') while computing a function
    // stage along each of the dimensions.
    vector<map<string, Box>> overlap_regions(const Function &f, int stage_num,
                                             const DimBounds &bounds,
                                             const set<string> &prods,
                                             bool only_regions_computed,
                                             const Scope<Interval> *input_estimates);
};
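// Illustrative example (hypothetical functions, not tied to any particular
// pipeline): for a stencil consumer g(x) = f(x - 1) + f(x + 1), querying the
// pure-bounds overload below on g with x in [0, 99] and prods = {"f"} reports
// that f is required over [-1, 100], i.e. the consumer's interval dilated by
// the stencil radius on each side.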
// Return the regions of the producers ('prods') required to compute the region
// of the function specified by 'pure_bounds'.
map<string, Box>
DependenceAnalysis::regions_required(const Function &f,
                                     const DimBounds &pure_bounds,
                                     const set<string> &prods,
                                     bool only_regions_computed,
                                     const Scope<Interval> *input_estimates) {
    // Find the regions required for each stage and merge them.
    map<string, Box> regions;
    int num_stages = f.updates().size() + 1;

    for (int s = 0; s < num_stages; s++) {
        DimBounds bounds = get_stage_bounds(f, s, pure_bounds);
        map<string, Box> stage_regions =
            regions_required(f, s, bounds, prods, only_regions_computed, input_estimates);

        merge_regions(regions, stage_regions);
    }
    return regions;
}

struct StageBounds {
    FStage f_stage;
    DimBounds bounds;

    StageBounds(const FStage &fs, const DimBounds &b)
        : f_stage(fs), bounds(b) {
    }
    StageBounds(Function func, uint32_t stage_num, const DimBounds &b)
        : f_stage(FStage(std::move(func), stage_num)), bounds(b) {
    }

    bool operator==(const StageBounds &other) const {
        return (f_stage == other.f_stage) && (bounds == other.bounds);
    }
    bool operator<(const StageBounds &other) const {
        return (f_stage < other.f_stage) ||
               ((f_stage == other.f_stage) && (bounds.size() < other.bounds.size()));
    }
};
// Helper function to queue regions that need to be traversed. 'fs_bounds' is
// the queue into which the regions specified by 'prod_func' and 'region'
// will be added.
void queue_func_regions(map<FStage, DimBounds> &fs_bounds,
                        const Function &prod_func, const Box &region,
                        const set<StageBounds> &visited) {
    DimBounds prod_pure_bounds;
    const vector<string> &args = prod_func.args();

    internal_assert(region.size() == args.size());

    // The region only specifies the extent of each dimension
    // by position. Populate a map which is keyed by name.
    for (size_t v = 0; v < args.size(); v++) {
        prod_pure_bounds[args[v]] = region[v];
    }

    // Get the bounds of all stages in a function from the
    // bounds on the pure dimensions.
    vector<DimBounds> prod_bounds = get_stage_bounds(prod_func, prod_pure_bounds);

    size_t num_stages = prod_func.updates().size() + 1;

    internal_assert(prod_bounds.size() == num_stages);

    // Add all stages of a function into the queue.
    for (size_t prod_s = 0; prod_s < num_stages; prod_s++) {
        StageBounds sb(prod_func, prod_s, prod_bounds[prod_s]);
        if (visited.find(sb) == visited.end()) {
            auto iter = fs_bounds.find(sb.f_stage);
            if (iter == fs_bounds.end()) {
                fs_bounds.emplace(sb.f_stage, sb.bounds);
            } else {
                for (const auto &b : sb.bounds) {
                    DimBounds &curr_bounds = iter->second;
                    auto b_iter = curr_bounds.find(b.first);
                    if (b_iter == curr_bounds.end()) {
                        curr_bounds.emplace(b.first, b.second);
                    } else {
                        if (b_iter->second.has_lower_bound() && b.second.has_lower_bound()) {
                            b_iter->second.min =
                                simplify(Interval::make_min(b_iter->second.min, b.second.min));
                        } else {
                            b_iter->second.min = Interval::neg_inf();
                        }

                        if (b_iter->second.has_upper_bound() && b.second.has_upper_bound()) {
                            b_iter->second.max =
                                simplify(Interval::make_max(b_iter->second.max, b.second.max));
                        } else {
                            b_iter->second.max = Interval::pos_inf();
                        }
                    }
                }
            }
        }
    }
}

// Helper function for merging 'curr_regions' into the global map of regions
// and adding them to the queue of regions that need to be traversed.
// 'prods' is the set of producer functions that are under consideration.
void merge_and_queue_regions(map<FStage, DimBounds> &fs_bounds,
                             map<string, Box> &regions,
                             map<string, Box> &curr_regions,
                             const set<string> &prods,
                             const map<string, Function> &env,
                             bool only_regions_computed,
                             const string &curr_func_name,
                             const set<StageBounds> &visited) {
    for (const auto &reg : curr_regions) {
        // Merge region with an existing region of a function in the
        // global map. Do not merge the parent function itself to the region
        // when querying only for the values computed.
        if (!only_regions_computed || (only_regions_computed && (reg.first != curr_func_name))) {
            auto iter = regions.find(reg.first);
            if (iter == regions.end()) {
                regions.emplace(reg.first, reg.second);
            } else {
                merge_boxes(iter->second, reg.second);
            }
        }

        // Skip adding the current region into the queue if the function
        // is not in 'prods'.
        if (prods.find(reg.first) == prods.end()) {
            continue;
        }

        const auto &it = env.find(reg.first);
        if ((it != env.end()) && (reg.first != curr_func_name)) {
            // Add all stages of the function representing the
            // region into the queue.
            queue_func_regions(fs_bounds, it->second, reg.second, visited);
        }
    }
}
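// The two helpers above drive a worklist to a fixed point: regions_required()
// below seeds 'fs_bounds' with the query stage, and each dequeued stage may
// enqueue its producers with union-merged bounds, so a stage reachable through
// several consumers is re-examined with widened bounds rather than once per
// path. The 'visited' set keeps already-processed (stage, bounds) pairs from
// being queued again.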
// Return the regions of the producers ('prods') required to compute the region
// of the function stage ('f', 'stage_num') specified by 'bounds'.
map<string, Box>
DependenceAnalysis::regions_required(const Function &f, int stage_num,
                                     const DimBounds &bounds,
                                     const set<string> &prods,
                                     bool only_regions_computed,
                                     const Scope<Interval> *input_estimates) {
    // Iteratively compute the required regions by traversing the chain
    // of dependencies.

    // Check the cache if we've already computed this previously.
    RegionsRequiredQuery query(f.name(), stage_num, prods, only_regions_computed);
    const auto &iter = regions_required_cache.find(query);
    if (iter != regions_required_cache.end()) {
        const auto &it = std::find_if(iter->second.begin(), iter->second.end(),
                                      [&bounds](const RegionsRequired &r) { return (r.bounds == bounds); });
        if (it != iter->second.end()) {
            internal_assert((iter->first == query) && (it->bounds == bounds));
            return it->regions;
        }
    }

    // Map of all the required regions.
    map<string, Box> regions;
    map<FStage, DimBounds> fs_bounds;
    set<StageBounds> visited;

    // Add the query function and its region to the queue.
    fs_bounds.emplace(FStage(f, stage_num), bounds);

    while (!fs_bounds.empty()) {
        for (int i = (int)order.size() - 1; i >= 0; --i) {
            const Function &f = env.find(order[i])->second;
            int num_stages = f.updates().size() + 1;
            for (int stage_num = 0; stage_num < num_stages; ++stage_num) {
                FStage s(f, stage_num);

                const auto &iter = fs_bounds.find(s);
                if (iter == fs_bounds.end()) {
                    continue;
                }

                DimBounds curr_bounds = iter->second;
                visited.insert(StageBounds(s, curr_bounds));

                // Scope for containing all the estimates on parameters and intervals.
                Scope<Interval> curr_scope;
                curr_scope.set_containing_scope(input_estimates);

                // If the function has an extern definition, there is no visibility into
                // the expression defining the function. So the regions required will be
                // the entire domain of the inputs to the extern func. Use the estimates
                // on the inputs to the extern function if available.
                //
                // TODO: Query the extern function for bounds of the functions which it
                // depends on. This can be done by calling the extern func in the
                // bounds query mode.
                if (s.func.has_extern_definition()) {
                    for (const ExternFuncArgument &arg : s.func.extern_arguments()) {
                        if (arg.is_func()) {
                            // If the argument is an entire function, the bounds of the
                            // function required are unknown. Create an infinite region
                            // of the correct dimension, update the region map, and
                            // add it to the queue.
                            string prod_name = Function(arg.func).name();
                            const Function &prod_func = get_element(env, prod_name);
                            map<string, Box> prod_reg;
                            const vector<string> &args = prod_func.args();
                            for (size_t v = 0; v < args.size(); v++) {
                                prod_reg[prod_name].push_back(Interval());
                            }
                            merge_and_queue_regions(fs_bounds, regions, prod_reg, prods, env,
                                                    only_regions_computed, s.func.name(), visited);
                        } else if (arg.is_expr()) {
                            // Find the boxes required for the expression and add the regions
                            // to the queue.
                            Expr subs_arg = substitute_var_estimates(arg.expr);
                            map<string, Box> arg_regions =
                                boxes_required(subs_arg, curr_scope, func_val_bounds);

                            substitute_estimates_region(arg_regions);
                            merge_and_queue_regions(fs_bounds, regions, arg_regions, prods, env,
                                                    only_regions_computed, s.func.name(), visited);
                        } else if (arg.is_image_param() || arg.is_buffer()) {
                            // If the argument is an image or a buffer, the required
                            // bounds are unknown. Create an infinite region of the
                            // correct dimension and update the region map.
                            Buffer<> buf;
                            if (arg.is_image_param()) {
                                buf = arg.image_param.buffer();
                            } else {
                                buf = arg.buffer;
                            }
                            map<string, Box> buf_reg;
                            for (int v = 0; v < buf.dimensions(); v++) {
                                buf_reg[buf.name()].push_back(Interval());
                            }
                            merge_regions(regions, buf_reg);
                        }
                    }
                } else {
                    Definition def = get_stage_definition(s.func, s.stage_num);
                    const vector<Dim> &dims = def.schedule().dims();

                    // Substitute parameter estimates into the bounds and add them to the
                    // current scope.
                    for (int d = 0; d < (int)dims.size() - 1; d++) {
                        Interval simple_bounds = get_element(curr_bounds, dims[d].var);
                        simple_bounds.min = substitute_var_estimates(simple_bounds.min);
                        simple_bounds.max = substitute_var_estimates(simple_bounds.max);
                        curr_scope.push(dims[d].var, simple_bounds);
                    }

                    // Find the regions required for each value of the current function stage,
                    // update the region map, and add them to the queue.
                    for (const auto &val : def.values()) {
                        // Substitute the parameter estimates into the expression and get
                        // the regions required for the expression.
                        Expr subs_val = substitute_var_estimates(val);
                        map<string, Box> curr_regions =
                            boxes_required(subs_val, curr_scope, func_val_bounds);
                        substitute_estimates_region(curr_regions);

                        // Arguments to the definition may require regions of functions.
                        // For example, update definitions in histograms where the bin is
                        // based on the value of a function.
                        Box left_reg;
                        for (const Expr &arg : def.args()) {
                            Expr subs_arg = substitute_var_estimates(arg);
                            map<string, Box> arg_regions =
                                boxes_required(subs_arg, curr_scope, func_val_bounds);
                            substitute_estimates_region(arg_regions);

                            // Merge the regions with the regions found while looking at
                            // the values.
                            merge_regions(curr_regions, arg_regions);

                            Interval arg_bounds =
                                bounds_of_expr_in_scope(arg, curr_scope, func_val_bounds);
                            left_reg.push_back(arg_bounds);
                        }

                        auto iter_curr = curr_regions.find(s.func.name());
                        if (iter_curr == curr_regions.end()) {
                            curr_regions.emplace(s.func.name(), left_reg);
                        } else {
                            merge_boxes(iter_curr->second, left_reg);
                        }

                        // Update the region map, and add 'curr_regions' to the queue.
                        merge_and_queue_regions(fs_bounds, regions, curr_regions, prods, env,
                                                only_regions_computed, s.func.name(), visited);
                    }
                }

                // Remove the processed region from the queue.
                fs_bounds.erase(iter);
            }
        }
    }
    // Simplify the bounds on each region and substitute global pipeline
    // bounds for function regions whose lower and upper bounds could not be
    // determined.
    map<string, Box> concrete_regions;
    for (auto &f_reg : regions) {
        simplify_box(f_reg.second);

        Box concrete_box;
        for (size_t i = 0; i < f_reg.second.size(); i++) {
            Expr lower = f_reg.second[i].min;
            Expr upper = f_reg.second[i].max;

            auto iter = env.find(f_reg.first);
            bool in_env = (iter != env.end());

            if (!lower.as<IntImm>() && in_env) {
                const Function &curr_f = iter->second;
                for (const auto &b : curr_f.schedule().estimates()) {
                    size_t num_pure_args = curr_f.args().size();
                    if ((i < num_pure_args) && (b.var == curr_f.args()[i])) {
                        lower = b.min;
                    }
                }
            }

            if (!upper.as<IntImm>() && in_env) {
                const Function &curr_f = iter->second;
                for (const auto &b : curr_f.schedule().estimates()) {
                    size_t num_pure_args = curr_f.args().size();
                    if ((i < num_pure_args) && (b.var == curr_f.args()[i])) {
                        const IntImm *bmin = b.min.as<IntImm>();
                        const IntImm *bextent = b.extent.as<IntImm>();
                        upper = IntImm::make(Int(32), bmin->value + bextent->value - 1);
                    }
                }
            }

            Interval concrete_bounds = Interval(lower, upper);
            concrete_box.push_back(concrete_bounds);
        }
        concrete_regions[f_reg.first] = concrete_box;
    }

    regions_required_cache[query].push_back(RegionsRequired(bounds, concrete_regions));
    return concrete_regions;
}

// Return redundantly computed regions of producers ('prods') while computing a
// region of the function stage ('f', 'stage_num') specified by 'bounds'. 'var'
// is the dimension along which redundant computation is accounted for.
map<string, Box>
DependenceAnalysis::redundant_regions(const Function &f, int stage_num, const string &var,
                                      const DimBounds &bounds,
                                      const set<string> &prods,
                                      bool only_regions_computed,
                                      const Scope<Interval> *input_estimates) {
    // Find the regions required to compute the region of 'f' specified
    // by 'bounds'.
    map<string, Box> regions = regions_required(
        f, stage_num, bounds, prods, only_regions_computed, input_estimates);

    // Shift the bounds by the size of the interval along the direction
    // of var.
    DimBounds shifted_bounds;

    for (const auto &b : bounds) {
        if (b.first == var) {
            Expr len = b.second.max - b.second.min + 1;
            Interval bound = Interval(b.second.min + len, b.second.max + len);
            shifted_bounds[b.first] = bound;
        } else {
            shifted_bounds[b.first] = b.second;
        }
    }

    // Find the regions required to compute the region of f specified
    // by shifted_bounds.
    map<string, Box> regions_shifted = regions_required(
        f, stage_num, shifted_bounds, prods, only_regions_computed, input_estimates);

    // Compute the overlaps between 'regions_shifted' and the original
    // regions required.
    map<string, Box> overlaps;
    for (const auto &reg : regions) {
        auto iter = regions_shifted.find(reg.first);
        if (iter == regions_shifted.end()) {
            // It would be interesting to log the cases where this actually
            // happens, i.e. the shifted regions do not contain a function that
            // was there in the original regions.
            continue;
        }
        const Box &b = reg.second;
        const Box &b_shifted = iter->second;
        // The boxes should be of the same size.
        internal_assert(b.size() == b_shifted.size());

        Box b_intersect;
        for (uint32_t i = 0; i < b.size(); i++) {
            b_intersect.push_back(Interval::make_intersection(b[i], b_shifted[i]));
        }
        // A function should appear once in the regions and therefore cannot
        // already be present in the overlaps map.
        internal_assert(overlaps.find(reg.first) == overlaps.end());
        overlaps.emplace(reg.first, b_intersect);
    }

    // Simplify the bounds of each of the overlap regions.
    for (auto &f : overlaps) {
        simplify_box(f.second);
    }

    return overlaps;
}
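// Worked example of the shift-and-intersect logic above (hypothetical
// functions): for a 3-point stencil b(x) = f(x - 1) + f(x) + f(x + 1)
// computed over x in [0, 7], the shifted query covers x in [8, 15]; f is then
// required over [-1, 8] and [7, 16] respectively, so the intersection [7, 8]
// (2 points per tile boundary) is the redundant recomputation attributed to
// tiling along x with extent 8.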
// Return overlapping regions of producers ('prods') while computing a function
// stage along each of the dimensions.
vector<map<string, Box>>
DependenceAnalysis::overlap_regions(const Function &f, int stage_num,
                                    const DimBounds &bounds,
                                    const set<string> &prods,
                                    bool only_regions_computed,
                                    const Scope<Interval> *input_estimates) {
    vector<map<string, Box>> conc_overlaps;

    const vector<Dim> &dims = get_stage_dims(f, stage_num);

    // Get the redundant regions along each dimension of f.
    for (int d = 0; d < (int)dims.size() - 1; d++) {
        map<string, Box> conc_reg = redundant_regions(f, stage_num, dims[d].var, bounds,
                                                      prods, only_regions_computed, input_estimates);
        conc_overlaps.push_back(conc_reg);
    }
    return conc_overlaps;
}

// Return the regions of each function required for computing the
// outputs of the pipeline.
map<string, Box> get_pipeline_bounds(DependenceAnalysis &analysis,
                                     const vector<Function> &outputs,
                                     const Scope<Interval> *input_estimates) {
    map<string, Box> pipeline_bounds;

    // Find the regions required for each of the outputs and merge them
    // to compute the full pipeline_bounds.
    for (const auto &out : outputs) {
        DimBounds pure_bounds;
        Box out_box;
        // Use the estimates on the output for determining the output bounds.
        // If there are duplicates, use the most recent estimate.
        const auto &estimates = out.schedule().estimates();
        for (const auto &arg : out.args()) {
            int i;
            for (i = (int)estimates.size() - 1; i >= 0; --i) {
                const auto &est = estimates[i];
                if ((est.var == arg) && est.min.defined() && est.extent.defined()) {
                    Interval in = Interval(est.min, simplify(est.min + est.extent - 1));
                    pure_bounds.emplace(arg, in);
                    out_box.push_back(in);
                    break;
                }
            }
            internal_assert(i >= 0) << "Could not find estimate for " << arg << "\n";
        }

        set<string> prods;
        for (const auto &fpair : analysis.env) {
            prods.insert(fpair.first);
        }

        map<string, Box> regions = analysis.regions_required(out, pure_bounds, prods,
                                                             false, input_estimates);

        // Add the output region to the pipeline bounds as well.
        regions.emplace(out.name(), out_box);

        merge_regions(pipeline_bounds, regions);
    }

    return pipeline_bounds;
}
struct AutoSchedule {
    struct Stage {
        string function;
        size_t stage;

        Stage(const string &f, size_t s)
            : function(f), stage(s) {
        }

        bool operator==(const Stage &other) const {
            return (function == other.function) && (stage == other.stage);
        }
        bool operator<(const Stage &other) const {
            return (function < other.function) ||
                   ((function == other.function) && (stage < other.stage));
        }
    };

    const map<string, Function> &env;

    // Contain maps from function name to the topological order of the pipeline.
    map<string, size_t> topological_order;

    // Cache for storing all internal vars/rvars that have been declared during
    // the course of schedule generation, to ensure that we don't introduce any
    // duplicates in the string representation of the schedules.
    map<string, VarOrRVar> internal_vars;

    // Store the list of schedules applied to some function stages (the most recent
    // schedule is placed last in the list).
    map<string, map<int, vector<string>>> func_schedules;

    // Store the list of vars/rvars used in the schedule applied to some
    // function stages.
    map<string, map<int, set<string>>> used_vars;

    AutoSchedule(const map<string, Function> &env, const vector<string> &order)
        : env(env) {
        for (size_t i = 0; i < order.size(); ++i) {
            topological_order.emplace(order[i], i);
        }
        // Allocate a slot in 'used_vars' for each function stage in the pipeline.
        for (const auto &iter : env) {
            for (size_t i = 0; i < iter.second.updates().size() + 1; ++i) {
                used_vars[iter.first][i];
            }
        }
    }

    // Given a function name, return a string representation of getting the
    // function handle.
    string get_func_handle(const string &name) const {
        size_t index = get_element(topological_order, name);
        return "pipeline.get_func(" + std::to_string(index) + ")";
    }

    friend std::ostream &operator<<(std::ostream &stream, const AutoSchedule &sched) {
        for (const auto &iter : sched.internal_vars) {
            if (iter.second.is_rvar) {
                stream << "RVar ";
            } else {
                stream << "Var ";
            }
            stream << iter.first << "(\"" << iter.first << "\");\n";
        }
        stream << "\n";

        // Declare all the functions + schedules.
        std::ostringstream func_ss;
        std::ostringstream schedule_ss;

        for (const auto &f : sched.func_schedules) {
            const string &fname = get_sanitized_name(f.first);
            func_ss << "Func " << fname << " = " << sched.get_func_handle(f.first) << ";\n";

            schedule_ss << "{\n";

            // Declare all the Vars and RVars that are actually used in the schedule.
            const Function &func = get_element(sched.env, f.first);
            for (size_t i = 0; i < func.args().size(); ++i) {
                if (sched.used_vars.at(func.name()).at(0).find(func.args()[i]) !=
                    sched.used_vars.at(func.name()).at(0).end()) {
                    schedule_ss << "    Var " << func.args()[i] << " = "
                                << fname << ".args()[" << i << "];\n";
                }
            }
            set<string> declared_rvars;
            for (size_t i = 0; i < func.updates().size(); ++i) {
                const vector<ReductionVariable> &rvars = func.updates()[i].schedule().rvars();
                const set<string> &var_list = sched.used_vars.at(func.name()).at(i + 1);
                for (size_t j = 0; j < rvars.size(); ++j) {
                    if ((var_list.find(rvars[j].var) == var_list.end()) ||
                        (declared_rvars.find(rvars[j].var) != declared_rvars.end())) {
                        continue;
                    }
                    declared_rvars.insert(rvars[j].var);
                    schedule_ss << "    RVar " << rvars[j].var << "("
                                << fname << ".update(" << i << ").get_schedule().rvars()[" << j << "].var);\n";
                }
            }

            for (const auto &s : f.second) {
                internal_assert(!s.second.empty());
                schedule_ss << "    " << fname;
                if (s.first > 0) {
                    schedule_ss << ".update(" << std::to_string(s.first - 1) << ")";
                }
                for (size_t i = 0; i < s.second.size(); ++i) {
                    schedule_ss << "\n        ." << s.second[i];
                }
                schedule_ss << ";\n";
            }
            schedule_ss << "}\n";
        }

        stream << func_ss.str() << "\n";
        stream << schedule_ss.str() << "\n";
        return stream;
    }

    void push_schedule(const string &stage_name, size_t stage_num,
                       const string &sched, const set<string> &vars) {
        vector<string> v = split_string(stage_name, ".");
        internal_assert(!v.empty());

        used_vars[v[0]][stage_num].insert(vars.begin(), vars.end());

        // If the previous schedule applied is the same as this one,
        // there is no need to re-apply the schedule.
        auto &schedules = func_schedules[v[0]][stage_num];
        if (schedules.empty()) {
            schedules.push_back(sched);
        } else {
            if (schedules[schedules.size() - 1] != sched) {
                schedules.push_back(sched);
            }
        }
    }
};
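// For reference, operator<< above serializes the auto-generated schedule as a
// compilable C++ snippet. A representative (hypothetical) excerpt of the
// emitted text:
//
//     Var x_i("x_i");
//     Var x_o("x_o");
//
//     Func blur = pipeline.get_func(2);
//     {
//         Var x = blur.args()[0];
//         blur
//             .split(x, x_o, x_i, 64)
//             .parallel(x_o)
//             .vectorize(x_i);
//     }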
// Implement the grouping algorithm and the cost model for making the grouping
// choices.
struct Partitioner {
    // GroupingChoice encodes the grouping of the 'prod' function into the 'cons' stage.
    struct GroupingChoice {
        string prod;
        FStage cons;

        GroupingChoice(const string &prod, const FStage &cons)
            : prod(prod), cons(cons) {
        }

        bool operator==(const GroupingChoice &other) const {
            return (prod == other.prod) && (cons == other.cons);
        }

        bool operator<(const GroupingChoice &other) const {
            return (prod < other.prod) || ((prod == other.prod) && (cons < other.cons));
        }

        friend std::ostream &operator<<(std::ostream &stream, const GroupingChoice &choice) {
            stream << "Choice: " << choice.prod << " -> " << choice.cons << "\n";
            return stream;
        }
    };

    // A group is a sub-pipeline with a single output. Members of a group are
    // either inlined into the consumer functions within the group or computed
    // at tiles of the output, specified by 'tile_sizes'.
    //
    // TODO: The restriction of computing either at the inline or tile level
    // makes the space of scheduling choices for a group very tractable.
    // However, the restriction might miss good schedules which can only be
    // realized by computing the members of the group at different levels of
    // the group.
    //
    // There are two approaches to extend the space of schedules considered:
    // 1) Recursive grouping: Treat the problem of determining the compute levels
    // within a group as a smaller instance of the grouping problem, with
    // different parameters for the input, output sizes, and cache model.
    //
    // 2) Tightening: Always compute a function at the lowest level possible
    // without introducing redundant work. This is a restricted form of recursive
    // grouping which does not explore the trade-off between redundant work and
    // locality.
    //
    // Either approach can be implemented as a post process for each group
    // after the initial grouping process finishes. The cost model may
    // already make sub-optimal higher-level partitioning decisions when it is
    // not aware of the benefits of the post processing. However, it should
    // strictly be an improvement over the initial grouping. As a first step,
    // it is good to make it a post process.
    //
    // Incorporating the recursive grouping process into the cost model can be
    // tricky and can potentially make the cost of analyzing a group
    // prohibitive, as it requires solving smaller instances of the grouping
    // problem for analyzing each configuration. On the other hand, tightening
    // can be integrated into the cost model without significantly increasing
    // the time to analyze a grouping configuration.
    //
    // TODO: Add sliding window optimizations. For a start, it may be enough to
    // implement sliding window as a post-pass by moving the store level of all
    // the members of the group to the outermost serial loop. This could possibly
    // be incorporated in the cost model with some effort. Line-buffering
    // presents additional challenges for this post-processing strategy though.
    // A typical line-buffer would use a terrible tile size for tiling, but its
    // performance will improve significantly once sliding window is turned on.
    //
    // TODO: Register tiling is an important transformation, especially for
    // benchmarks with significant reuse of the data (like matrix multiply and
    // convolutional layers). The mechanism for realizing register tiling is to
    // completely unroll small tiles of the innermost kernels. Unrolling
    // interacts with vectorization, storage layout, and depends on the outer
    // level tiling.
    struct Group {
        // The output stage representing the group.
        FStage output;
        // Functions that belong to the group.
        vector<FStage> members;
        // Members of the group which are inlined.
        set<string> inlined;
        // Tile sizes along dimensions of the output function of the group.
        map<string, Expr> tile_sizes;

        Group(const FStage &output, const vector<FStage> &members)
            : output(output), members(members) {
        }

        friend std::ostream &operator<<(std::ostream &stream, const Group &g) {
            stream << "Output FStage: " << g.output << "\n"
                   << "Members: {";
            for (size_t i = 0; i < g.members.size(); ++i) {
                if (i > 0) {
                    stream << ", ";
                }
                stream << g.members[i];
            }
            stream << "}\n";

            stream << "Inlined: {";
            for (auto iter = g.inlined.begin(); iter != g.inlined.end(); ++iter) {
                if (std::distance(g.inlined.begin(), iter) > 0) {
                    stream << ", ";
                }
                stream << *iter;
            }
            stream << "}\n";

            stream << "Tile sizes: {";
            for (auto iter = g.tile_sizes.begin(); iter != g.tile_sizes.end(); ++iter) {
                if (std::distance(g.tile_sizes.begin(), iter) > 0) {
                    stream << ", ";
                }
                stream << "(" << iter->first << ", " << iter->second << ")";
            }
            stream << "}\n";

            return stream;
        }
    };

    // Result of the analysis of a group.
    struct GroupAnalysis {
        // Estimate of the arithmetic and memory cost for computing the group.
        Cost cost;
        // Estimate of the parallelism that can be exploited while computing
        // the group.
        Expr parallelism;

        GroupAnalysis()
            : cost(Cost()), parallelism(Expr()) {
        }
        GroupAnalysis(const Cost &c, Expr p)
            : cost(c), parallelism(std::move(p)) {
        }

        inline bool defined() const {
            return cost.defined() && parallelism.defined();
        }

        void simplify() {
            cost.simplify();
            if (parallelism.defined()) {
                parallelism = Internal::simplify(parallelism);
            }
        }

        friend std::ostream &operator<<(std::ostream &stream, const GroupAnalysis &analysis) {
            stream << "[arith cost:" << analysis.cost.arith << ", "
                   << "memory cost:" << analysis.cost.memory << ", "
                   << "parallelism:" << analysis.parallelism << "]\n";
            return stream;
        }
    };

    // Configuration of a group and the corresponding analysis. A group is the
    // set of functions that are computed together in tiles and the group config
    // specifies at what granularity they are computed together ('tile_sizes').
    struct GroupConfig {
        map<string, Expr> tile_sizes;
        GroupAnalysis analysis;
        GroupConfig(const map<string, Expr> &tile_sizes, const GroupAnalysis &analysis)
            : tile_sizes(tile_sizes), analysis(analysis) {
        }
        GroupConfig()
            : tile_sizes(map<string, Expr>()), analysis(GroupAnalysis()) {
        }
    };
    // Cache for storing the best configuration for a grouping choice. During
    // the grouping process, the impact of grouping two groups together is
    // limited to the producers and consumers of the groups that are being grouped
    // together. The best grouping choices for the rest of the pipeline need not be
    // re-evaluated, and caching them improves performance significantly.
    map<GroupingChoice, GroupConfig> grouping_cache;

    // Each group in the pipeline has a single output stage. A group is comprised
    // of function stages that are computed together in tiles (stages of a function
    // are always grouped together). 'groups' is the mapping from the output stage
    // of the group to the group.
    map<FStage, Group> groups;

    // The child stages of each stage (i.e. stages that depend on or use the values
    // computed by a particular stage) in the pipeline.
    map<FStage, set<FStage>> children;

    // Map from the output stage of the group to the analysis of the group. The mapping
    // needs to be updated whenever the grouping changes.
    map<FStage, GroupAnalysis> group_costs;

    // Levels that are targeted by the grouping algorithm. In the 'Inline' mode, the grouping
    // algorithm groups the functions by inlining the expression for the producer function
    // into the consumer stage. In the 'FastMem' mode, the grouping is done at the level of
    // tiles of the group output stage.
    enum class Level { Inline,
                       FastMem };

    // Bounds of each function stage in the pipeline. These bounds are inferred from the
    // estimates of the outputs and other functions in the pipeline.
    const map<string, Box> &pipeline_bounds;

    // Parameters of the machine model that is used for estimating the cost of each
    // group in the pipeline.
    const MachineParams &arch_params;

    // Dependency analysis of the pipeline. This supports queries on regions
    // accessed and computed for producing some regions of some functions.
    DependenceAnalysis &dep_analysis;

    // The arithmetic and memory costs of evaluating the expressions which define
    // each function in the pipeline.
    RegionCosts &costs;

    // Output functions of the pipeline.
    const vector<Function> &outputs;

    Partitioner(const map<string, Box> &_pipeline_bounds,
                const MachineParams &_arch_params,
                const vector<Function> &_outputs,
                DependenceAnalysis &_dep_analysis,
                RegionCosts &_costs);

    void initialize_groups();

    // Merge 'prod_group' into 'cons_group'. The output stage of 'cons_group'
    // will be the output stage of the merged group.
    Group merge_groups(const Group &prod_group, const Group &cons_group);

    // Merge 'prods' in 'choice' into 'cons'. Set the tile size of the new group
    // to the one specified by 'eval'. If 'level' is set to Inline, all members
    // of 'prods' will be inlined in the new group.
    void merge_groups(const GroupingChoice &choice, const GroupConfig &eval,
                      Partitioner::Level level);

    // Given a grouping 'g', compute the estimated cost (arithmetic + memory) and
    // parallelism that can potentially be exploited when computing that group.
    GroupAnalysis analyze_group(const Group &g, bool show_analysis);

    // For each group in the partition, return the regions of the producers
    // that need to be allocated to compute a tile of the group's output.
    map<FStage, map<string, Box>> group_storage_bounds();

    // For each group in the partition, return the regions of the producers
    // required to compute a tile of the group's output.
    map<FStage, map<FStage, DimBounds>> group_loop_bounds();

    // Partition the pipeline by iteratively merging groups until a fixpoint is
    // reached.
    void group(Partitioner::Level level);

    // Given a grouping choice, return a configuration for the group that gives
    // the highest estimated benefits.
    GroupConfig evaluate_choice(const GroupingChoice &group, Partitioner::Level level);

    // Pick the best choice among all the grouping options currently available. Uses
    // the cost model to estimate the benefit of each choice. This returns a vector of
    // choice and configuration pairs which describe the best grouping choice.
    vector<pair<GroupingChoice, GroupConfig>>
    choose_candidate_grouping(const vector<pair<string, string>> &cands,
                              Partitioner::Level level);
    // Return the bounds required to produce a function stage.
    DimBounds get_bounds(const FStage &stg);

    // Return the bounds required to produce a tile of a function stage.
    DimBounds get_bounds_from_tile_sizes(const FStage &stg,
                                         const map<string, Expr> &tile_sizes);

    // Return the estimated size of the bounds.
    map<string, Expr> bounds_to_estimates(const DimBounds &bounds);

    // Given a function stage, return a vector of possible tile configurations for
    // that function stage.
    vector<map<string, Expr>> generate_tile_configs(const FStage &stg);

    // Find the best tiling configuration for a group 'g' among a set of tile
    // configurations. This returns the configuration with the highest estimated
    // benefit, along with the corresponding analysis.
    pair<map<string, Expr>, GroupAnalysis> find_best_tile_config(const Group &g);

    // Estimate the benefit (arithmetic + memory) of 'new_grouping' over 'old_grouping'.
    // Positive values indicate that 'new_grouping' may be preferable over 'old_grouping'.
    // When 'ensure_parallelism' is set to true, this will return an undefined cost
    // if the estimated parallelism is smaller than the machine parameters.
    // If 'no_redundant_work' is set, we only consider the arithmetic cost, i.e. if
    // the arithmetic benefit is negative, we will treat it as no benefit and we
    // should not perform the new grouping.
    Expr estimate_benefit(const GroupAnalysis &old_grouping,
                          const GroupAnalysis &new_grouping,
                          bool no_redundant_work, bool ensure_parallelism) const;

    // Same as above; however, 'new_grouping' is a vector of function pairs that
    // are to be grouped together.
    Expr estimate_benefit(const vector<pair<GroupingChoice, GroupConfig>> &new_grouping,
                          bool no_redundant_work, bool ensure_parallelism) const;

    // Return the total estimate on arithmetic and memory costs of computing all
    // groups within the pipeline.
    Cost get_pipeline_cost();

    // Return the maximum access stride to the allocation of 'func_acc' along any
    // loop variable specified in 'vars'. Access expressions along each dimension
    // of the allocation are specified by 'acc_exprs'. The dimension bounds of the
    // allocation are specified by 'buffer_bounds'.
    Expr find_max_access_stride(const Scope<> &vars, const string &func_acc,
                                const vector<Expr> &acc_exprs, const Box &buffer_bounds);

    // Return the sum of access strides along each of the loop variables in
    // a function stage. The bounds of all the allocations accessed are specified
    // in 'parent_bounds'. Return an empty map if it can't figure out any of
    // the stride dimensions.
    map<string, Expr> analyze_spatial_locality(
        const FStage &stg,
        const map<string, Box> &parent_bounds,
        const set<string> &inlines = set<string>());

    map<string, Expr> evaluate_reuse(const FStage &stg, const set<string> &prods);

    // Generate and apply schedules for all functions within a pipeline by
    // following their grouping structure.
    //
    // TODO: A mode where schedules are not applied to the functions might be
    // interesting.
    //
    // TODO: The current form of the schedule returned is not very useful since it
    // cannot be manipulated and introspected very easily. The problem is that all
    // of the scheduling uses internal function and variable names which are not
    // visible to the user. Additionally, functions like sum and maximum are not
    // user visible. More thought needs to go into the interaction between the user
    // and auto scheduling.
    void generate_cpu_schedule(const Target &t, AutoSchedule &sched);
    // Same as \ref Partitioner::generate_cpu_schedule, but this generates and
    // applies schedules for a group of function stages.
    void generate_group_cpu_schedule(const Group &g, const Target &t,
                                     const map<FStage, DimBounds> &group_loop_bounds,
                                     const map<string, Box> &group_storage_bounds,
                                     const set<string> &inlines,
                                     AutoSchedule &sched);

    // Split the dimension of stage 'f_handle' along 'v' into inner and outer
    // dimensions. Modify 'estimates' according to the split and append the split
    // schedule to 'sched'.
    pair<VarOrRVar, VarOrRVar> split_dim(
        const Group &g, Stage f_handle, int stage_num, const Definition &def,
        bool is_group_output, const VarOrRVar &v, const Expr &factor,
        const string &in_suffix, const string &out_suffix,
        map<string, Expr> &estimates, AutoSchedule &sched);

    // Loop over the dimensions of function stage 'f_handle' starting from innermost
    // and vectorize the first pure dimension encountered.
    void vectorize_stage(
        const Group &g, Stage f_handle, int stage_num, Definition def,
        const Function &func, bool is_group_output, const Target &t, set<string> &rvars,
        map<string, Expr> &estimates, AutoSchedule &sched);

    // Reorder the dimensions to preserve spatial locality. This function
    // checks the stride of each access. The dimensions of the loop are reordered
    // such that the dimension with the smallest access stride is innermost.
    // This takes the strides along each dimension as input.
    void reorder_dims(Stage f_handle, int stage_num, Definition def,
                      map<string, Expr> strides, AutoSchedule &sched);

    // Helper functions to display partition information of the pipeline.
    void disp_pipeline_costs();
    void disp_pipeline_bounds();
    void disp_pipeline_graph();
    void disp_grouping();
};

void Partitioner::disp_grouping() {
    debug(0) << "\n=========\n"
             << "Grouping:\n"
             << "=========\n";
    for (const auto &g : groups) {
        debug(0) << g.second << "\n";
    }
    debug(0) << "=========\n";
}

void Partitioner::disp_pipeline_graph() {
    debug(0) << "\n================\n"
             << "Pipeline graph:\n"
             << "================\n";
    for (const auto &f : children) {
        debug(0) << f.first << ": {";
        for (auto iter = f.second.begin(); iter != f.second.end(); ++iter) {
            if (std::distance(f.second.begin(), iter) > 0) {
                debug(0) << ", ";
            }
            debug(0) << *iter;
        }
        debug(0) << "}\n";
    }
    debug(0) << "================\n";
}

void Partitioner::disp_pipeline_bounds() {
    debug(0) << "\n================\n"
             << "Pipeline bounds:\n"
             << "================\n";
    disp_regions(pipeline_bounds);
    debug(0) << "===============\n";
}

Cost Partitioner::get_pipeline_cost() {
    internal_assert(!group_costs.empty());

    Cost total_cost(0, 0);
    for (const auto &g : groups) {
        const GroupAnalysis &analysis = get_element(group_costs, g.first);
        if (!analysis.cost.defined()) {
            return Cost();
        }
        total_cost.arith += analysis.cost.arith;
        total_cost.memory += analysis.cost.memory;
    }
    total_cost.simplify();
    return total_cost;
}
<< "\n" << "Total memory cost: " << total_cost.memory << "\n" << "===============\n"; } // Construct a partitioner and build the pipeline graph on which the grouping // algorithm operates. Partitioner::Partitioner(const map &_pipeline_bounds, const MachineParams &_arch_params, const vector &_outputs, DependenceAnalysis &_dep_analysis, RegionCosts &_costs) : pipeline_bounds(_pipeline_bounds), arch_params(_arch_params), dep_analysis(_dep_analysis), costs(_costs), outputs(_outputs) { // Place each stage of a function in its own group. Each stage is // a node in the pipeline graph. for (const auto &f : dep_analysis.env) { if (!pipeline_bounds.count(f.first)) { // If a function does not have a pipeline bound (i.e. it can be // statically proven that no one ever uses it), we should not // consider it during the grouping. debug(5) << "Creating partitioner: ignore function \"" << f.first << "\" since it has empty pipeline bounds\n"; continue; } int num_stages = f.second.updates().size() + 1; for (int s = 0; s < num_stages; s++) { FStage stg(f.second, s); Group g(stg, {stg}); groups.insert(make_pair(stg, g)); } } // Find the consumers of each function and use it to populate the children map. for (const auto &f : dep_analysis.env) { int num_stages = f.second.updates().size() + 1; for (int s = 0; s < num_stages; s++) { set parents = get_parents(f.second, s); for (const string &c : parents) { // Filter out the calls to pipeline inputs. 'env' only contains // the functions computed and not the inputs. auto iter = dep_analysis.env.find(c); if ((c != f.first) && (iter != dep_analysis.env.end())) { // Consumer depends only on the last stage of a producer // with multiple stages. const Function &prod_func = iter->second; int final_stage = prod_func.updates().size(); FStage prod_stage(prod_func, final_stage); FStage cons_stage(f.second, s); children[prod_stage].insert(cons_stage); } } if (s > 0) { // Update the children map to reflect the dependencies between // different stages of the same function. FStage prod_stage(f.second, s - 1); FStage cons_stage(f.second, s); children[prod_stage].insert(cons_stage); } } } } void Partitioner::initialize_groups() { for (pair &g : groups) { pair, GroupAnalysis> best = find_best_tile_config(g.second); g.second.tile_sizes = best.first; group_costs.emplace(g.second.output, best.second); } grouping_cache.clear(); } map Partitioner::evaluate_reuse(const FStage &stg, const set &prods) { map reuse; Function f = stg.func; // TODO: Check if tile size of 1 in each dimension gives a reasonable // answer or reuse should be evaluated at a much larger granularity or // symbolically. Using a symbolic version might be better if the objective // is to prove the dimension has no reuse. The only downside with the // symbolic method is that it is totally at the mercy of the simplifier. // Another option is sampling or using a larger granularity. 
map<string, Expr> Partitioner::evaluate_reuse(const FStage &stg,
                                              const set<string> &prods) {
    map<string, Expr> reuse;
    Function f = stg.func;

    // TODO: Check whether a tile size of 1 in each dimension gives a reasonable
    // answer, or whether reuse should be evaluated at a much larger granularity
    // or symbolically. Using a symbolic version might be better if the objective
    // is to prove that a dimension has no reuse. The only downside with the
    // symbolic method is that it is totally at the mercy of the simplifier.
    // Another option is sampling or using a larger granularity.
    map<string, Expr> tile_sizes;

    const vector<Dim> &dims = get_stage_dims(stg.func, stg.stage_num);
    for (int d = 0; d < (int)dims.size() - 1; d++) {
        tile_sizes[dims[d].var] = 1;
    }

    DimBounds bounds = get_bounds_from_tile_sizes(stg, tile_sizes);

    vector<map<string, Box>> reuse_regions =
        dep_analysis.overlap_regions(stg.func, stg.stage_num, bounds, prods,
                                     false, &costs.input_estimates);

    for (int d = 0; d < (int)dims.size() - 1; d++) {
        Expr total_reuse = make_zero(Int(64));
        if (debug::debug_level() >= 3) {
            disp_regions(reuse_regions[d]);
        }
        for (const auto &reg : reuse_regions[d]) {
            Expr size = box_size(reg.second);
            if (!size.defined()) {
                total_reuse = Expr();
                break;
            } else {
                total_reuse += size;
            }
        }
        reuse.emplace(dims[d].var, simplify(total_reuse));
    }

    return reuse;
}

vector<pair<Partitioner::GroupingChoice, Partitioner::GroupConfig>>
Partitioner::choose_candidate_grouping(const vector<pair<string, string>> &cands,
                                       Partitioner::Level level) {
    vector<pair<GroupingChoice, GroupConfig>> best_grouping;
    Expr best_benefit = make_zero(Int(64));
    for (const auto &p : cands) {
        // Compute the aggregate benefit of inlining into all the children.
        vector<pair<GroupingChoice, GroupConfig>> grouping;

        const Function &prod_f = get_element(dep_analysis.env, p.first);
        int final_stage = prod_f.updates().size();

        FStage prod(prod_f, final_stage);

        for (const FStage &c : get_element(children, prod)) {
            GroupConfig best_config;
            GroupingChoice cand_choice(prod_f.name(), c);

            // Check if the candidate has been evaluated for grouping before.
            const auto &iter = grouping_cache.find(cand_choice);
            if (iter != grouping_cache.end()) {
                best_config = iter->second;
            } else {
                best_config = evaluate_choice(cand_choice, level);
                // Cache the result of the evaluation for the pair.
                grouping_cache.emplace(cand_choice, best_config);
            }

            grouping.emplace_back(cand_choice, best_config);
        }

        bool no_redundant_work = false;
        Expr overall_benefit = estimate_benefit(grouping, no_redundant_work, true);

        debug(3) << "Candidate grouping:\n";
        for (const auto &g : grouping) {
            debug(3) << "  " << g.first;
        }
        debug(3) << "Candidate benefit: " << overall_benefit << "\n";

        // TODO: The grouping process can be non-deterministic when the costs
        // of two choices are equal.
        if (overall_benefit.defined() && can_prove(best_benefit < overall_benefit)) {
            best_grouping = grouping;
            best_benefit = overall_benefit;
        }
    }

    debug(3) << "\nBest grouping:\n";
    for (const auto &g : best_grouping) {
        debug(3) << "  " << g.first;
    }
    if (!best_grouping.empty()) {
        debug(3) << "Best benefit: " << best_benefit << "\n";
    }

    return best_grouping;
}

inline bool operator==(const map<string, Expr> &m1, const map<string, Expr> &m2) {
    if (m1.size() != m2.size()) {
        return false;
    }
    for (const auto &it1 : m1) {
        const auto &it2 = m2.find(it1.first);
        if (it2 == m2.end()) {
            return false;
        } else if (!equal(it1.second, it2->second)) {
            return false;
        }
    }
    return true;
}
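// For intuition (illustrative, assuming tile_vars = {x, y} with x innermost
// and the size_variants used below), the three families of configurations
// generated by generate_tile_configs() look like:
//     skewed:        {x: 64, y: 1}, {x: 256, y: 1}, {x: 256, y: 32}, ...
//     almost square: {x: 64, y: 4}, {x: 128, y: 128}, {x: 256, y: 256}, ...
//     reorder:       {x: 64}, {y: 1}, {x: 64, y: 1}
// Duplicates across the families are dropped before the configurations are
// costed by find_best_tile_config().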
vector<map<string, Expr>> Partitioner::generate_tile_configs(const FStage &stg) {
    // TODO: This is a wart due to the cost model not taking vectorization
    // and pre-fetching into account. Ensuring that the innermost dimension has
    // at least a size of 64 gives enough values for vectorization and can help
    // with prefetching. This also interacts with the number of parallel tasks
    // that are generated.
    int min_inner_dim_size = 64;

    const vector<Dim> &dims = get_stage_dims(stg.func, stg.stage_num);

    // Get the dimensions that are going to be tiled in this stage.
    // Skipping rvars for now.
    vector<string> tile_vars;
    for (int d = 0; d < (int)dims.size() - 1; d++) {
        if (!dims[d].is_rvar()) {
            tile_vars.push_back(dims[d].var);
        }
    }

    vector<int> size_variants = {1, 4, 8, 16, 32, 64, 128, 256};
    vector<map<string, Expr>> tile_configs;

    // For all the tile configurations generated, we force the innermost dimension
    // to be at least of size 64 to ensure enough values for vectorization.

    // Skewed tile configurations
    for (size_t i = 0; i < tile_vars.size(); i++) {
        for (const auto &dim_size : size_variants) {
            map<string, Expr> tiling;
            tiling.emplace(tile_vars[i],
                           (i == 0) ? std::max(dim_size, min_inner_dim_size) : dim_size);
            for (size_t j = 0; j < tile_vars.size(); j++) {
                if (j < i) {
                    tiling.emplace(tile_vars[j], size_variants[size_variants.size() - 1]);
                } else if (j > i) {
                    tiling.emplace(tile_vars[j], size_variants[0]);
                }
            }
            if (!tiling.empty()) {
                bool is_duplicate =
                    std::find_if(tile_configs.begin(), tile_configs.end(),
                                 [&tiling](const map<string, Expr> &m) { return (tiling == m); }) != tile_configs.end();
                if (!is_duplicate) {
                    tile_configs.push_back(tiling);
                }
            }
        }
    }

    // Almost square tile configurations
    for (const auto &dim_size : size_variants) {
        map<string, Expr> tiling;
        for (size_t j = 0; j < tile_vars.size(); j++) {
            tiling.emplace(tile_vars[j],
                           (j == 0) ? std::max(dim_size, min_inner_dim_size) : dim_size);
        }
        if (!tiling.empty()) {
            bool is_duplicate =
                std::find_if(tile_configs.begin(), tile_configs.end(),
                             [&tiling](const map<string, Expr> &m) { return (tiling == m); }) != tile_configs.end();
            if (!is_duplicate) {
                tile_configs.push_back(tiling);
            }
        }
    }

    // Reorder tile configurations
    for (int i = 0; i < (1 << (tile_vars.size())); i++) {
        map<string, Expr> tiling;
        for (size_t j = 0; j < tile_vars.size(); j++) {
            if (((i >> (j)) & 1) == 1) {
                if (j == 0) {
                    tiling.emplace(tile_vars[j], min_inner_dim_size);
                } else {
                    tiling.emplace(tile_vars[j], 1);
                }
            }
        }
        if (!tiling.empty()) {
            bool is_duplicate =
                std::find_if(tile_configs.begin(), tile_configs.end(),
                             [&tiling](const map<string, Expr> &m) { return (tiling == m); }) != tile_configs.end();
            if (!is_duplicate) {
                tile_configs.push_back(tiling);
            }
        }
    }

    return tile_configs;
}

pair<map<string, Expr>, Partitioner::GroupAnalysis>
Partitioner::find_best_tile_config(const Group &g) {
    // Initialize to no tiling.
    map<string, Expr> no_tile_config;
    Group no_tile = g;
    no_tile.tile_sizes = no_tile_config;

    bool show_analysis = false;
    GroupAnalysis no_tile_analysis = analyze_group(no_tile, show_analysis);

    GroupAnalysis best_analysis = no_tile_analysis;
    map<string, Expr> best_config = no_tile_config;
    if (!best_analysis.cost.defined()) {
        return make_pair(best_config, best_analysis);
    }

    // Generate tiling configurations.
    vector<map<string, Expr>> configs = generate_tile_configs(g.output);

    Group best_group = g;
    for (const auto &config : configs) {
        Group new_group = g;
        new_group.tile_sizes = config;

        GroupAnalysis new_analysis = analyze_group(new_group, show_analysis);

        bool no_redundant_work = false;
        Expr benefit = estimate_benefit(best_analysis, new_analysis,
                                        no_redundant_work, true);

        if (show_analysis) {
            debug(0) << "Benefit relative to not tiling:" << benefit << "\n";
            debug(0) << "Best analysis:" << new_analysis;
            debug(0) << "No tile analysis:" << no_tile_analysis;
            debug(0) << "arith cost:" << cast<float>(new_analysis.cost.arith / no_tile_analysis.cost.arith)
                     << ", mem cost:" << cast<float>(new_analysis.cost.memory / no_tile_analysis.cost.memory)
                     << "\n";
        }

        if (benefit.defined() && can_prove(benefit > 0)) {
            best_config = config;
            best_analysis = new_analysis;
            best_group = new_group;
        }
    }

    return make_pair(best_config, best_analysis);
}
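// group() below runs a greedy fixed-point loop: each iteration enumerates the
// currently legal producer -> consumer merges, evaluates them through
// choose_candidate_grouping(), applies only the best-scoring one, and repeats
// until no remaining merge improves the estimated pipeline cost.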
void Partitioner::group(Partitioner::Level level) {
    bool fixpoint = false;
    while (!fixpoint) {
        Cost pre_merge = get_pipeline_cost();

        fixpoint = true;

        vector<pair<string, string>> cand;
        for (const auto &g : groups) {
            bool is_output = false;
            for (const Function &f : outputs) {
                if (g.first.func.name() == f.name()) {
                    is_output = true;
                    break;
                }
            }

            // All stages of a function are computed at a single location.
            // The last stage of the function represents the candidate choice
            // of grouping the function into a consumer.

            const Function &prod_f = get_element(dep_analysis.env, g.first.func.name());
            bool is_final_stage = (g.first.stage_num == prod_f.updates().size());

            if (is_output || !is_final_stage) {
                continue;
            }

            const auto &iter = children.find(g.first);
            if (iter != children.end()) {
                // All the stages belonging to a function are considered to be a
                // single child.
                set<string> child_groups;
                for (const FStage &s : iter->second) {
                    child_groups.insert(s.func.name());
                }

                int num_children = child_groups.size();
                // Only groups with a single child are considered for grouping
                // when grouping for computing in tiles.
                // TODO: The current scheduling model does not allow functions
                // to be computed at different points.
                if ((num_children == 1) && (level == Partitioner::Level::FastMem)) {
                    const string &prod_name = prod_f.name();
                    const string &cons_name = (*child_groups.begin());
                    cand.emplace_back(prod_name, cons_name);
                } else if ((level == Partitioner::Level::Inline) && prod_f.is_pure()) {
                    const string &prod_name = prod_f.name();
                    cand.emplace_back(prod_name, "");
                }
            }
        }

        debug(3) << "\n============================\n"
                 << "Current grouping candidates:\n"
                 << "============================\n";
        for (size_t i = 0; i < cand.size(); ++i) {
            debug(3) << "{" << cand[i].first << ", " << cand[i].second << "}\n";
        }

        vector<pair<GroupingChoice, GroupConfig>> best = choose_candidate_grouping(cand, level);
        if (best.empty()) {
            continue;
        } else {
            fixpoint = false;
        }

        // The following code makes the assumption that all the stages of a function
        // will be in the same group. 'choose_candidate_grouping' ensures that the
        // grouping choice being returned adheres to this constraint.
        const string &prod = best[0].first.prod;

        const Function &prod_f = get_element(dep_analysis.env, prod);
        size_t num_stages = prod_f.updates().size() + 1;

        FStage final_stage(prod_f, num_stages - 1);
        set<FStage> prod_group_children = get_element(children, final_stage);

        // Invalidate entries of the grouping cache.
        set<GroupingChoice> invalid_keys;
        for (const auto &c : prod_group_children) {
            for (const auto &entry : grouping_cache) {
                if ((entry.first.prod == c.func.name()) || (entry.first.cons == c)) {
                    invalid_keys.insert(entry.first);
                }
            }
        }
        for (const auto &key : invalid_keys) {
            grouping_cache.erase(key);
        }

        for (const auto &group : best) {
            internal_assert(group.first.prod == prod);
            merge_groups(group.first, group.second, level);
        }

        for (size_t s = 0; s < num_stages; s++) {
            FStage prod_group(prod_f, s);
            groups.erase(prod_group);
            group_costs.erase(prod_group);

            // Update the children mapping.
            children.erase(prod_group);
            for (auto &f : children) {
                set<FStage> &cons = f.second;
                auto iter = cons.find(prod_group);
                if (iter != cons.end()) {
                    cons.erase(iter);
                    // For a function with multiple stages, all the stages will
                    // be in the same group and the consumers of the function
                    // only depend on the last stage. Therefore, when the
                    // producer group has multiple stages, parents of the
                    // producers should point to the consumers of the last
                    // stage of the producer.
                    cons.insert(prod_group_children.begin(), prod_group_children.end());
                }
            }
        }

        Cost post_merge = get_pipeline_cost();
        if (debug::debug_level() >= 3) {
            disp_pipeline_costs();
        }
    }
}
DimBounds Partitioner::get_bounds(const FStage &s) {
    DimBounds bounds;

    const vector<string> &args = s.func.args();
    for (size_t d = 0; d < args.size(); d++) {
        bounds[args[d]] = get_element(pipeline_bounds, s.func.name())[d];
    }

    return get_stage_bounds(s.func, s.stage_num, bounds);
}

DimBounds Partitioner::get_bounds_from_tile_sizes(const FStage &s,
                                                  const map<string, Expr> &tile_sizes) {
    map<string, Interval> bounds;

    const map<string, Interval> &def_bounds = get_bounds(s);
    const vector<Dim> &dims = get_stage_dims(s.func, s.stage_num);

    for (int d = 0; d < (int)dims.size() - 1; d++) {
        string var = dims[d].var;
        const Interval &bound = get_element(def_bounds, var);
        const auto &iter = tile_sizes.find(var);
        if (iter != tile_sizes.end()) {
            const Expr &size = iter->second;
            // Check if the bounds allow for tiling with the given tile size,
            // i.e. ensure at least 2 tiles.
            Expr extent = get_extent(bound);
            internal_assert(extent.defined());
            if (can_prove(extent >= 2 * size)) {
                // TODO: Maybe shift this to the center of the pipeline bound.
                bounds[var] = Interval(0, simplify(size - 1));
            } else {
                // If the dimension is too small, do not tile it and set the
                // extent of the bounds to that of the dimension estimate.
                bounds[var] = bound;
            }
        } else {
            bounds[var] = bound;
        }
    }

    return bounds;
}
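// Example of the tiling guard above (hypothetical numbers): with a pipeline
// bound of x in [0, 1023] and a requested tile size of 64, at least two tiles
// fit, so the per-tile bounds become x in [0, 63]. With a bound of x in
// [0, 99] and the same tile size, the extent check fails and the original
// interval [0, 99] is kept, i.e. the dimension is effectively left untiled.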
                if (can_parallelize_rvar(var, g.output.func.name(), def)) {
                    parallelism *= dim_tiles;
                }
            }
        }
    }

    // Get the regions of the pipeline required to compute a tile of the group.
    DimBounds tile_bounds = get_bounds_from_tile_sizes(g.output, g.tile_sizes);

    map<string, Box> alloc_regions = dep_analysis.regions_required(
        g.output.func, g.output.stage_num, tile_bounds, group_members,
        false, &costs.input_estimates);

    map<string, Box> compute_regions = dep_analysis.regions_required(
        g.output.func, g.output.stage_num, tile_bounds, group_members,
        true, &costs.input_estimates);

    map<string, Box> group_reg, prod_reg, input_reg;
    // Separate the regions into those that are computed within the group and
    // those that are inputs to the group.
    for (const auto &reg : compute_regions) {
        if ((group_members.find(reg.first) != group_members.end()) &&
            (reg.first != g.output.func.name())) {
            group_reg.emplace(reg.first, reg.second);
        } else if (group_inputs.find(reg.first) != group_inputs.end()) {
            if (dep_analysis.env.find(reg.first) != dep_analysis.env.end()) {
                prod_reg.emplace(reg.first, reg.second);
            } else {
                input_reg.emplace(reg.first, reg.second);
            }
        }
    }

    // Aggregate costs for intermediate functions in a tile and the
    // tile output.
    Cost tile_cost = costs.region_cost(group_reg, g.inlined);
    if (!tile_cost.defined()) {
        return GroupAnalysis();
    }

    Cost out_cost = costs.stage_region_cost(g.output.func.name(),
                                            g.output.stage_num,
                                            tile_bounds, g.inlined);
    if (!out_cost.defined()) {
        return GroupAnalysis();
    }

    for (const auto &reg : alloc_regions) {
        if (!box_size(reg.second).defined()) {
            return GroupAnalysis();
        }
    }

    Cost group_cost(simplify(tile_cost.arith + out_cost.arith),
                    simplify(tile_cost.memory + out_cost.memory));

    // Detailed load costs for all the group intermediates.
    map<string, Expr> group_load_costs =
        costs.detailed_load_costs(group_reg, g.inlined);
    map<string, Expr> out_load_costs =
        costs.stage_detailed_load_costs(g.output.func.name(),
                                        g.output.stage_num,
                                        tile_bounds, g.inlined);
    combine_load_costs(group_load_costs, out_load_costs);

    Box out_tile_extent;
    if (g.output.stage_num == 0) {
        const vector<string> &args = g.output.func.args();
        for (size_t d = 0; d < args.size(); d++) {
            const auto &iter = tile_bounds.find(args[d]);
            if (iter != tile_bounds.end()) {
                out_tile_extent.push_back(iter->second);
            } else {
                out_tile_extent.push_back(Interval());
            }
        }
    }

    Cost per_tile_cost(group_cost.arith, make_zero(Int(64)));

    // This is the old cost model; keeping it here for reference, for now.
    /*
    if (tile_inter_size > arch_params.l1_size) {
        // Conservative estimate of accesses to memory
        //per_tile_mem_cost = tile_inter_size;
        // Aggressive estimate of accesses to memory
        per_tile_mem_cost = tile_cost.second;
    } else {
        // The tile_input_size captures the region of the input
        // required to compute the tile. However, all of it may not be
        // accessed during the computation of the tile when the access
        // is sparse. A better estimate is given by the smaller of
        // the number of memory accesses and the region size.
        per_tile_mem_cost = std::min(tile_input_size + tile_output_size,
                                     tile_cost.second);
    }*/

    // TODO: Use smooth step curve from Jon to better model cache behavior,
    // where each step corresponds to a different cache level.
    //
    // The current cost model drops off linearly. A larger memory footprint is
    // penalized more than a smaller one (since a smaller footprint can fit
    // more in the cache). The cost is clamped at 'balance', which is reached
    // roughly when the memory footprint equals or exceeds the last-level
    // cache size.

    // If 'model_reuse' is set, the cost model should take into account memory
    // reuse within the tile, e.g.
    // matrix multiply reuses inputs multiple times.
    // TODO: Implement a better reuse model.
    bool model_reuse = false;

    // Linear dropoff.
    float load_slope = arch_params.balance / arch_params.last_level_cache_size;
    for (const auto &f_load : group_load_costs) {
        internal_assert(g.inlined.find(f_load.first) == g.inlined.end())
            << "Intermediates of inlined pure function \"" << f_load.first
            << "\" should not have been in the group_load_costs\n";
        const auto &alloc_reg = get_element(alloc_regions, f_load.first);

        Expr footprint;
        bool is_group_member = (group_members.find(f_load.first) != group_members.end());
        bool is_output = (f_load.first == g.output.func.name());

        // We use the allocated region as a conservative estimate of the
        // footprint since the loads could be from any random locations of
        // the allocated regions.
        if (!is_output && is_group_member) {
            footprint = costs.region_size(f_load.first, alloc_reg);
        } else {
            Expr initial_footprint;
            const auto &f_load_pipeline_bounds = get_element(pipeline_bounds, f_load.first);

            bool is_function = (dep_analysis.env.find(f_load.first) != dep_analysis.env.end());
            if (!is_function) {
                // It is a load to some input buffer.
                // Initial loads
                initial_footprint = costs.input_region_size(f_load.first, f_load_pipeline_bounds);
                // Subsequent loads
                footprint = costs.input_region_size(f_load.first, alloc_reg);
            } else if (is_output) {
                // Load to the output function of the group.
                internal_assert(is_group_member)
                    << "Output " << f_load.first << " should have been a group member\n";
                // Initial loads
                initial_footprint = costs.region_size(f_load.first, f_load_pipeline_bounds);
                // Subsequent loads
                footprint = costs.region_size(f_load.first, out_tile_extent);
            } else {
                // Load to some non-member function (i.e. function from other groups).
                // Initial loads
                initial_footprint = costs.region_size(f_load.first, f_load_pipeline_bounds);
                // Subsequent loads
                footprint = costs.region_size(f_load.first, alloc_reg);
            }

            if (model_reuse) {
                Expr initial_factor =
                    cast<int64_t>(min(1 + initial_footprint * load_slope, arch_params.balance));
                per_tile_cost.memory += initial_factor * footprint;
            } else {
                footprint = initial_footprint;
            }

            if (!footprint.defined()) {
                return GroupAnalysis();
            }
        }

        Expr cost_factor = cast<int64_t>(min(1 + footprint * load_slope, arch_params.balance));
        per_tile_cost.memory += cost_factor * f_load.second;
    }

    if (show_analysis) {
        debug(0) << "\nDetailed loads:\n";
        for (const auto &f_load : group_load_costs) {
            debug(0) << "(" << f_load.first << "," << f_load.second << ")";
        }
        debug(0) << "\n";

        debug(0) << "\nPer tile memory cost:" << per_tile_cost.memory << "\n";
        debug(0) << "Per tile arith cost:" << per_tile_cost.arith << "\n";
    }

    GroupAnalysis g_analysis(
        Cost(per_tile_cost.arith * estimate_tiles, per_tile_cost.memory * estimate_tiles),
        parallelism);
    g_analysis.simplify();

    return g_analysis;
}

Partitioner::Group Partitioner::merge_groups(const Group &prod_group,
                                             const Group &cons_group) {
    vector<FStage> group_members;
    for (const auto &s : prod_group.members) {
        group_members.push_back(s);
    }
    for (const auto &s : cons_group.members) {
        group_members.push_back(s);
    }

    Group group(cons_group.output, group_members);

    for (const auto &f : prod_group.inlined) {
        group.inlined.insert(f);
    }
    for (const auto &f : cons_group.inlined) {
        group.inlined.insert(f);
    }

    return group;
}

void Partitioner::merge_groups(const GroupingChoice &choice, const GroupConfig &eval,
                               Partitioner::Level level) {
    const Function &prod_f = get_element(dep_analysis.env, choice.prod);
    size_t num_stages = prod_f.updates().size() + 1;

    const FStage &child = choice.cons;
Group &child_group = get_element(groups, child); for (size_t s = 0; s < num_stages; s++) { FStage cand(prod_f, s); Group &cand_group = get_element(groups, cand); child_group.members.insert(child_group.members.end(), cand_group.members.begin(), cand_group.members.end()); if (level == Partitioner::Level::Inline) { for (const auto &stg : cand_group.members) { child_group.inlined.insert(stg.func.name()); } } else { for (const auto &in : cand_group.inlined) { child_group.inlined.insert(in); } } } child_group.tile_sizes = eval.tile_sizes; // Update group costs. // We could just reuse the analysis from 'eval' since it was computed // by assuming the merge had happened. group_costs[child] = eval.analysis; } Partitioner::GroupConfig Partitioner::evaluate_choice(const GroupingChoice &choice, Partitioner::Level level) { // Create a group that reflects the grouping choice and evaluate the cost // of the group. const Function &prod_f = get_element(dep_analysis.env, choice.prod); int num_prod_stages = prod_f.updates().size() + 1; vector prod_groups; for (int s = 0; s < num_prod_stages; s++) { FStage prod_s(prod_f, s); prod_groups.push_back(get_element(groups, prod_s)); } Group cons = get_element(groups, choice.cons); Group group = cons; for (const auto &prod_g : prod_groups) { group = merge_groups(prod_g, group); } GroupAnalysis group_analysis; map best_tile_config; if (level == Partitioner::Level::Inline) { // Set the tile sizes to one along all dimensions of the consumer group map tile_sizes; const Function &cons_f = cons.output.func; const vector &dims = get_stage_dims(cons_f, cons.output.stage_num); for (int d = 0; d < (int)dims.size() - 1; d++) { tile_sizes[dims[d].var] = 1; } group.tile_sizes = tile_sizes; for (const auto &prod_g : prod_groups) { for (const FStage &s : prod_g.members) { group.inlined.insert(s.func.name()); } } for (const string &f : cons.inlined) { group.inlined.insert(f); } group_analysis = analyze_group(group, false); best_tile_config = tile_sizes; } else { pair, GroupAnalysis> config = find_best_tile_config(group); best_tile_config = config.first; group_analysis = config.second; } return GroupConfig(best_tile_config, group_analysis); } Expr Partitioner::estimate_benefit(const GroupAnalysis &old_grouping, const GroupAnalysis &new_grouping, bool no_redundant_work, bool ensure_parallelism) const { // TODO: Instead of having a hard parallelism constraint, it may be better // to consider other metric, such as arith_cost/parallelism if (ensure_parallelism && (!new_grouping.parallelism.defined() || !can_prove(new_grouping.parallelism >= arch_params.parallelism))) { return Expr(); } if (!old_grouping.cost.defined() || !new_grouping.cost.defined()) { return Expr(); } Expr arith_benefit = old_grouping.cost.arith - new_grouping.cost.arith; if (no_redundant_work && !can_prove(arith_benefit >= 0)) { return Expr(); } Expr mem_benefit = old_grouping.cost.memory - new_grouping.cost.memory; return simplify(mem_benefit + arith_benefit); } Expr Partitioner::estimate_benefit( const vector> &new_grouping, bool no_redundant_work, bool ensure_parallelism) const { set old_groups; GroupAnalysis new_group_analysis(Cost(0, 0), Int(64).max()); for (const auto &g : new_grouping) { const Function &prod_f = get_element(dep_analysis.env, g.first.prod); int num_prod_stages = prod_f.updates().size() + 1; for (int s = 0; s < num_prod_stages; s++) { FStage prod_s(prod_f, s); old_groups.insert(prod_s); } old_groups.insert(g.first.cons); GroupAnalysis analysisg = g.second.analysis; if (analysisg.defined()) { 
new_group_analysis.cost.arith += analysisg.cost.arith; new_group_analysis.cost.memory += analysisg.cost.memory; new_group_analysis.parallelism = min(new_group_analysis.parallelism, analysisg.parallelism); } else { new_group_analysis.cost = Cost(); new_group_analysis.parallelism = Expr(); break; } } new_group_analysis.simplify(); GroupAnalysis old_group_analysis(Cost(0, 0), Int(64).max()); for (const auto &g : old_groups) { const auto &iter = group_costs.find(g); internal_assert(iter != group_costs.end()); GroupAnalysis analysisg = iter->second; if (analysisg.defined()) { old_group_analysis.cost.arith += analysisg.cost.arith; old_group_analysis.cost.memory += analysisg.cost.memory; old_group_analysis.parallelism = min(old_group_analysis.parallelism, analysisg.parallelism); } else { old_group_analysis.cost = Cost(); old_group_analysis.parallelism = Expr(); break; } } old_group_analysis.simplify(); return estimate_benefit(old_group_analysis, new_group_analysis, no_redundant_work, ensure_parallelism); } map Partitioner::bounds_to_estimates(const DimBounds &bounds) { map estimates; for (const auto &bound : bounds) { estimates.emplace(bound.first, get_extent(bound.second)); } return estimates; } map> Partitioner::group_storage_bounds() { map> group_storage_bounds; for (const pair &gpair : groups) { const Group &g = gpair.second; DimBounds bounds = get_bounds_from_tile_sizes(g.output, g.tile_sizes); set prods; for (const FStage &s : g.members) { prods.insert(s.func.name()); } map reg_alloc = dep_analysis.regions_required(g.output.func, g.output.stage_num, bounds, prods, false, &costs.input_estimates); map group_alloc; for (const FStage &s : g.members) { const auto &iter = reg_alloc.find(s.func.name()); if ((iter != reg_alloc.end()) && (s.func.name() != g.output.func.name())) { group_alloc[s.func.name()] = iter->second; } } group_storage_bounds[gpair.first] = group_alloc; } return group_storage_bounds; } map> Partitioner::group_loop_bounds() { map> group_bounds; for (const pair &gpair : groups) { Group g = gpair.second; map mem_bounds; DimBounds bounds = get_bounds_from_tile_sizes(g.output, g.tile_sizes); set prods; for (const FStage &s : g.members) { prods.insert(s.func.name()); } map reg_computed = dep_analysis.regions_required(g.output.func, g.output.stage_num, bounds, prods, true, &costs.input_estimates); for (const FStage &s : g.members) { const auto &iter = reg_computed.find(s.func.name()); if (iter != reg_computed.end()) { map tile_sizes; const vector &args = s.func.args(); for (size_t arg = 0; arg < args.size(); arg++) { tile_sizes[args[arg]] = get_extent(iter->second[arg]); } mem_bounds[s] = get_bounds_from_tile_sizes(s, tile_sizes); } } group_bounds[gpair.first] = mem_bounds; } return group_bounds; } // We need to get the base name of the dimension for scheduling (i.e. it // can't have any dots). For example, in split case, if "x" is the starting // dimension name, after split(x, x0, xi, ...), we will end up with something // like "x.x0" and "x.xi". If we want to later schedule "x.x0", we need to // pass "x0" instead of "x.x0". 
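// For example (behavior of the helper below):
//   get_base_name("x.x0") == "x0"
//   get_base_name("y") == "y"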
string get_base_name(string name) {
    size_t dot_pos = name.rfind('.');
    if (dot_pos != string::npos) {
        return name.substr(dot_pos + 1);
    }
    return name;
}

pair<VarOrRVar, VarOrRVar> Partitioner::split_dim(
    const Group &g, Stage f_handle, int stage_num, const Definition &def,
    bool is_group_output, const VarOrRVar &v, const Expr &factor,
    const string &in_suffix, const string &out_suffix,
    map<string, Expr> &estimates, AutoSchedule &sched) {
    // Create new variables for the split dimensions.
    string arg_name = v.name();
    string inner_name = arg_name + in_suffix;
    string outer_name = arg_name + out_suffix;
    VarOrRVar inner(inner_name, v.is_rvar), outer(outer_name, v.is_rvar);

    {
        const auto &iter = sched.internal_vars.find(inner.name());
        if (iter == sched.internal_vars.end()) {
            sched.internal_vars.emplace(inner.name(), inner);
        } else {
            internal_assert(iter->second.is_rvar == inner.is_rvar);
        }
    }
    {
        const auto &iter = sched.internal_vars.find(outer.name());
        if (iter == sched.internal_vars.end()) {
            sched.internal_vars.emplace(outer.name(), outer);
        } else {
            internal_assert(iter->second.is_rvar == outer.is_rvar);
        }
    }

    // The default tail strategy is good enough for most use cases (see docs on
    // TailStrategy::Auto). However, the default for pure vars in update
    // definitions is RoundUp, which may introduce an out-of-bounds error if
    // there is an access to inputs or outputs.
    //
    // We could have just used GuardWithIf when splitting pure vars in update
    // definitions to ensure no out-of-bounds error. However, this is only
    // necessary if the update definition involves accesses to inputs or
    // outputs. For other accesses, we could potentially use a more aggressive
    // tail strategy such as RoundUp or ShiftInwards. Note that if we use
    // RoundUp or ShiftInwards, any nested loops (generated by compute_at)
    // will be affected as well. However, since in the current auto-scheduler
    // model, we always compute_at at the group output, if the update
    // definition is not the group output, we do not need to care about the
    // nested loops. If it is the update definition of the group output,
    // however, we'd better make sure that no other member of the group
    // accesses the inputs or outputs.
    TailStrategy strategy = TailStrategy::Auto;
    if ((stage_num > 0) && !v.is_rvar) {
        // TODO: It's safe to use RoundUp here if we know there are no
        // loads from any input, but at this point we've lost track of
        // loads from inputs that happen inside inlined Funcs.
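        // A sketch of the trade-off, assuming a hypothetical update stage
        // with a pure var x of extent 100 split by a factor of 32:
        // RoundUp would run iterations 100..127 and could touch out-of-bounds
        // locations of inputs or outputs, whereas GuardWithIf wraps the split
        // body in a bounds check (roughly "if (x < 100)"), trading an extra
        // branch for safety.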
strategy = TailStrategy::GuardWithIf; } f_handle.split(v, outer, inner, factor, strategy); std::ostringstream oss; oss << "split(" << arg_name << ", " << outer_name << ", " << inner_name << ", " << factor; switch (strategy) { case TailStrategy::RoundUp: oss << ", TailStrategy::RoundUp)"; break; case TailStrategy::GuardWithIf: oss << ", TailStrategy::GuardWithIf)"; break; case TailStrategy::ShiftInwards: oss << ", TailStrategy::ShiftInwards)"; break; case TailStrategy::Auto: oss << ")"; break; default: internal_error; } sched.push_schedule(f_handle.name(), stage_num, oss.str(), {arg_name, outer_name, inner_name}); const Expr &est = get_element(estimates, arg_name); internal_assert(est.defined()); estimates[inner_name] = factor; estimates[outer_name] = simplify((est + factor - 1) / factor); estimates.erase(arg_name); return make_pair(inner, outer); } void Partitioner::vectorize_stage(const Group &g, Stage f_handle, int stage_num, Definition def, const Function &func, bool is_group_output, const Target &t, set &rvars, map &estimates, AutoSchedule &sched) { vector &dims = def.schedule().dims(); int vec_dim_index = -1; // Set the vector length as the maximum of the natural vector size of all // values produced by the function. int vec_len = 0; for (const auto &type : func.output_types()) { vec_len = std::max(vec_len, t.natural_vector_size(type)); } for (int d = 0; d < (int)dims.size() - 1; d++) { string dim_name = get_base_name(dims[d].var); bool can_vectorize = true; if (rvars.find(dim_name) != rvars.end()) { can_vectorize = can_parallelize_rvar(dim_name, func.name(), def); } const auto &iter = estimates.find(dim_name); if ((iter != estimates.end()) && iter->second.defined()) { if (can_vectorize && can_prove(iter->second >= vec_len)) { vec_dim_index = d; break; } } } if (vec_dim_index >= 0) { string vec_dim_name = get_base_name(dims[vec_dim_index].var); bool is_rvar = (rvars.find(vec_dim_name) != rvars.end()); internal_assert(is_rvar == dims[vec_dim_index].is_rvar()); VarOrRVar vec_var(vec_dim_name, is_rvar); pair split_vars = split_dim(g, f_handle, stage_num, def, is_group_output, vec_var, vec_len, "_vi", "_vo", estimates, sched); f_handle.vectorize(split_vars.first); sched.push_schedule(f_handle.name(), stage_num, "vectorize(" + split_vars.first.name() + ")", {split_vars.first.name()}); if (is_rvar) { rvars.erase(vec_dim_name); rvars.insert(split_vars.first.name()); rvars.insert(split_vars.second.name()); } // TODO: Reorder vector dim to innermost if it is the innermost // storage dimension of the func. // // TODO: Check if the warning is necessary. if (vec_dim_index > 0) { user_warning << "Outer dim vectorization of var \"" << vec_dim_name << "\" in function \"" << f_handle.name() << "\"\n"; } } } // Return true if the vars/rvars in 'ordering' are in the same order as the // dim list. inline bool operator==(const vector &dims, const vector &ordering) { if (dims.size() != ordering.size() + 1) { // The dim list also contains '__outermost' return false; } for (size_t i = 0; i < ordering.size(); ++i) { if (dims[i].var != ordering[i].name()) { return false; } } return true; } // Return true if the vars/rvars in 'ordering' are not in the same order as the // dim list. 
inline bool operator!=(const vector<Dim> &dims, const vector<VarOrRVar> &ordering) {
    return !(dims == ordering);
}

void Partitioner::reorder_dims(Stage f_handle, int stage_num, Definition def,
                               map<string, Expr> strides, AutoSchedule &sched) {
    vector<Dim> &dims = def.schedule().dims();
    internal_assert(dims.size() > 1);
    vector<pair<string, int>> order;

    for (int d = 0; d < (int)dims.size() - 1; d++) {
        internal_assert(strides.find(dims[d].var) != strides.end());
    }

    // Iterate until all the dimensions have been assigned an order.
    while (!strides.empty()) {
        // Find the pure dimension (can be vars or rvars) with the smallest stride.
        bool found_pure_dim = false;
        Expr min_pure_stride = Int(64).max();
        string min_pure_var;
        int min_pure_index = -1;
        for (int d = 0; d < (int)dims.size() - 1; d++) {
            string var_name = get_base_name(dims[d].var);
            const auto &iter = strides.find(var_name);
            if ((iter != strides.end()) && dims[d].is_pure()) {
                const Expr &dim_stride = iter->second;
                internal_assert(dim_stride.defined());
                if (can_prove(dim_stride < min_pure_stride)) {
                    min_pure_stride = dim_stride;
                    min_pure_var = var_name;
                    min_pure_index = d;
                }
                found_pure_dim = true;
            }
        }
        if (found_pure_dim && min_pure_var.empty()) {
            // Since none of the pure strides can be proven as the minimum, we
            // should return here; otherwise it may cause an infinite loop.
            return;
        }

        // Check if the stride of the pure dimension is smaller than
        // the first impure dimension that has not yet been assigned
        // an order.
        Expr min_impure_stride = Int(64).max();
        string min_impure_var;
        int min_impure_index = -1;
        for (int d = 0; d < (int)dims.size() - 1; d++) {
            string var_name = get_base_name(dims[d].var);
            const auto &iter = strides.find(var_name);
            if ((iter != strides.end()) && !dims[d].is_pure()) {
                const Expr &dim_stride = iter->second;
                internal_assert(dim_stride.defined());
                if (can_prove(dim_stride < min_impure_stride)) {
                    min_impure_stride = dim_stride;
                    min_impure_var = var_name;
                    min_impure_index = d;
                    // Impure dimensions cannot be reordered relative to
                    // each other. Stop after encountering the first impure
                    // dimension.
                    break;
                }
            }
        }

        if (min_pure_var.empty() && min_impure_var.empty()) {
            // Since none of the pure and impure strides can be proven as the
            // minimum, we should return here; otherwise it may cause an
            // infinite loop.
            return;
        }

        pair<string, int> curr_min_var;
        if (!min_impure_var.empty() && can_prove(min_impure_stride < min_pure_stride)) {
            curr_min_var.first = min_impure_var;
            curr_min_var.second = min_impure_index;
            internal_assert(dims[min_impure_index].is_rvar());
        } else {
            curr_min_var.first = min_pure_var;
            curr_min_var.second = min_pure_index;
        }

        order.push_back(curr_min_var);
        strides.erase(curr_min_var.first);
    }

    vector<VarOrRVar> ordering;
    for (const auto &o : order) {
        VarOrRVar o_var(o.first, dims[o.second].is_rvar());
        ordering.push_back(o_var);
    }

    internal_assert(!ordering.empty());
    set<string> var_list = {ordering[0].name()};
    string var_order = ordering[0].name();
    for (size_t o = 1; o < ordering.size(); o++) {
        var_order += ", " + ordering[o].name();
        var_list.insert(ordering[o].name());
    }

    if (dims != ordering) {
        f_handle.reorder(ordering);
        sched.push_schedule(f_handle.name(), stage_num, "reorder(" + var_order + ")", var_list);
    }
}

// Visitor to find all the variables that depend on a variable.
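// For example, given a hypothetical definition containing
//   Let y = x + 1 in f(y)
// FindVarsUsingVar("x") ends up with both "x" and "y" in 'vars', since the
// let-bound "y" transitively depends on "x".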
class FindVarsUsingVar : public IRVisitor { using IRVisitor::visit; void visit(const Let *let) override { if (expr_uses_vars(let->value, vars)) { vars.push(let->name); } let->value.accept(this); let->body.accept(this); } public: Scope<> vars; FindVarsUsingVar(const string &var) { vars.push(var); } }; void Partitioner::generate_group_cpu_schedule( const Group &g, const Target &t, const map &group_loop_bounds, const map &group_storage_bounds, const set &inlines, AutoSchedule &sched) { string out_f_name = g.output.func.name(); Function g_out = g.output.func; debug(3) << "\n================\n" << "Scheduling group:\n" << "================\n" << g; if (g.output.func.has_extern_definition()) { internal_assert(g.members.size() == 1); Func(g_out).compute_root(); sched.push_schedule(g_out.name(), g.output.stage_num, "compute_root()", {}); return; } // Get the estimates for stage bounds DimBounds stg_bounds = get_bounds(g.output); map stg_estimates = bounds_to_estimates(stg_bounds); Stage f_handle = Stage(Func(g_out)); // Get a function handle for scheduling the stage if (g.output.stage_num > 0) { int stage_num = g.output.stage_num; f_handle = Func(g_out).update(stage_num - 1); } else { Func(g_out).compute_root(); sched.push_schedule(f_handle.name(), g.output.stage_num, "compute_root()", {}); } // Realize tiling and update the dimension estimates vector outer_dims; vector inner_dims; // Get the definition corresponding to the stage Definition def = get_stage_definition(g_out, g.output.stage_num); // 'dims' will get modified since we are going to apply the schedules // (e.g. tiling, reordering, etc.) vector &dims = def.schedule().dims(); // Keep track of the rvars set rvars; for (int d = 0; d < (int)dims.size() - 1; d++) { if (dims[d].is_rvar()) { rvars.insert(get_base_name(dims[d].var)); } } // Reorder the dimensions for better spatial locality (i.e. smallest stride // is innermost). If we only have one dimension (excluding __outermost), // there is nothing to reorder. 
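    // For illustration, assume a hypothetical stage looping over (x, y) that
    // accesses f(y, x): the stride along y is one element, while the stride
    // along x is the whole extent of f's innermost storage dimension, so the
    // reordering below would make y the innermost loop.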
if (dims.size() > 2) { map strides = analyze_spatial_locality(g.output, group_storage_bounds, inlines); if (!strides.empty()) { reorder_dims(f_handle, g.output.stage_num, def, strides, sched); } } vector dim_vars(dims.size() - 1); for (int d = 0; d < (int)dims.size() - 1; d++) { dim_vars[d] = get_base_name(dims[d].var); } // Apply tiling to output of the group for (const auto &var : dim_vars) { bool is_rvar = (rvars.find(var) != rvars.end()); VarOrRVar v(var, is_rvar); const auto &iter = g.tile_sizes.find(var); if ((iter != g.tile_sizes.end()) && get_element(stg_estimates, var).defined() && can_prove(get_element(stg_estimates, var) > iter->second)) { const Expr &tile_size = iter->second; if (can_prove(tile_size == 1)) { outer_dims.push_back(v); } else { pair tile_vars = split_dim(g, f_handle, g.output.stage_num, def, true, v, tile_size, "_i", "_o", stg_estimates, sched); inner_dims.push_back(tile_vars.first); outer_dims.push_back(tile_vars.second); if (is_rvar) { rvars.erase(var); rvars.insert(tile_vars.first.name()); rvars.insert(tile_vars.second.name()); } } } else { inner_dims.push_back(v); } } // Reorder the tile dimensions if (!outer_dims.empty()) { vector ordering; for (const auto &v : inner_dims) { ordering.push_back(v); } for (const auto &v : outer_dims) { ordering.push_back(v); } set var_list; string var_order = ordering[0].name(); for (size_t o = 1; o < ordering.size(); o++) { var_order += ", " + ordering[o].name(); var_list.insert(ordering[o].name()); } if (dims != ordering) { f_handle.reorder(ordering); sched.push_schedule(f_handle.name(), g.output.stage_num, "reorder(" + var_order + ")", var_list); } } vectorize_stage(g, f_handle, g.output.stage_num, def, g_out, true, t, rvars, stg_estimates, sched); // Parallelize definition Expr def_par = 1; // TODO: Investigate if it is better to pull one large dimension and // parallelize over it or to generate nested parallelism. // // Go from the outer to the innermost loop until sufficient parallelism // is achieved. Stop the search once we find a vectorized dimension since // it doesn't make any sense to have a parallelized inner loop within a // vectorized outer loop. bool nested_parallelism = true; if (nested_parallelism) { int dim_start = dims.size() - 2; string seq_var; for (int d = dim_start; d >= 0; d--) { if (dims[d].for_type == ForType::Vectorized) { break; } string var = get_base_name(dims[d].var); bool is_rvar = (rvars.find(var) != rvars.end()); internal_assert(is_rvar == dims[d].is_rvar()); VarOrRVar v(var, is_rvar); if (is_rvar && !can_parallelize_rvar(var, g_out.name(), def)) { if (seq_var.empty()) { seq_var = var; } continue; } if (can_prove(def_par >= arch_params.parallelism)) { // Enough parallelism to saturate target machine break; } const auto &iter = stg_estimates.find(var); if ((iter != stg_estimates.end()) && iter->second.defined()) { if (!seq_var.empty()) { VarOrRVar seq(seq_var, (rvars.find(seq_var) != rvars.end())); f_handle.reorder(seq, v); sched.push_schedule(f_handle.name(), g.output.stage_num, "reorder(" + seq_var + ", " + var + ")", {seq_var, var}); } f_handle.parallel(v); sched.push_schedule(f_handle.name(), g.output.stage_num, "parallel(" + var + ")", {var}); def_par = simplify(def_par * iter->second); } else { break; } } } if (can_prove(def_par < arch_params.parallelism)) { user_warning << "Insufficient parallelism for " << f_handle.name() << "\n"; } // Find the level at which group members will be computed. 
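    // For example, with a hypothetical tiling that leaves the dim list as
    // (xi, yi, xo, yo, __outermost), outer_dims holds (xo, yo) and
    // tile_inner_index picks out "xo" below, so group members end up with
    // compute_at(output, xo), i.e. they are recomputed once per tile.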
int tile_inner_index = dims.size() - outer_dims.size() - 1; VarOrRVar tile_inner_var(Var::outermost()); if (!outer_dims.empty()) { string var_name = get_base_name(dims[tile_inner_index].var); bool is_rvar = (rvars.find(var_name) != rvars.end()); tile_inner_var = VarOrRVar(var_name, is_rvar); } for (const FStage &mem : g.members) { // Skip member stages that have been inlined or stage that is the // output stage of the group if ((g.inlined.find(mem.func.name()) != g.inlined.end()) || (mem.func.name() == g_out.name())) { continue; } // Get the definition corresponding to the stage Definition mem_def = get_stage_definition(mem.func, mem.stage_num); // Get the estimates for the dimensions of the member stage map mem_estimates = bounds_to_estimates(get_element(group_loop_bounds, mem)); set mem_rvars; vector &mem_dims = mem_def.schedule().dims(); for (int d = 0; d < (int)mem_dims.size() - 1; d++) { if (mem_dims[d].is_rvar()) { mem_rvars.insert(get_base_name(mem_dims[d].var)); } } // Get a function handle for scheduling the stage Stage mem_handle = Stage(Func(mem.func)); if (mem.stage_num > 0) { mem_handle = Func(mem.func).update(mem.stage_num - 1); } else { if (!outer_dims.empty()) { if (tile_inner_var.is_rvar) { Func(mem.func).compute_at(Func(g_out), tile_inner_var.rvar); } else { Func(mem.func).compute_at(Func(g_out), tile_inner_var.var); } string sanitized_g_out = get_sanitized_name(g_out.name()); sched.push_schedule(mem_handle.name(), mem.stage_num, "compute_at(" + sanitized_g_out + ", " + tile_inner_var.name() + ")", {sanitized_g_out, tile_inner_var.name()}); } else { user_warning << "Degenerate tiling. No dimensions are tiled" << "\n"; user_warning << "Computing \"" << mem.func.name() << "\" at root" << "\n"; Func(mem.func).compute_root(); sched.push_schedule(mem_handle.name(), mem.stage_num, "compute_root()", {}); } } // Reorder the dimensions for better spatial locality. If we only have // one dimension (excluding __outermost), there is nothing to reorder. if (dims.size() > 2) { map mem_strides = analyze_spatial_locality(mem, group_storage_bounds, inlines); if (!mem_strides.empty()) { reorder_dims(mem_handle, mem.stage_num, mem_def, mem_strides, sched); } } vectorize_stage(g, mem_handle, mem.stage_num, mem_def, mem.func, false, t, mem_rvars, mem_estimates, sched); } } void Partitioner::generate_cpu_schedule(const Target &t, AutoSchedule &sched) { // Grab the group bounds early as they rely on the dimensions of the group // outputs which will be altered by modifying schedules. map> loop_bounds = group_loop_bounds(); map> storage_bounds = group_storage_bounds(); set inlines; // Mark all functions that are inlined. for (const pair &g : groups) { for (const string &inline_func : g.second.inlined) { inlines.insert(inline_func); } } // TODO: Inlining functions with update definitions has different // behavior than pure functions. They may need to be computed above // the innermost vector loop to avoid complications with varying // extents across different vector lanes. // // Since the default schedule is compute inline, we don't need to // explicitly call compute_inline() on the function. // Realize schedule for each group in the pipeline. 
    for (const auto &g : groups) {
        generate_group_cpu_schedule(g.second, t, get_element(loop_bounds, g.first),
                                    get_element(storage_bounds, g.first), inlines, sched);
    }
}

Expr Partitioner::find_max_access_stride(const Scope<> &vars,
                                         const string &func_acc,
                                         const vector<Expr> &acc_exprs,
                                         const Box &buffer_bounds) {
    size_t num_storage_dims = 0;
    Expr bytes_per_ele = make_zero(Int(64));

    // Get the number of dimensions of the allocated storage and the
    // number of bytes required to store a single value of func_acc.
    const auto &iter = dep_analysis.env.find(func_acc);
    if (iter != dep_analysis.env.end()) {
        const Function &f = iter->second;
        for (const auto &e : f.values()) {
            bytes_per_ele += e.type().bytes();
        }
        num_storage_dims = f.schedule().storage_dims().size();
    } else {
        bytes_per_ele = get_element(costs.inputs, func_acc).bytes();
        num_storage_dims = buffer_bounds.size();
    }

    Expr curr_stride = bytes_per_ele;
    Expr stride = make_zero(Int(64));

    internal_assert(num_storage_dims <= acc_exprs.size());
    for (size_t sdim = 0; sdim < num_storage_dims; sdim++) {
        // Check if the access expression depends on any of the loop variables
        // in 'vars'. Expressions that do not involve the variable have stride 0.
        if (expr_uses_vars(acc_exprs[sdim], vars)) {
            stride = max(stride, curr_stride);
        }

        const Interval &dim_range = buffer_bounds[sdim];
        Expr dim_extent = get_extent(dim_range);
        if (!dim_extent.defined()) {
            return Expr();
        }
        curr_stride *= dim_extent;
    }

    return simplify(stride);
}

map<string, Expr>
Partitioner::analyze_spatial_locality(const FStage &stg,
                                      const map<string, Box> &allocation_bounds,
                                      const set<string> &inlines) {
    internal_assert(!stg.func.has_extern_definition());

    // Handle inlining. When a function is inlined into another, the stride of
    // the accesses should be computed on the expression post inlining.
    // For example:
    // f(x, y) = ...;
    // g(x, y) = f(y, x); // transpose
    // h(x, y) = g(y, x); // transpose
    //
    // If both g and f are inlined into h, then the resulting expression for h
    // will look like:
    // h(x, y) = f(x, y);
    //
    // Computing the stride of a loop over x in the function h will be incorrect
    // if inlining is not taken into account.

    // Get all the allocations accessed in the definition corresponding to 'stg'.
    FindAllCalls find;
    Definition def = get_stage_definition(stg.func, stg.stage_num);
    // Perform inlining on all the values and the args in the stage.
    for (auto &val : def.values()) {
        val = perform_inline(val, dep_analysis.env, inlines, dep_analysis.order);
    }
    for (auto &arg : def.args()) {
        arg = perform_inline(arg, dep_analysis.env, inlines, dep_analysis.order);
    }
    def.accept(&find);

    // Arguments on the left hand side might themselves involve accesses
    // to allocations and thus need to be accounted for when computing the
    // strides along each dimension.
    vector<pair<string, vector<Expr>>> &call_args = find.call_args;
    // Account for the spatial locality of the store. Add the access on the
    // left hand side to call_args.
    call_args.emplace_back(stg.func.name(), def.args());

    // Map for holding the strides across each dimension.
    map<string, Expr> var_strides;
    const vector<Dim> &dims = def.schedule().dims();

    for (int d = 0; d < (int)dims.size() - 1; d++) {
        // Get all the variables involving the dimension in the definition.
        FindVarsUsingVar dep_vars(dims[d].var);
        def.accept(&dep_vars);

        // Accumulate the stride of each access to a loop dimension.
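        // For illustration, with hypothetical numbers: a call f(y, x) into a
        // 1024x1024 allocation of 4-byte values, analyzed for the loop
        // dimension x, has find_max_access_stride = 4 * 1024 (storage dim 0
        // does not use x; storage dim 1 does). The strides of all calls are
        // summed below.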
        Expr total_stride = 0;
        for (const pair<string, vector<Expr>> &call : call_args) {
            Box call_alloc_reg;
            const auto &iter = allocation_bounds.find(call.first);
            if (iter != allocation_bounds.end()) {
                call_alloc_reg = iter->second;
            } else {
                call_alloc_reg = get_element(pipeline_bounds, call.first);
            }
            Expr current_stride = find_max_access_stride(dep_vars.vars, call.first,
                                                         call.second, call_alloc_reg);
            if (!current_stride.defined()) {
                return map<string, Expr>();
            }
            total_stride += current_stride;
        }
        var_strides.emplace(dims[d].var, simplify(total_stride));
    }

    return var_strides;
}

// Verify that function 'f' does not have partially specified schedules/bounds.
// The current auto scheduler cannot handle such cases.
void validate_no_partial_schedules(const Function &f, bool is_output) {
    if (f.has_extern_definition()) {
        return;
    }

    // Verify that no compute_root or bounds are specified.
    user_assert(f.schedule().compute_level().is_inlined())
        << "AutoSchedule: cannot auto-schedule function \"" << f.name()
        << "\" since it is scheduled to be computed at root\n";
    user_assert(is_output || f.schedule().bounds().empty())
        << "AutoSchedule: cannot auto-schedule function \"" << f.name()
        << "\" since it has partially specified bounds\n";

    int num_stages = f.updates().size() + 1;
    for (int stage = 0; stage < num_stages; ++stage) {
        const Definition &def = get_stage_definition(f, stage);
        const StageSchedule &schedule = def.schedule();

        // Verify that no splits are specified.
        user_assert(schedule.splits().empty())
            << "AutoSchedule: cannot auto-schedule function \"" << f.name()
            << "\" since it has partially specified schedules at stage " << stage << "\n";

        // Verify that none of the dimensions are scheduled to be parallelized,
        // vectorized, or unrolled.
        for (const auto &d : schedule.dims()) {
            user_assert(d.for_type == ForType::Serial)
                << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                << "\" since stage " << stage << " is not serial at dim " << d.var << "\n";
        }

        if (stage == 0) {
            // Since we can only specialize on a Func, we only need to check for
            // no specializations for the initial stage.
            user_assert(def.specializations().empty())
                << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                << "\" since it has specializations\n";

            // Verify that there is no loop reordering on the initial definition
            // (i.e. the Vars in the dim list should be in the same order as
            // the args in the LHS of the definition).
            internal_assert(schedule.dims().size() - 1 == def.args().size());
            for (size_t i = 0; i < def.args().size(); ++i) {
                const Variable *arg = def.args()[i].as<Variable>();
                internal_assert(arg);
                user_assert(arg->name == schedule.dims()[i].var)
                    << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                    << "\" since dim \"" << arg->name << "\" at stage " << stage
                    << " has been reordered\n";
            }
        } else {
            // Verify that there is no loop reordering on the update definition
            // (i.e. the Vars in the dim list should be in the same order as
            // the args in the LHS of the definition, the RVars in the dim list
            // should be in the same order as the RVars in the rvar list, and
            // all RVars should come before all Vars).
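            // For example, for a hypothetical update definition of f(x, y)
            // over an RDom r, the only dim list this check accepts is
            // (r.x, r.y, x, y, __outermost); a list such as (x, r.x, ...)
            // is rejected as reordered.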
            const vector<Dim> &dims = schedule.dims();
            const vector<ReductionVariable> &rvars = schedule.rvars();
            const vector<Expr> &args = f.definition().args();
            internal_assert(dims.size() - 1 >= rvars.size());
            for (size_t i = 0; i < rvars.size(); ++i) {
                const Dim &d = dims[i];
                user_assert(d.is_rvar() && (d.var == rvars[i].var))
                    << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                    << "\" since dim \"" << i << "\" at stage " << stage
                    << " has been reordered\n";
            }

            internal_assert(dims.size() - rvars.size() - 1 <= args.size());
            int last_index = -1;
            for (int i = rvars.size(); i < (int)dims.size() - 1; ++i) {
                const Dim &d = dims[i];
                user_assert(!d.is_rvar())
                    << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                    << "\" since dim \"" << i << "\" at stage " << stage
                    << " has been reordered\n";

                const auto &iter =
                    std::find_if(args.begin(), args.end(),
                                 [&d](const Expr &arg) {
                                     const Variable *v = arg.as<Variable>();
                                     return (d.var == v->name);
                                 });
                internal_assert(iter != args.end());
                int current_index = iter - args.begin();
                user_assert(current_index > last_index)
                    << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                    << "\" since dim \"" << i << "\" at stage " << stage
                    << " has been reordered\n";
                last_index = current_index;
            }
        }
    }
}

// Return true if 'f' is used by some extern Func.
bool used_by_extern_func(const map<string, Function> &env, const Function &f) {
    for (const auto &iter : env) {
        for (const ExternFuncArgument &arg : iter.second.extern_arguments()) {
            if (arg.is_func()) {
                if (Function(arg.func).name() == f.name()) {
                    return true;
                }
            }
        }
    }
    return false;
}

// If the bounds of a Func are undefined, then we should just inline the Func,
// provided it is legal to inline it and it is not used by some extern Func.
set<string> get_unbounded_functions(const map<string, Box> &pipeline_bounds,
                                    const map<string, Function> &env) {
    set<string> unbounded;
    for (const auto &iter : env) {
        if (!pipeline_bounds.count(iter.first)) {
            debug(5) << "...Skip checking function \"" << iter.first
                     << "\" since it does not have pipeline bounds\n";
            continue;
        }
        const Function &f = iter.second;
        if (!f.can_be_inlined() || used_by_extern_func(env, f)) {
            continue;
        }
        const Box &bound = get_element(pipeline_bounds, iter.first);
        if (is_box_unbounded(bound)) {
            unbounded.insert(iter.first);
        }
    }
    return unbounded;
}

bool inline_unbounded(const vector<Function> &outputs,
                      const vector<string> &order,
                      const map<string, Function> &env,
                      const set<string> &unbounded) {
    bool inlined = false;
    // The very last few functions in 'order' are the last to be realized in
    // the pipeline (the final producers), so there is no point in checking them.
    for (int i = 0; i < (int)order.size() - (int)outputs.size(); ++i) {
        const Function &f1 = env.at(order[i]);
        if (!unbounded.count(f1.name())) {
            continue;
        }
        inlined = true;
        debug(4) << "Function \"" << order[i] << "\" is unbounded\n";
        for (int j = i + 1; j < (int)order.size(); ++j) {
            internal_assert(order[i] != order[j]);
            const Function &f2 = env.at(order[j]);
            debug(5) << "Inline unbounded function \"" << f1.name()
                     << "\" inside \"" << f2.name() << "\"\n";
            inline_function(f2, f1);
        }
    }
    return inlined;
}

}  // anonymous namespace

// Generate schedules for all functions in the pipeline required to compute the
// outputs. This applies the schedules and returns a string representation of
// the schedules. The target architecture is specified by 'target'.
string generate_schedules(const vector<Function> &outputs, const Target &target,
                          const MachineParams &arch_params) {
    // Make an environment map which is used throughout the auto scheduling process.
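    // For example, for a hypothetical two-stage pipeline where the output
    // blur_y calls blur_x, 'env' ends up mapping both "blur_x" and "blur_y"
    // to their Function objects.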
    map<string, Function> env;
    for (const Function &f : outputs) {
        map<string, Function> more_funcs = find_transitive_calls(f);
        env.insert(more_funcs.begin(), more_funcs.end());
    }

    // Finalize all the LoopLevels.
    for (auto &iter : env) {
        iter.second.lock_loop_levels();
    }

    // Compute the topological order, before any trivial inlining (i.e. before
    // we remove any functions from 'env'). We need the full topological
    // order to pass to get_func() when generating the string representation
    // of the schedule.
    debug(2) << "Computing topological order...\n";
    vector<string> top_order = topological_order(outputs, env);

    // Validate that none of the functions in the pipeline have partial schedules.
    debug(2) << "Validating no partial schedules...\n";
    for (const auto &iter : env) {
        bool is_output = false;
        for (const auto &o : outputs) {
            is_output |= iter.second.same_as(o);
        }
        validate_no_partial_schedules(iter.second, is_output);
    }

    // The auto scheduling algorithm requires estimates on the outputs of the
    // pipeline to get quantitative estimates of costs for computing functions
    // in the pipeline.
    debug(2) << "Checking estimates on outputs...\n";
    check_estimates_on_outputs(outputs);

    // Run a pre-pass that inlines all trivial Funcs (i.e. if the cost of
    // computing a Func is about the same as calling that Func, we should
    // just inline it).
    debug(2) << "Inlining all trivial functions...\n";
    if (inline_all_trivial_functions(outputs, top_order, env)) {
        // If any of the Funcs is inlined, we need to recompute 'env', since some
        // of the Funcs are no longer used and need to be removed from 'env'.
        //
        // Instead of recomputing 'env', we could also remove the inlined Func
        // within inline_all_trivial_functions(); however, it is a bit tricky
        // to do when dealing with inlined tuples. Consider the following case:
        //   f(x, y) = x + y;
        //   g(x, y) = {x, f(x, y)};
        //   h(x, y) = g(x, y)[0];
        // When 'g' is inlined in 'h', no one uses 'f' anymore and it can
        // be removed from 'env'. However, to know this, we need to trace
        // all the function calls within the pipeline. Thus, we might as well
        // recompute the 'env' from scratch.
        env.clear();
        for (const Function &f : outputs) {
            map<string, Function> more_funcs = find_transitive_calls(f);
            env.insert(more_funcs.begin(), more_funcs.end());
        }
    }

    // Compute the realization order of the functions within the pipeline.
    vector<string> order = realization_order(outputs, env).first;

    // Run a pre-pass that inlines all Funcs whose values are accessed by
    // a single other Func in an element-wise manner. We need to do this
    // repeatedly since some inlining decisions may enable further inlining
    // that was previously not possible. Consider the following case:
    //   f1(x) = x;
    //   f2(x) = f1(x) + 2;
    //   f3(x) = f1(x) * 2;
    //   f4(x) = f2(x) + f3(x);
    //   f5(x) = f4(x) + 3;
    // In the first iteration, we cannot inline 'f1' since it is used by two
    // functions: 'f2' and 'f3'. If 'f2' and 'f3' get inlined into 'f4', then
    // 'f1' is only used by 'f4', so it can now also be inlined.
    debug(2) << "Inlining all element-wise functions...\n";
    while (inline_all_element_wise_functions(outputs, order, env)) {
        // We need to recompute 'env' for the same reason as with
        // inline_all_trivial_functions.
        env.clear();
        for (const Function &f : outputs) {
            map<string, Function> more_funcs = find_transitive_calls(f);
            env.insert(more_funcs.begin(), more_funcs.end());
        }
        order = realization_order(outputs, env).first;
    }

    // Compute the bounds of function values which are used for dependence analysis.
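    // These are bounds on the values a function can take, not on its domain.
    // A sketch of why they matter, for a hypothetical pipeline:
    //   f(x) = x % 8;     // values of f lie in [0, 7]
    //   g(x) = h(f(x));   // so g only needs h over [0, 7]
    // Without the value bounds of f, the region of h required by g would be
    // unbounded.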
debug(2) << "Computing function value bounds...\n"; FuncValueBounds func_val_bounds = compute_function_value_bounds(order, env); // Initialize the cost model. // Compute the expression costs for each function in the pipeline. debug(2) << "Initializing region costs...\n"; RegionCosts costs(env, order); if (debug::debug_level() >= 3) { costs.disp_func_costs(); } debug(2) << "Initializing dependence analysis...\n"; DependenceAnalysis dep_analysis(env, order, func_val_bounds); // Compute bounds of all functions in the pipeline given estimates on // outputs. Also report functions which bounds could not be inferred. debug(2) << "Computing pipeline bounds...\n"; map pipeline_bounds = get_pipeline_bounds(dep_analysis, outputs, &costs.input_estimates); // Determine all unbounded functions that are not extern Func or // used by some extern Funcs. debug(2) << "Determining all unbounded functions...\n"; set unbounded = get_unbounded_functions(pipeline_bounds, env); if (!unbounded.empty()) { // If some functions are unbounded, we should inline those directly. // Also, we need to recompute 'env' and re-initialize 'costs' and // 'dep_analysis' debug(2) << "Inlining all unbounded functions...\n"; internal_assert(inline_unbounded(outputs, order, env, unbounded)); env.clear(); for (const Function &f : outputs) { map more_funcs = find_transitive_calls(f); env.insert(more_funcs.begin(), more_funcs.end()); } order = realization_order(outputs, env).first; debug(2) << "Re-computing function value bounds...\n"; func_val_bounds = compute_function_value_bounds(order, env); debug(2) << "Re-initializing region costs...\n"; RegionCosts costs(env, order); debug(2) << "Re-initializing dependence analysis...\n"; dep_analysis = DependenceAnalysis(env, order, func_val_bounds); debug(2) << "Re-computing pipeline bounds...\n"; pipeline_bounds = get_pipeline_bounds(dep_analysis, outputs, &costs.input_estimates); } debug(2) << "Initializing partitioner...\n"; Partitioner part(pipeline_bounds, arch_params, outputs, dep_analysis, costs); // Compute and display reuse /* TODO: Use the reuse estimates to reorder loops for (const auto &f : env) { FindAllCalls find; f.second.accept(&find); int num_stages = f.second.updates().size() + 1; for (int s = 0; s < num_stages; s++) { FStage curr_s(f.second, s); map reuse = part.evaluate_reuse(curr_s, find.funcs_called); debug(0) << curr_s << "\n"; for (const auto &dir : reuse) { debug(0) << dir.first << " " << dir.second << ","; } debug(0) << "\n"; } }*/ // Display the current pipeline graph. // TODO: Output the graph in dot format. 
    if (debug::debug_level() >= 3) {
        part.disp_pipeline_graph();
        part.disp_pipeline_bounds();
    }

    debug(2) << "Partitioner initializing groups...\n";
    part.initialize_groups();
    if (debug::debug_level() >= 3) {
        part.disp_pipeline_costs();
    }

    debug(2) << "Partitioner computing inline group...\n";
    part.group(Partitioner::Level::Inline);
    if (debug::debug_level() >= 3) {
        part.disp_grouping();
    }

    debug(2) << "Partitioner computing fast-mem group...\n";
    part.grouping_cache.clear();
    part.group(Partitioner::Level::FastMem);
    if (debug::debug_level() >= 3) {
        part.disp_pipeline_costs();
        part.disp_grouping();
        part.disp_pipeline_graph();
    }

    debug(2) << "Initializing AutoSchedule...\n";
    AutoSchedule sched(env, top_order);
    debug(2) << "Generating CPU schedule...\n";
    part.generate_cpu_schedule(target, sched);

    std::ostringstream oss;
    oss << sched;
    string sched_string = oss.str();

    debug(3) << "\n\n*******************************\nSchedule:\n"
             << "*******************************\n"
             << sched_string << "\n\n";

    // TODO: Unify both inlining and grouping for fast mem.
    // TODO: GPU scheduling.
    // TODO: Hierarchical tiling.

    return sched_string;
}

struct Mullapudi2016 {
    void operator()(const Pipeline &pipeline, const Target &target,
                    const MachineParams &arch_params, AutoSchedulerResults *outputs) {
        AutoSchedulerResults results;
        results.target = target;
        results.machine_params_string = arch_params.to_string();

        user_assert(target.arch == Target::X86 || target.arch == Target::ARM ||
                    target.arch == Target::POWERPC || target.arch == Target::MIPS)
            << "The Mullapudi2016 autoscheduler is not supported for the target: "
            << target.to_string();
        results.scheduler_name = "Mullapudi2016";

        std::vector<Function> pipeline_outputs;
        for (const Func &f : pipeline.outputs()) {
            pipeline_outputs.push_back(f.function());
        }

        results.schedule_source = generate_schedules(pipeline_outputs, target, arch_params);
        // This autoscheduler has no featurization.
        *outputs = results;
    }
};

REGISTER_AUTOSCHEDULER(Mullapudi2016)

}  // namespace Internal
}  // namespace Halide