Content - d8227393e2867fd4b24ef3966098c009bdcaf61f - 36a01e7/src/autoschedulers/mullapudi2016/AutoSchedule.cpp

visit type:
Tip revision: cd555949cbbc64fe76a96c47c01128ee86b6d2b8 authored by Steven Johnson on 20 July 2021, 16:45:20 UTC
Merge branch 'master' into srj/hannk-error-checking
Tip revision: cd55594
AutoSchedule.cpp
#include "HalidePlugin.h"

#include <algorithm>
#include <map>
#include <regex>
#include <set>
#include <utility>

#include "Halide.h"

namespace Halide {
namespace Internal {

using std::make_pair;
using std::map;
using std::pair;
using std::set;
using std::string;
using std::vector;

namespace {

// Substitute parameter estimates into the exprs describing the box bounds.
void substitute_estimates_box(Box &box) {
    box.used = substitute_var_estimates(box.used);
    for (auto &b : box.bounds) {
        b.min = substitute_var_estimates(b.min);
        b.max = substitute_var_estimates(b.max);
    }
}

// Substitute parameter estimates into the boxes in 'region'.
void substitute_estimates_region(map<string, Box> &region) {
    for (auto &iter : region) {
        substitute_estimates_box(iter.second);
    }
}

// Return true if any of the box dimension is unbounded.
bool is_box_unbounded(const Box &b) {
    for (size_t i = 0; i < b.size(); i++) {
        if (!b[i].is_bounded()) {
            return true;
        }
    }
    return false;
}

// Helper function to simplify the upper and lower bounds of each dimension of a box.
void simplify_box(Box &b) {
    for (size_t i = 0; i < b.size(); i++) {
        b[i].min = simplify(b[i].min);
        b[i].max = simplify(b[i].max);
    }
}

// Helper function to merge the partial region map into the result region map.
void merge_regions(map<string, Box> &result, const map<string, Box> &partial) {
    // Merge regions from 'partial' with an existing region in 'result'.
    for (const auto &reg : partial) {
        auto iter = result.find(reg.first);
        if (iter == result.end()) {
            result.emplace(reg.first, reg.second);
        } else {
            merge_boxes(iter->second, reg.second);
        }
    }
}

// Replace all occurrences of non-alphanumeric chars in 'name' with '_'.
string get_sanitized_name(string name) {
    if (isdigit(name[0])) {
        name = "_" + name;
    }
    for (size_t i = 0; i < name.size(); ++i) {
        if (!isalnum(name[i])) {
            name[i] = '_';
        }
    }
    return name;
}

// Representation of a function stage in the pipeline.
struct FStage {
    Function func;
    uint32_t stage_num;
    FStage(Function func, uint32_t stage_num)
        : func(std::move(func)), stage_num(stage_num) {
    }

    bool operator==(const FStage &other_stage) const {
        return (func.name() == other_stage.func.name()) &&
               (stage_num == other_stage.stage_num);
    }

    bool operator<(const FStage &other_stage) const {
        return func.name() < other_stage.func.name() ||
               ((func.name() == other_stage.func.name()) &&
                (stage_num < other_stage.stage_num));
    }

    friend std::ostream &operator<<(std::ostream &stream, const FStage &s) {
        if (s.stage_num == 0) {
            stream << s.func.name();
        } else {
            stream << s.func.name() << ".update(" << (s.stage_num - 1) << ")";
        }
        return stream;
    }
};

// Check if all the pipeline outputs have estimates specified
// on each of their dimensions; otherwise, throw an assertion.
void check_estimates_on_outputs(const vector<Function> &outputs) {
    for (const auto &out : outputs) {
        const vector<Bound> &estimates = out.schedule().estimates();
        // Check if the estimate for each dimension of the output is available
        // and is an integer. If there are duplicates for the estimate of a
        // dimension, we only check the last defined estimate (which min and
        // extent values are defined) since it is the one that would be
        // eventually used.
        Bound est;
        for (const auto &arg : out.args()) {
            bool found = false;
            for (int i = (int)estimates.size() - 1; i >= 0; --i) {
                if ((estimates[i].var == arg) && estimates[i].min.defined() &&
                    estimates[i].extent.defined()) {
                    found = true;
                    est = estimates[i];
                    break;
                }
            }
            user_assert(found && est.min.type().is_int() && est.extent.type().is_int())
                << "Please provide a valid estimate for dimension "
                << arg << " of output \"" << out.name() << "\"\n";
        }
    }
}

struct DependenceAnalysis {
    // Map containing all the functions in the pipeline.
    map<string, Function> env;
    vector<string> order;
    FuncValueBounds func_val_bounds;

    struct RegionsRequiredQuery {
        string f;
        int stage;
        set<string> prods;
        bool only_regions_computed;

        RegionsRequiredQuery(const string &f, int stage, const set<string> &prods,
                             bool only_regions_computed)
            : f(f), stage(stage), prods(prods),
              only_regions_computed(only_regions_computed) {
        }

        bool operator==(const RegionsRequiredQuery &other) const {
            return (f == other.f) && (stage == other.stage) && (prods == other.prods) &&
                   (only_regions_computed == other.only_regions_computed);
        }
        bool operator<(const RegionsRequiredQuery &other) const {
            if (f < other.f) {
                return true;
            } else if (f > other.f) {
                return false;
            }
            if (stage < other.stage) {
                return true;
            } else if (stage > other.stage) {
                return false;
            }
            if (only_regions_computed < other.only_regions_computed) {
                return true;
            } else if (only_regions_computed > other.only_regions_computed) {
                return false;
            }
            return prods < other.prods;
        }
    };
    struct RegionsRequired {
        DimBounds bounds;
        // Regions required to compute 'bounds' given a particular
        // RegionsRequiredQuery.
        map<string, Box> regions;
        RegionsRequired(const DimBounds &b, const map<string, Box> &r)
            : bounds(b), regions(r) {
        }
    };
    // Cache for bounds queries (bound queries with the same parameters are
    // common during the grouping process).
    map<RegionsRequiredQuery, vector<RegionsRequired>> regions_required_cache;

    DependenceAnalysis(const map<string, Function> &env, const vector<string> &order,
                       const FuncValueBounds &func_val_bounds)
        : env(env), order(order), func_val_bounds(func_val_bounds) {
    }

    // Return the regions of the producers ('prods') required to compute the region
    // of the function stage ('f', 'stage_num') specified by 'bounds'. When
    // 'only_regions_computed' is set to true, this only returns the computed
    // regions and not the total allocated regions.
    map<string, Box> regions_required(const Function &f, int stage_num,
                                      const DimBounds &bounds,
                                      const set<string> &prods,
                                      bool only_regions_computed,
                                      const Scope<Interval> *input_estimates);

    // Return the regions of the producers ('prods') required to compute the region
    // of the function specified by 'pure_bounds'. When 'only_regions_computed'
    // is set to true, this only returns the computed regions and not the total
    // allocated regions.
    map<string, Box> regions_required(const Function &f,
                                      const DimBounds &pure_bounds,
                                      const set<string> &prods,
                                      bool only_regions_computed,
                                      const Scope<Interval> *input_estimates);

    // Return redundantly computed regions of producers ('prods') while computing
    // a region of the function stage ('f', 'stage_num') specified by 'bounds'.
    // 'var' is the dimension along which redundant computation is accounted for.
    // When 'only_regions_computed' is set to true, this only returns the computed
    // regions and not the total allocated regions. When 'only_regions_computed'
    // is set to true, this only returns the computed regions and not the total
    // allocated regions.
    map<string, Box> redundant_regions(const Function &f, int stage_num, const string &var,
                                       const DimBounds &bounds,
                                       const set<string> &prods,
                                       bool only_regions_computed,
                                       const Scope<Interval> *input_estimates);

    // Return overlapping regions of producers ('prods') while computing a function
    // stage along each of the dimensions.
    vector<map<string, Box>>
    overlap_regions(const Function &f, int stage_num, const DimBounds &bounds,
                    const set<string> &prods, bool only_regions_computed,
                    const Scope<Interval> *input_estimates);
};

// Return the regions of the producers ('prods') required to compute the region
// of the function specified by 'pure_bounds'.
map<string, Box>
DependenceAnalysis::regions_required(const Function &f, const DimBounds &pure_bounds,
                                     const set<string> &prods,
                                     bool only_regions_computed,
                                     const Scope<Interval> *input_estimates) {
    // Find the regions required for each stage and merge them.
    map<string, Box> regions;
    int num_stages = f.updates().size() + 1;
    for (int s = 0; s < num_stages; s++) {
        DimBounds bounds = get_stage_bounds(f, s, pure_bounds);
        map<string, Box> stage_regions =
            regions_required(f, s, bounds, prods, only_regions_computed, input_estimates);

        merge_regions(regions, stage_regions);
    }
    return regions;
}

struct StageBounds {
    FStage f_stage;
    DimBounds bounds;

    StageBounds(const FStage &fs, const DimBounds &b)
        : f_stage(fs), bounds(b) {
    }
    StageBounds(Function func, uint32_t stage_num, const DimBounds &b)
        : f_stage(FStage(std::move(func), stage_num)), bounds(b) {
    }

    bool operator==(const StageBounds &other) const {
        return (f_stage == other.f_stage) && (bounds == other.bounds);
    }
    bool operator<(const StageBounds &other) const {
        return (f_stage < other.f_stage) ||
               ((f_stage == other.f_stage) && (bounds.size() < other.bounds.size()));
    }
};

// Helper function to queue regions that need to be traversed. 'fs_bounds' is
// the queue into which the regions specified by 'prod_func' and 'region'
// will be added.
void queue_func_regions(map<FStage, DimBounds> &fs_bounds,
                        const Function &prod_func, const Box &region,
                        const set<StageBounds> &visited) {
    DimBounds prod_pure_bounds;
    const vector<string> &args = prod_func.args();

    internal_assert(region.size() == args.size());

    // The region only specifies the extent of each dimension
    // by position. Populating a map which is keyed by name.
    for (size_t v = 0; v < args.size(); v++) {
        prod_pure_bounds[args[v]] = region[v];
    }

    // Get the bounds of all stages in a function from the
    // bounds on the pure dimensions.
    vector<DimBounds> prod_bounds = get_stage_bounds(prod_func, prod_pure_bounds);

    size_t num_stages = prod_func.updates().size() + 1;

    internal_assert(prod_bounds.size() == num_stages);

    // Add all stages of a function into the queue.
    for (size_t prod_s = 0; prod_s < num_stages; prod_s++) {
        StageBounds sb(prod_func, prod_s, prod_bounds[prod_s]);
        if (visited.find(sb) == visited.end()) {
            auto iter = fs_bounds.find(sb.f_stage);
            if (iter == fs_bounds.end()) {
                fs_bounds.emplace(sb.f_stage, sb.bounds);
            } else {
                for (const auto &b : sb.bounds) {
                    DimBounds &curr_bounds = iter->second;
                    auto b_iter = curr_bounds.find(b.first);
                    if (b_iter == curr_bounds.end()) {
                        curr_bounds.emplace(b.first, b.second);
                    } else {
                        if (b_iter->second.has_lower_bound() && b.second.has_lower_bound()) {
                            b_iter->second.min = simplify(Interval::make_min(b_iter->second.min, b.second.min));
                        } else {
                            b_iter->second.min = Interval::neg_inf();
                        }

                        if (b_iter->second.has_upper_bound() && b.second.has_upper_bound()) {
                            b_iter->second.max = simplify(Interval::make_max(b_iter->second.max, b.second.max));
                        } else {
                            b_iter->second.max = Interval::pos_inf();
                        }
                    }
                }
            }
        }
    }
}

// Helper function for merging 'curr_regions' to the global map of regions
// and adding them to the queue of regions that need to be traversed.
// 'prods' is the set of producer functions that are under consideration.
void merge_and_queue_regions(map<FStage, DimBounds> &fs_bounds,
                             map<string, Box> &regions,
                             map<string, Box> &curr_regions,
                             const set<string> &prods,
                             const map<string, Function> &env,
                             bool only_regions_computed,
                             const string &curr_func_name,
                             const set<StageBounds> &visited) {
    for (const auto &reg : curr_regions) {
        // Merge region with an existing region of a function in the
        // global map. Do not merge the parent function itself to the region
        // when querying only for the values computed.
        if (!only_regions_computed || (only_regions_computed && (reg.first != curr_func_name))) {
            auto iter = regions.find(reg.first);
            if (iter == regions.end()) {
                regions.emplace(reg.first, reg.second);
            } else {
                merge_boxes(iter->second, reg.second);
            }
        }

        // Skip adding the current region into to the queue if the function
        // is not in 'prods'.
        if (prods.find(reg.first) == prods.end()) {
            continue;
        }

        const auto &it = env.find(reg.first);
        if ((it != env.end()) && (reg.first != curr_func_name)) {
            // Add all stages of the function representing the
            // region into the queue.
            queue_func_regions(fs_bounds, it->second, reg.second, visited);
        }
    }
}

// Return the regions of the producers ('prods') required to compute the region
// of the function stage ('f', 'stage_num') specified by 'bounds'.
map<string, Box>
DependenceAnalysis::regions_required(const Function &f, int stage_num,
                                     const DimBounds &bounds,
                                     const set<string> &prods,
                                     bool only_regions_computed,
                                     const Scope<Interval> *input_estimates) {
    // Iteratively compute the required regions by traversing the chain
    // of dependencies.

    // Check the cache if we've already computed this previously.
    RegionsRequiredQuery query(f.name(), stage_num, prods, only_regions_computed);
    const auto &iter = regions_required_cache.find(query);
    if (iter != regions_required_cache.end()) {
        const auto &it = std::find_if(iter->second.begin(), iter->second.end(),
                                      [&bounds](const RegionsRequired &r) { return (r.bounds == bounds); });
        if (it != iter->second.end()) {
            internal_assert((iter->first == query) && (it->bounds == bounds));
            return it->regions;
        }
    }

    // Map of all the required regions.
    map<string, Box> regions;
    map<FStage, DimBounds> fs_bounds;
    set<StageBounds> visited;

    // Add the query function and its region to the queue.
    fs_bounds.emplace(FStage(f, stage_num), bounds);

    while (!fs_bounds.empty()) {
        for (int i = order.size() - 1; i >= 0; --i) {
            const Function &f = env.find(order[i])->second;
            int num_stages = f.updates().size() + 1;
            for (int stage_num = 0; stage_num < num_stages; ++stage_num) {
                FStage s(f, stage_num);

                const auto &iter = fs_bounds.find(s);
                if (iter == fs_bounds.end()) {
                    continue;
                }

                DimBounds curr_bounds = iter->second;
                visited.insert(StageBounds(s, curr_bounds));

                // Scope for containing all the estimates on parameters and intervals.
                Scope<Interval> curr_scope;
                curr_scope.set_containing_scope(input_estimates);

                // If the function has an extern definition, there is no visibility into
                // the expression defining the function. So the regions required will be
                // the entire domain of the inputs to the extern func. Use the estimates
                // on the inputs to the extern function if available.
                //
                // TODO: Query the extern function for bounds of the functions which it
                // it depends on. This can be done by calling the extern func in the
                // bounds query mode.
                if (s.func.has_extern_definition()) {
                    for (const ExternFuncArgument &arg : s.func.extern_arguments()) {
                        if (arg.is_func()) {
                            // If the argument is an entire function, the bounds of the
                            // function required are unknown. Create an infinite region
                            // of the correct dimension, update the region map, and
                            // add it to the queue.
                            string prod_name = Function(arg.func).name();
                            const Function &prod_func = get_element(env, prod_name);
                            map<string, Box> prod_reg;
                            const vector<string> &args = prod_func.args();
                            for (size_t v = 0; v < args.size(); v++) {
                                prod_reg[prod_name].push_back(Interval());
                            }
                            merge_and_queue_regions(fs_bounds, regions, prod_reg, prods, env,
                                                    only_regions_computed, s.func.name(), visited);
                        } else if (arg.is_expr()) {
                            // Find the boxes required for the expression and add the regions
                            // to the queue.
                            Expr subs_arg = substitute_var_estimates(arg.expr);
                            map<string, Box> arg_regions = boxes_required(subs_arg, curr_scope, func_val_bounds);
                            substitute_estimates_region(arg_regions);
                            merge_and_queue_regions(fs_bounds, regions, arg_regions, prods, env,
                                                    only_regions_computed, s.func.name(), visited);
                        } else if (arg.is_image_param() || arg.is_buffer()) {
                            // If the argument is an image or a buffer, the required
                            // bounds are unknown. Create an infinite region of the
                            // correct dimension and update the region map.
                            Buffer<> buf;
                            if (arg.is_image_param()) {
                                buf = arg.image_param.buffer();
                            } else {
                                buf = arg.buffer;
                            }
                            map<string, Box> buf_reg;
                            for (int v = 0; v < buf.dimensions(); v++) {
                                buf_reg[buf.name()].push_back(Interval());
                            }
                            merge_regions(regions, buf_reg);
                        }
                    }
                } else {
                    Definition def = get_stage_definition(s.func, s.stage_num);
                    const vector<Dim> &dims = def.schedule().dims();

                    // Substitute parameter estimates into the bounds and add them to the
                    // current scope.
                    for (int d = 0; d < (int)dims.size() - 1; d++) {
                        Interval simple_bounds = get_element(curr_bounds, dims[d].var);
                        simple_bounds.min = substitute_var_estimates(simple_bounds.min);
                        simple_bounds.max = substitute_var_estimates(simple_bounds.max);
                        curr_scope.push(dims[d].var, simple_bounds);
                    }

                    // Find the regions required for each value of the current function stage,
                    // update the region map, and add them to the queue.
                    for (const auto &val : def.values()) {
                        // Substitute the parameter estimates into the expression and get
                        // the regions required for the expression.
                        Expr subs_val = substitute_var_estimates(val);
                        map<string, Box> curr_regions = boxes_required(subs_val, curr_scope, func_val_bounds);
                        substitute_estimates_region(curr_regions);

                        // Arguments to the definition may require regions of functions.
                        // For example, update definitions in histograms where the bin is
                        // based on the value of a function.
                        Box left_reg;
                        for (const Expr &arg : def.args()) {
                            Expr subs_arg = substitute_var_estimates(arg);
                            map<string, Box> arg_regions = boxes_required(subs_arg, curr_scope, func_val_bounds);
                            substitute_estimates_region(arg_regions);

                            // Merge the regions with the regions found while looking at
                            // the values.
                            merge_regions(curr_regions, arg_regions);

                            Interval arg_bounds = bounds_of_expr_in_scope(arg, curr_scope, func_val_bounds);
                            left_reg.push_back(arg_bounds);
                        }

                        auto iter_curr = curr_regions.find(s.func.name());
                        if (iter_curr == curr_regions.end()) {
                            curr_regions.emplace(s.func.name(), left_reg);
                        } else {
                            merge_boxes(iter_curr->second, left_reg);
                        }

                        // Update the region map, and add 'curr_regions' to the queue.
                        merge_and_queue_regions(fs_bounds, regions, curr_regions, prods, env,
                                                only_regions_computed, s.func.name(), visited);
                    }
                }

                // Remove processed region from the queue.
                fs_bounds.erase(iter);
            }
        }
    }

    // Simplify the bounds on each region and substitute global pipeline
    // bounds for function regions which lower and upper bounds could not be
    // determined.
    map<string, Box> concrete_regions;

    for (auto &f_reg : regions) {
        simplify_box(f_reg.second);

        Box concrete_box;
        for (size_t i = 0; i < f_reg.second.size(); i++) {
            Expr lower = f_reg.second[i].min;
            Expr upper = f_reg.second[i].max;

            auto iter = env.find(f_reg.first);
            bool in_env = (iter != env.end());

            if (!lower.as<IntImm>() && in_env) {
                const Function &curr_f = iter->second;
                for (const auto &b : curr_f.schedule().estimates()) {
                    size_t num_pure_args = curr_f.args().size();
                    if ((i < num_pure_args) && (b.var == curr_f.args()[i])) {
                        lower = b.min;
                    }
                }
            }

            if (!upper.as<IntImm>() && in_env) {
                const Function &curr_f = iter->second;
                for (const auto &b : curr_f.schedule().estimates()) {
                    size_t num_pure_args = curr_f.args().size();
                    if ((i < num_pure_args) && (b.var == curr_f.args()[i])) {
                        const IntImm *bmin = b.min.as<IntImm>();
                        const IntImm *bextent = b.extent.as<IntImm>();
                        upper = IntImm::make(Int(32), bmin->value + bextent->value - 1);
                    }
                }
            }

            Interval concrete_bounds = Interval(lower, upper);
            concrete_box.push_back(concrete_bounds);
        }
        concrete_regions[f_reg.first] = concrete_box;
    }

    regions_required_cache[query].push_back(RegionsRequired(bounds, concrete_regions));
    return concrete_regions;
}

// Return redundantly computed regions of producers ('prods') while computing a
// region of the function stage ('f', 'stage_num') specified by 'bounds'. 'var'
// is the dimension along which redundant computation is accounted for.
map<string, Box>
DependenceAnalysis::redundant_regions(const Function &f, int stage_num, const string &var,
                                      const DimBounds &bounds,
                                      const set<string> &prods,
                                      bool only_regions_computed,
                                      const Scope<Interval> *input_estimates) {
    // Find the regions required to compute the region of 'f' specified
    // by 'bounds'.
    map<string, Box> regions = regions_required(
        f, stage_num, bounds, prods, only_regions_computed, input_estimates);

    // Shift the bounds by the size of the interval along the direction
    // of var.
    DimBounds shifted_bounds;

    for (const auto &b : bounds) {
        if (b.first == var) {
            Expr len = b.second.max - b.second.min + 1;
            Interval bound = Interval(b.second.min + len, b.second.max + len);
            shifted_bounds[b.first] = bound;
        } else {
            shifted_bounds[b.first] = b.second;
        }
    }

    // Find the regions required to compute the region of f specified
    // by shifted_bounds.
    map<string, Box> regions_shifted = regions_required(
        f, stage_num, shifted_bounds, prods, only_regions_computed, input_estimates);

    // Compute the overlaps between 'regions_shifted' and the original
    // regions required.
    map<string, Box> overlaps;
    for (const auto &reg : regions) {
        auto iter = regions_shifted.find(reg.first);
        if (iter == regions.end()) {
            // It will be interesting to log cases where this actually happens
            // i.e., the shifted regions do not contain a function that was
            // there in the original regions.
            continue;
        }
        const Box &b = reg.second;
        const Box &b_shifted = iter->second;
        // The boxes should be of the same size.
        internal_assert(b.size() == b_shifted.size());

        Box b_intersect;
        for (uint32_t i = 0; i < b.size(); i++) {
            b_intersect.push_back(Interval::make_intersection(b[i], b_shifted[i]));
        }
        // A function should appear once in the regions and therefore cannot
        // already be present in the overlaps map.
        internal_assert(overlaps.find(reg.first) == overlaps.end());
        overlaps.emplace(reg.first, b_intersect);
    }

    // Simplify the bounds of each of the overlap regions.
    for (auto &f : overlaps) {
        simplify_box(f.second);
    }

    return overlaps;
}

// Return overlapping regions of producers ('prods') while computing a function
// stage along each of the dimensions.
vector<map<string, Box>>
DependenceAnalysis::overlap_regions(const Function &f, int stage_num,
                                    const DimBounds &bounds,
                                    const set<string> &prods,
                                    bool only_regions_computed,
                                    const Scope<Interval> *input_estimates) {
    vector<map<string, Box>> conc_overlaps;

    const vector<Dim> &dims = get_stage_dims(f, stage_num);

    // Get the redundant regions along each dimension of f.
    for (int d = 0; d < (int)dims.size() - 1; d++) {
        map<string, Box> conc_reg = redundant_regions(f, stage_num, dims[d].var, bounds,
                                                      prods, only_regions_computed, input_estimates);
        conc_overlaps.push_back(conc_reg);
    }
    return conc_overlaps;
}

// Return the regions of each function required for computing the
// outputs of the pipeline.
map<string, Box> get_pipeline_bounds(DependenceAnalysis &analysis,
                                     const vector<Function> &outputs,
                                     const Scope<Interval> *input_estimates) {
    map<string, Box> pipeline_bounds;

    // Find the regions required for each of the outputs and merge them
    // to compute the full pipeline_bounds.
    for (const auto &out : outputs) {
        DimBounds pure_bounds;
        Box out_box;
        // Use the estimates on the output for determining the output bounds.
        // If there are duplicates, use the most recent estimate.
        const auto &estimates = out.schedule().estimates();
        for (const auto &arg : out.args()) {
            int i;
            for (i = estimates.size() - 1; i >= 0; --i) {
                const auto &est = estimates[i];
                if ((est.var == arg) && est.min.defined() && est.extent.defined()) {
                    Interval in = Interval(est.min, simplify(est.min + est.extent - 1));
                    pure_bounds.emplace(arg, in);
                    out_box.push_back(in);
                    break;
                }
            }
            internal_assert(i >= 0) << "Could not find estimate for " << arg << "\n";
        }

        set<string> prods;
        for (const pair<const string, Function> &fpair : analysis.env) {
            prods.insert(fpair.first);
        }

        map<string, Box> regions = analysis.regions_required(out, pure_bounds, prods,
                                                             false, input_estimates);

        // Add the output region to the pipeline bounds as well.
        regions.emplace(out.name(), out_box);

        merge_regions(pipeline_bounds, regions);
    }

    return pipeline_bounds;
}

struct AutoSchedule {
    struct Stage {
        string function;
        size_t stage;

        Stage(const string &f, size_t s)
            : function(f), stage(s) {
        }

        bool operator==(const Stage &other) const {
            return (function == other.function) && (stage == other.stage);
        }
        bool operator<(const Stage &other) const {
            return (function < other.function) || ((function == other.function) && (stage < other.stage));
        }
    };

    const map<string, Function> &env;

    // Contain maps from function name to the topological order of the pipeline.
    map<string, size_t> topological_order;

    // Cache for storing all internal vars/rvars that have been declared during
    // the course of schedule generation, to ensure that we don't introduce any
    // duplicates in the string representation of the schedules.
    map<string, VarOrRVar> internal_vars;

    // Store the list of schedules applied to some function stages (most recent
    // schedule is placed last in the list).
    map<string, map<int, vector<string>>> func_schedules;

    // Store the list of vars/rvars used in the schedule applied to some
    // function stages.
    map<string, map<int, set<string>>> used_vars;

    AutoSchedule(const map<string, Function> &env, const vector<string> &order)
        : env(env) {
        for (size_t i = 0; i < order.size(); ++i) {
            topological_order.emplace(order[i], i);
        }
        // Allocate a slot in 'used_vars' for each function stages in the pipeline
        for (const auto &iter : env) {
            for (size_t i = 0; i < iter.second.updates().size() + 1; ++i) {
                used_vars[iter.first][i];
            }
        }
    }

    // Given a function name, return a string representation of getting the
    // function handle
    string get_func_handle(const string &name) const {
        size_t index = get_element(topological_order, name);
        return "pipeline.get_func(" + std::to_string(index) + ")";
    }

    friend std::ostream &operator<<(std::ostream &stream, const AutoSchedule &sched) {
        for (const auto &iter : sched.internal_vars) {
            if (iter.second.is_rvar) {
                stream << "RVar ";
            } else {
                stream << "Var ";
            }
            stream << iter.first << "(\"" << iter.first << "\");\n";
        }
        stream << "\n";

        // Declare all the functions + schedules
        std::ostringstream func_ss;
        std::ostringstream schedule_ss;

        for (const auto &f : sched.func_schedules) {
            const string &fname = get_sanitized_name(f.first);
            func_ss << "Func " << fname << " = " << sched.get_func_handle(f.first) << ";\n";

            schedule_ss << "{\n";

            // Declare all the Vars and RVars that are actually used in the schedule
            const Function &func = get_element(sched.env, f.first);
            for (size_t i = 0; i < func.args().size(); ++i) {
                if (sched.used_vars.at(func.name()).at(0).find(func.args()[i]) != sched.used_vars.at(func.name()).at(0).end()) {
                    schedule_ss << "    Var " << func.args()[i] << " = "
                                << fname << ".args()[" << i << "];\n";
                }
            }
            set<string> declared_rvars;
            for (size_t i = 0; i < func.updates().size(); ++i) {
                const vector<ReductionVariable> &rvars = func.updates()[i].schedule().rvars();
                const set<string> &var_list = sched.used_vars.at(func.name()).at(i + 1);
                for (size_t j = 0; j < rvars.size(); ++j) {
                    if ((var_list.find(rvars[j].var) == var_list.end()) ||
                        (declared_rvars.find(rvars[j].var) != declared_rvars.end())) {
                        continue;
                    }
                    declared_rvars.insert(rvars[j].var);
                    schedule_ss << "    RVar " << rvars[j].var << "("
                                << fname << ".update(" << i << ").get_schedule().rvars()[" << j << "].var);\n";
                }
            }

            for (const auto &s : f.second) {
                internal_assert(!s.second.empty());
                schedule_ss << "    " << fname;
                if (s.first > 0) {
                    schedule_ss << ".update(" << std::to_string(s.first - 1) << ")";
                }
                for (size_t i = 0; i < s.second.size(); ++i) {
                    schedule_ss << "\n        ." << s.second[i];
                }
                schedule_ss << ";\n";
            }

            schedule_ss << "}\n";
        }

        stream << func_ss.str() << "\n";
        stream << schedule_ss.str() << "\n";

        return stream;
    }

    void push_schedule(const string &stage_name, size_t stage_num,
                       const string &sched, const set<string> &vars) {
        vector<string> v = split_string(stage_name, ".");
        internal_assert(!v.empty());

        used_vars[v[0]][stage_num].insert(vars.begin(), vars.end());

        // If the previous schedule applied is the same as this one,
        // there is no need to re-apply the schedule
        auto &schedules = func_schedules[v[0]][stage_num];
        if (schedules.empty()) {
            schedules.push_back(sched);
        } else {
            if (schedules[schedules.size() - 1] != sched) {
                schedules.push_back(sched);
            }
        }
    }
};

// Implement the grouping algorithm and the cost model for making the grouping
// choices.
struct Partitioner {
    // GroupingChoice encodes the grouping of the 'prod' function into the 'cons' stage.
    struct GroupingChoice {
        string prod;
        FStage cons;

        GroupingChoice(const string &prod, const FStage &cons)
            : prod(prod), cons(cons) {
        }

        bool operator==(const GroupingChoice &other) const {
            return (prod == other.prod) && (cons == other.cons);
        }

        bool operator<(const GroupingChoice &other) const {
            return (prod < other.prod) || ((prod == other.prod) && (cons < other.cons));
        }

        friend std::ostream &operator<<(std::ostream &stream, const GroupingChoice &choice) {
            stream << "Choice: " << choice.prod << " -> " << choice.cons << "\n";
            return stream;
        }
    };

    // A group is a sub-pipeline with a single output. Members of a group are
    // either inlined into the consumer functions within the group or computed
    // at tiles of the output, specified by 'tile_sizes'.
    //
    // TODO: The restriction of computing either at the inline or tile level
    // makes the space of scheduling choices for a group very tractable.
    // However, the restriction might miss good schedules which can only be
    // realized by computing the members of the group at different levels of
    // the group.
    //
    // There are two approaches to extend the space of schedules considered:
    // 1) Recursive grouping: Treat the problem of determining the compute levels
    // within a group as a smaller instance of the grouping problem with
    // different parameters for the input, output sizes, and cache model.
    //
    // 2) Tightening: Always compute a function at the lowest level possible
    // without introducing redundant work. This is a restricted form of recursive
    // grouping which does not explore the trade-off between redundant work and
    // locality.
    //
    // Either approach can be implemented as a post process for each group
    // after the initial grouping process finishes. The cost model may
    // already make sub-optimal higher level partitioning when it is not aware
    // of the benefits of the post processing. However, it should strictly be
    // an improvement over the initial grouping. As a first step, it is good
    // to make it a post process.
    //
    // Incorporating the recursive grouping process into the cost model can be
    // tricky and can potentially make the cost of analyzing a group
    // prohibitive, as it requires solving smaller instances of the grouping
    // problem for analyzing each configuration. On the other hand, tightening
    // can be integrated into the cost model with out significantly increasing
    // the time to analyze a grouping configuration.
    //
    // TODO: Add sliding window optimizations. For start, it may be enough to
    // implement sliding window as a post-pass by moving the store level of all
    // the members of the group to the outermost serial loop. This could possibly
    // be incorporated in the cost model with some effort. Line-buffering
    // presents additional challenges for this post-processing strategy though.
    // A typical line-buffer would use terrible tile size for tiling, but its
    // performance will improve significantly once sliding window is turned on.
    //
    // TODO: Register tiling is an important transformation especially for
    // benchmarks with significant reuse of the data (like matrix multiply and
    // convolutional layers). The mechanism for realizing register tiling is to
    // completely unroll small tiles of the innermost kernels. Unrolling
    // interacts with vectorization, storage layout, and depends on the outer
    // level tiling.
    struct Group {
        // The output stage representing the group.
        FStage output;
        // Functions that belong to the group.
        vector<FStage> members;
        // Members of the group which are inlined.
        set<string> inlined;
        // Tile sizes along dimensions of the output function of the group.
        map<string, Expr> tile_sizes;

        Group(const FStage &output, const vector<FStage> &members)
            : output(output), members(members) {
        }

        friend std::ostream &operator<<(std::ostream &stream, const Group &g) {
            stream << "Output FStage: " << g.output << "\n"
                   << "Members: {";
            for (size_t i = 0; i < g.members.size(); ++i) {
                if (i > 0) {
                    stream << ", ";
                }
                stream << g.members[i];
            }
            stream << "}\n";

            stream << "Inlined: {";
            for (auto iter = g.inlined.begin(); iter != g.inlined.end(); ++iter) {
                if (std::distance(g.inlined.begin(), iter) > 0) {
                    stream << ", ";
                }
                stream << *iter;
            }
            stream << "}\n";

            stream << "Tile sizes: {";
            for (auto iter = g.tile_sizes.begin(); iter != g.tile_sizes.end(); ++iter) {
                if (std::distance(g.tile_sizes.begin(), iter) > 0) {
                    stream << ", ";
                }
                stream << "(" << iter->first << ", " << iter->second << ")";
            }
            stream << "}\n";

            return stream;
        }
    };

    // Result of the analysis of a group.
    struct GroupAnalysis {
        // Estimate of the arithmetic and memory cost for computing the group.
        Cost cost;
        // Estimate of the parallelism that can be exploited while computing
        // the group.
        Expr parallelism;

        GroupAnalysis()
            : cost(Cost()), parallelism(Expr()) {
        }
        GroupAnalysis(const Cost &c, Expr p)
            : cost(c), parallelism(std::move(p)) {
        }

        inline bool defined() const {
            return cost.defined() && parallelism.defined();
        }

        void simplify() {
            cost.simplify();
            if (parallelism.defined()) {
                parallelism = Internal::simplify(parallelism);
            }
        }

        friend std::ostream &operator<<(std::ostream &stream, const GroupAnalysis &analysis) {
            stream << "[arith cost:" << analysis.cost.arith << ", "
                   << "memory cost:" << analysis.cost.memory << ", "
                   << "parallelism:" << analysis.parallelism << "]\n";
            return stream;
        }
    };

    // Configuration of a group and the corresponding analysis. A group is the
    // set of functions that are computed together in tiles and the group config
    // specifies at what granularity they are computed together ('tile_sizes').
    struct GroupConfig {
        map<string, Expr> tile_sizes;
        GroupAnalysis analysis;
        GroupConfig(const map<string, Expr> &tile_sizes, const GroupAnalysis &analysis)
            : tile_sizes(tile_sizes), analysis(analysis) {
        }
        GroupConfig()
            : tile_sizes(map<string, Expr>()), analysis(GroupAnalysis()) {
        }
    };

    // Cache for storing the best configuration for the grouping choice. During
    // the grouping process, the impact of grouping two groups together is only
    // limited to the producers and consumers of the groups that are being grouped
    // together. The best grouping choices for the rest of the pipeline need not be
    // re-evaluated and caching them improves performance significantly.
    map<GroupingChoice, GroupConfig> grouping_cache;

    // Each group in the pipeline has a single output stage. A group is comprised
    // of function stages that are computed together in tiles (stages of a function
    // are always grouped together). 'groups' is the mapping from the output stage
    // of the group to the group.
    map<FStage, Group> groups;
    // The child stages of each stage (i.e. stages that depend on or use the values
    // computed by a particular stage) in the pipeline.
    map<FStage, set<FStage>> children;
    // Map from the output stage of the group to the analysis of the group. The mapping
    // needs to be updated whenever the grouping changes.
    map<FStage, GroupAnalysis> group_costs;

    // Levels that are targeted by the grouping algorithm. In the 'Inline' mode, the grouping
    // algorithm groups the functions by inlining the expression for the producer function
    // into the consumer stage. In the 'FastMem' mode, the grouping is done at the level of
    // tiles of the group output stage.
    enum class Level { Inline,
                       FastMem };

    // Bounds of each function stage in the pipeline. These bounds are inferred from the
    // estimates of the outputs and other functions in the pipeline.
    const map<string, Box> &pipeline_bounds;
    // Parameters of the machine model that is used for estimating the cost of each
    // group in the pipeline.
    const MachineParams &arch_params;
    // Dependency analysis of the pipeline. This support queries on regions
    // accessed and computed for producing some regions of some functions.
    DependenceAnalysis &dep_analysis;
    // The arithmetic and memory costs of evaluating the expressions which define
    // each function in the pipeline.
    RegionCosts &costs;
    // Output functions of the pipeline.
    const vector<Function> &outputs;

    Partitioner(const map<string, Box> &_pipeline_bounds,
                const MachineParams &_arch_params,
                const vector<Function> &_outputs,
                DependenceAnalysis &_dep_analysis,
                RegionCosts &_costs);

    void initialize_groups();

    // Merge 'prod_group' into 'cons_group'. The output stage of 'cons_group'
    // will be the output stage of the merged group.
    Group merge_groups(const Group &prod_group, const Group &cons_group);

    // Merge 'prods' in 'choice' into 'cons'. Set the tile size of the new group
    // to the one specified by 'eval'. If 'level' is set to Inline, all members
    // of 'prods' will be inlined in the new group.
    void merge_groups(const GroupingChoice &choice, const GroupConfig &eval,
                      Partitioner::Level level);

    // Given a grouping 'g', compute the estimated cost (arithmetic + memory) and
    // parallelism that can be potentially exploited when computing that group.
    GroupAnalysis analyze_group(const Group &g, bool show_analysis);

    // For each group in the partition, return the regions of the producers
    // need to be allocated to compute a tile of the group's output.
    map<FStage, map<string, Box>> group_storage_bounds();

    // For each group in the partition, return the regions of the producers
    // required to compute a tile of the group's output.
    map<FStage, map<FStage, DimBounds>> group_loop_bounds();

    // Partition the pipeline by iteratively merging groups until a fixpoint is
    // reached.
    void group(Partitioner::Level level);

    // Given a grouping choice, return a configuration for the group that gives
    // the highest estimated benefits.
    GroupConfig evaluate_choice(const GroupingChoice &group, Partitioner::Level level);

    // Pick the best choice among all the grouping options currently available. Uses
    // the cost model to estimate the benefit of each choice. This returns a vector of
    // choice and configuration pairs which describe the best grouping choice.
    vector<pair<GroupingChoice, GroupConfig>>
    choose_candidate_grouping(const vector<pair<string, string>> &cands,
                              Partitioner::Level level);

    // Return the bounds required to produce a function stage.
    DimBounds get_bounds(const FStage &stg);

    // Return the bounds required to produce a tile of a function stage.
    DimBounds get_bounds_from_tile_sizes(const FStage &stg,
                                         const map<string, Expr> &tile_sizes);

    // Return the estimated size of the bounds.
    map<string, Expr> bounds_to_estimates(const DimBounds &bounds);

    // Given a function stage, return a vector of possible tile configurations for
    // that function stage.
    vector<map<string, Expr>> generate_tile_configs(const FStage &stg);

    // Find the best tiling configuration for a group 'g' among a set of tile
    // configurations. This returns a pair of configuration with the highest
    // estimated benefit and the estimated benefit.
    pair<map<string, Expr>, GroupAnalysis> find_best_tile_config(const Group &g);

    // Estimate the benefit (arithmetic + memory) of 'new_grouping' over 'old_grouping'.
    // Positive values indicates that 'new_grouping' may be preferrable over 'old_grouping'.
    // When 'ensure_parallelism' is set to true, this will return an undefined cost
    // if the estimated parallelism is smaller than the machine parameters.
    // If 'no_redundant_work' is set, we only consider the arithmetic cost, i.e. if
    // the arithmetic benefit is negative, we will treat it as no benefits and we
    // should not perform the new grouping.
    Expr estimate_benefit(const GroupAnalysis &old_grouping, const GroupAnalysis &new_grouping,
                          bool no_redundant_work, bool ensure_parallelism) const;

    // Same as above; however, 'new_grouping' is a vector of function pairs that
    // are to be grouped together.
    Expr estimate_benefit(const vector<pair<GroupingChoice, GroupConfig>> &new_grouping,
                          bool no_redundant_work, bool ensure_parallelism) const;

    // Return the total estimate on arithmetic and memory costs of computing all
    // groups within the pipeline.
    Cost get_pipeline_cost();

    // Return the maximum access stride to allocation of 'func_acc' along any
    // loop variable specified in 'vars'. Access expressions along each dimension
    // of the allocation are specified by 'acc_exprs'. The dimension bounds of the
    // allocation are specified by 'buffer_bounds'.
    Expr find_max_access_stride(const Scope<> &vars, const string &func_acc,
                                const vector<Expr> &acc_exprs, const Box &buffer_bounds);

    // Return the sum of access strides along each of the loop variables in
    // a function stage. The bounds of all the allocations accessed are specified
    // in 'allocation_bounds'. Return an empty map if it can't figure out any of
    // the stride dimension.
    map<string, Expr> analyze_spatial_locality(
        const FStage &stg, const map<string, Box> &parent_bounds,
        const set<string> &inlines = set<string>());

    map<string, Expr> evaluate_reuse(const FStage &stg, const set<string> &prods);

    // Generate and apply schedules for all functions within a pipeline by
    // following their grouping structure.
    //
    // TODO: A mode where schedules are not applied to the functions might be
    // interesting.
    //
    // TODO: The current form of the schedule returned is not very useful since it
    // cannot be manipulated and introspected very easily. The problem is that all
    // of the scheduling uses internal function and variable names which are not
    // visible to the user. Additionally, functions like sum and maximum are not
    // user visible. More thought needs to go into interaction between the user and
    // auto scheduling.
    void generate_cpu_schedule(const Target &t, AutoSchedule &sched);

    // Same as \ref Partitioner::generate_cpu_schedule, but this generates and
    // applies schedules for a group of function stages.

    void generate_group_cpu_schedule(const Group &g, const Target &t,
                                     const map<FStage, DimBounds> &group_loop_bounds,
                                     const map<string, Box> &group_storage_bounds,
                                     const set<string> &inlines,
                                     AutoSchedule &sched);

    // Split the dimension of stage 'f_handle' along 'v' into inner and outer
    // dimensions. Modify 'estimates' according to the split and append the split
    // schedule to 'sched'.
    pair<VarOrRVar, VarOrRVar> split_dim(
        const Group &g, Stage f_handle, int stage_num, const Definition &def,
        bool is_group_output, const VarOrRVar &v, const Expr &factor, const string &in_suffix,
        const string &out_suffix, map<string, Expr> &estimates, AutoSchedule &sched);

    // Loop over the dimensions of function stage 'f_handle' starting from innermost
    // and vectorize the first pure dimension encountered.
    void vectorize_stage(
        const Group &g, Stage f_handle, int stage_num, Definition def,
        const Function &func, bool is_group_output, const Target &t, set<string> &rvars,
        map<string, Expr> &estimates, AutoSchedule &sched);

    // Reorder the dimensions to preserve spatial locality. This function
    // checks the stride of each access. The dimensions of the loop are reordered
    // such that the dimension with the smallest access stride is innermost.
    // This takes the strides along each dimension as input.
    void reorder_dims(Stage f_handle, int stage_num, Definition def,
                      map<string, Expr> strides, AutoSchedule &sched);

    // Helper functions to display partition information of the pipeline.
    void disp_pipeline_costs();
    void disp_pipeline_bounds();
    void disp_pipeline_graph();
    void disp_grouping();
};

void Partitioner::disp_grouping() {
    debug(0) << "\n=========\n"
             << "Grouping:\n"
             << "=========\n";
    for (const auto &g : groups) {
        debug(0) << g.second << "\n";
    }
    debug(0) << "=========\n";
}

void Partitioner::disp_pipeline_graph() {
    debug(0) << "\n================\n"
             << "Pipeline graph:\n"
             << "================\n";
    for (const auto &f : children) {
        debug(0) << f.first << ": {";
        for (auto iter = f.second.begin(); iter != f.second.end(); ++iter) {
            if (std::distance(f.second.begin(), iter) > 0) {
                debug(0) << ", ";
            }
            debug(0) << *iter;
        }
        debug(0) << "}\n";
    }
    debug(0) << "================\n";
}

void Partitioner::disp_pipeline_bounds() {
    debug(0) << "\n================\n"
             << "Pipeline bounds:\n"
             << "================\n";
    disp_regions(pipeline_bounds);
    debug(0) << "===============\n";
}

Cost Partitioner::get_pipeline_cost() {
    internal_assert(!group_costs.empty());

    Cost total_cost(0, 0);
    for (const pair<const FStage, Group> &g : groups) {
        const GroupAnalysis &analysis = get_element(group_costs, g.first);
        if (!analysis.cost.defined()) {
            return Cost();
        }
        total_cost.arith += analysis.cost.arith;
        total_cost.memory += analysis.cost.memory;
    }
    total_cost.simplify();
    return total_cost;
}

void Partitioner::disp_pipeline_costs() {
    internal_assert(!group_costs.empty());
    Cost total_cost(0, 0);
    debug(0) << "\n===============\n"
             << "Pipeline costs:\n"
             << "===============\n"
             << "Group: (name) [arith cost, mem cost, parallelism]\n";
    for (const pair<const FStage, Group> &g : groups) {
        const GroupAnalysis &analysis = get_element(group_costs, g.first);
        if (!total_cost.arith.defined()) {
            continue;
        } else if (!analysis.cost.arith.defined()) {
            total_cost.arith = Expr();
        } else {
            total_cost.arith += analysis.cost.arith;
        }

        if (!total_cost.memory.defined()) {
            continue;
        } else if (!analysis.cost.memory.defined()) {
            total_cost.memory = Expr();
        } else {
            total_cost.memory += analysis.cost.memory;
        }

        debug(0) << "Group: " << g.first << " [";
        debug(0) << analysis.cost.arith << ", " << analysis.cost.memory
                 << ", " << analysis.parallelism << "]\n";
    }
    total_cost.simplify();
    debug(0) << "Total arithmetic cost: " << total_cost.arith << "\n"
             << "Total memory cost: " << total_cost.memory << "\n"
             << "===============\n";
}

// Construct a partitioner and build the pipeline graph on which the grouping
// algorithm operates.
Partitioner::Partitioner(const map<string, Box> &_pipeline_bounds,
                         const MachineParams &_arch_params,
                         const vector<Function> &_outputs,
                         DependenceAnalysis &_dep_analysis,
                         RegionCosts &_costs)
    : pipeline_bounds(_pipeline_bounds), arch_params(_arch_params),
      dep_analysis(_dep_analysis), costs(_costs), outputs(_outputs) {
    // Place each stage of a function in its own group. Each stage is
    // a node in the pipeline graph.
    for (const auto &f : dep_analysis.env) {
        if (!pipeline_bounds.count(f.first)) {
            // If a function does not have a pipeline bound (i.e. it can be
            // statically proven that no one ever uses it), we should not
            // consider it during the grouping.
            debug(5) << "Creating partitioner: ignore function \"" << f.first
                     << "\" since it has empty pipeline bounds\n";
            continue;
        }
        int num_stages = f.second.updates().size() + 1;
        for (int s = 0; s < num_stages; s++) {
            FStage stg(f.second, s);
            Group g(stg, {stg});
            groups.insert(make_pair(stg, g));
        }
    }

    // Find the consumers of each function and use it to populate the children map.
    for (const auto &f : dep_analysis.env) {
        int num_stages = f.second.updates().size() + 1;
        for (int s = 0; s < num_stages; s++) {
            set<string> parents = get_parents(f.second, s);
            for (const string &c : parents) {
                // Filter out the calls to pipeline inputs. 'env' only contains
                // the functions computed and not the inputs.
                auto iter = dep_analysis.env.find(c);
                if ((c != f.first) && (iter != dep_analysis.env.end())) {
                    // Consumer depends only on the last stage of a producer
                    // with multiple stages.
                    const Function &prod_func = iter->second;
                    int final_stage = prod_func.updates().size();

                    FStage prod_stage(prod_func, final_stage);
                    FStage cons_stage(f.second, s);
                    children[prod_stage].insert(cons_stage);
                }
            }

            if (s > 0) {
                // Update the children map to reflect the dependencies between
                // different stages of the same function.
                FStage prod_stage(f.second, s - 1);
                FStage cons_stage(f.second, s);
                children[prod_stage].insert(cons_stage);
            }
        }
    }
}

void Partitioner::initialize_groups() {
    for (pair<const FStage, Group> &g : groups) {
        pair<map<string, Expr>, GroupAnalysis> best = find_best_tile_config(g.second);
        g.second.tile_sizes = best.first;
        group_costs.emplace(g.second.output, best.second);
    }
    grouping_cache.clear();
}

map<string, Expr> Partitioner::evaluate_reuse(const FStage &stg,
                                              const set<string> &prods) {
    map<string, Expr> reuse;
    Function f = stg.func;

    // TODO: Check if tile size of 1 in each dimension gives a reasonable
    // answer or reuse should be evaluated at a much larger granularity or
    // symbolically. Using a symbolic version might be better if the objective
    // is to prove the dimension has no reuse. The only downside with the
    // symbolic method is that it is totally at the mercy of the simplifier.
    // Another option is sampling or using a larger granularity.
    map<string, Expr> tile_sizes;

    const vector<Dim> &dims = get_stage_dims(stg.func, stg.stage_num);
    for (int d = 0; d < (int)dims.size() - 1; d++) {
        tile_sizes[dims[d].var] = 1;
    }

    DimBounds bounds = get_bounds_from_tile_sizes(stg, tile_sizes);

    vector<map<string, Box>> reuse_regions =
        dep_analysis.overlap_regions(stg.func, stg.stage_num, bounds, prods,
                                     false, &costs.input_estimates);

    for (int d = 0; d < (int)dims.size() - 1; d++) {
        Expr total_reuse = make_zero(Int(64));
        if (debug::debug_level() >= 3) {
            disp_regions(reuse_regions[d]);
        }
        for (const auto &reg : reuse_regions[d]) {
            Expr size = box_size(reg.second);
            if (!size.defined()) {
                total_reuse = Expr();
                break;
            } else {
                total_reuse += size;
            }
        }
        reuse.emplace(dims[d].var, simplify(total_reuse));
    }

    return reuse;
}

vector<pair<Partitioner::GroupingChoice, Partitioner::GroupConfig>>
Partitioner::choose_candidate_grouping(const vector<pair<string, string>> &cands,
                                       Partitioner::Level level) {
    vector<pair<GroupingChoice, GroupConfig>> best_grouping;
    Expr best_benefit = make_zero(Int(64));
    for (const auto &p : cands) {
        // Compute the aggregate benefit of inlining into all the children.
        vector<pair<GroupingChoice, GroupConfig>> grouping;

        const Function &prod_f = get_element(dep_analysis.env, p.first);
        int final_stage = prod_f.updates().size();

        FStage prod(prod_f, final_stage);

        for (const FStage &c : get_element(children, prod)) {
            GroupConfig best_config;
            GroupingChoice cand_choice(prod_f.name(), c);

            // Check if the candidate has been evaluated for grouping before
            const auto &iter = grouping_cache.find(cand_choice);
            if (iter != grouping_cache.end()) {
                best_config = iter->second;
            } else {
                best_config = evaluate_choice(cand_choice, level);
                // Cache the result of the evaluation for the pair
                grouping_cache.emplace(cand_choice, best_config);
            }

            grouping.emplace_back(cand_choice, best_config);
        }

        bool no_redundant_work = false;
        Expr overall_benefit = estimate_benefit(grouping, no_redundant_work, true);

        debug(3) << "Candidate grouping:\n";
        for (const auto &g : grouping) {
            debug(3) << "  " << g.first;
        }
        debug(3) << "Candidate benefit: " << overall_benefit << "\n";
        // TODO: The grouping process can be non-deterministic when the costs
        // of two choices are equal
        if (overall_benefit.defined() && can_prove(best_benefit < overall_benefit)) {
            best_grouping = grouping;
            best_benefit = overall_benefit;
        }
    }

    debug(3) << "\nBest grouping:\n";
    for (const auto &g : best_grouping) {
        debug(3) << "  " << g.first;
    }
    if (!best_grouping.empty()) {
        debug(3) << "Best benefit: " << best_benefit << "\n";
    }

    return best_grouping;
}

inline bool operator==(const map<string, Expr> &m1, const map<string, Expr> &m2) {
    if (m1.size() != m2.size()) {
        return false;
    }
    for (const auto &it1 : m1) {
        const auto &it2 = m2.find(it1.first);
        if (it2 == m2.end()) {
            return false;
        } else if (!equal(it1.second, it2->second)) {
            return false;
        }
    }
    return true;
}

vector<map<string, Expr>> Partitioner::generate_tile_configs(const FStage &stg) {
    // TODO: This is a wart due to the cost model not taking vectorization
    // and pre-fetching into account. Ensuring the innermost dimension has
    // at least size of 64 gives enough values for vectorization and can help
    // with prefetching. This also interacts with the number of parallel tasks
    // that are generated.
    int min_inner_dim_size = 64;

    const vector<Dim> &dims = get_stage_dims(stg.func, stg.stage_num);

    // Get the dimensions that are going to be tiled in this stage.
    // Skipping rvars for now.
    vector<string> tile_vars;
    for (int d = 0; d < (int)dims.size() - 1; d++) {
        if (!dims[d].is_rvar()) {
            tile_vars.push_back(dims[d].var);
        }
    }

    vector<int> size_variants = {1, 4, 8, 16, 32, 64, 128, 256};
    vector<map<string, Expr>> tile_configs;

    // For all the tile configurations generated, we force the innermost dimension
    // to be at least of size 64 to ensure enough values for vectorization.

    // Skewed tile configurations
    for (size_t i = 0; i < tile_vars.size(); i++) {
        for (const auto &dim_size : size_variants) {
            map<string, Expr> tiling;
            tiling.emplace(tile_vars[i],
                           (i == 0) ? std::max(dim_size, min_inner_dim_size) : dim_size);
            for (size_t j = 0; j < tile_vars.size(); j++) {
                if (j < i) {
                    tiling.emplace(tile_vars[j], size_variants[size_variants.size() - 1]);
                } else if (j > i) {
                    tiling.emplace(tile_vars[j], size_variants[0]);
                }
            }
            if (!tiling.empty()) {
                bool is_duplicate =
                    std::find_if(tile_configs.begin(), tile_configs.end(),
                                 [&tiling](const map<string, Expr> &m) { return (tiling == m); }) != tile_configs.end();
                if (!is_duplicate) {
                    tile_configs.push_back(tiling);
                }
            }
        }
    }

    // Almost square tile configurations
    for (const auto &dim_size : size_variants) {
        map<string, Expr> tiling;
        for (size_t j = 0; j < tile_vars.size(); j++) {
            tiling.emplace(tile_vars[j],
                           (j == 0) ? std::max(dim_size, min_inner_dim_size) : dim_size);
        }
        if (!tiling.empty()) {
            bool is_duplicate =
                std::find_if(tile_configs.begin(), tile_configs.end(),
                             [&tiling](const map<string, Expr> &m) { return (tiling == m); }) != tile_configs.end();
            if (!is_duplicate) {
                tile_configs.push_back(tiling);
            }
        }
    }

    // Reorder tile configurations
    for (int i = 0; i < (1 << (tile_vars.size())); i++) {
        map<string, Expr> tiling;
        for (size_t j = 0; j < tile_vars.size(); j++) {
            if (((i >> (j)) & 1) == 1) {
                if (j == 0) {
                    tiling.emplace(tile_vars[j], min_inner_dim_size);
                } else {
                    tiling.emplace(tile_vars[j], 1);
                }
            }
        }
        if (!tiling.empty()) {
            bool is_duplicate =
                std::find_if(tile_configs.begin(), tile_configs.end(),
                             [&tiling](const map<string, Expr> &m) { return (tiling == m); }) != tile_configs.end();
            if (!is_duplicate) {
                tile_configs.push_back(tiling);
            }
        }
    }

    return tile_configs;
}

pair<map<string, Expr>, Partitioner::GroupAnalysis>
Partitioner::find_best_tile_config(const Group &g) {
    // Initialize to no tiling
    map<string, Expr> no_tile_config;
    Group no_tile = g;
    no_tile.tile_sizes = no_tile_config;

    bool show_analysis = false;
    GroupAnalysis no_tile_analysis = analyze_group(no_tile, show_analysis);

    GroupAnalysis best_analysis = no_tile_analysis;
    map<string, Expr> best_config = no_tile_config;
    if (!best_analysis.cost.defined()) {
        return make_pair(best_config, best_analysis);
    }

    // Generate tiling configurations
    vector<map<string, Expr>> configs = generate_tile_configs(g.output);

    Group best_group = g;
    for (const auto &config : configs) {
        Group new_group = g;
        new_group.tile_sizes = config;

        GroupAnalysis new_analysis = analyze_group(new_group, show_analysis);

        bool no_redundant_work = false;
        Expr benefit = estimate_benefit(best_analysis, new_analysis,
                                        no_redundant_work, true);

        if (show_analysis) {
            debug(0) << "Benefit relative to not tiling:" << benefit << "\n";
            debug(0) << "Best analysis:" << new_analysis;
            debug(0) << "No tile analysis:" << no_tile_analysis;
            debug(0)
                << "arith cost:" << cast<float>(new_analysis.cost.arith / no_tile_analysis.cost.arith)
                << ", mem cost:" << cast<float>(new_analysis.cost.memory / no_tile_analysis.cost.memory) << "\n";
        }

        if (benefit.defined() && can_prove(benefit > 0)) {
            best_config = config;
            best_analysis = new_analysis;
            best_group = new_group;
        }
    }

    return make_pair(best_config, best_analysis);
}

void Partitioner::group(Partitioner::Level level) {
    bool fixpoint = false;
    while (!fixpoint) {
        Cost pre_merge = get_pipeline_cost();

        fixpoint = true;
        vector<pair<string, string>> cand;
        for (const pair<const FStage, Group> &g : groups) {
            bool is_output = false;
            for (const Function &f : outputs) {
                if (g.first.func.name() == f.name()) {
                    is_output = true;
                    break;
                }
            }

            // All stages of a function are computed at a single location.
            // The last stage of the function represents the candidate choice
            // of grouping the function into a consumer.

            const Function &prod_f = get_element(dep_analysis.env, g.first.func.name());
            bool is_final_stage = (g.first.stage_num == prod_f.updates().size());

            if (is_output || !is_final_stage) {
                continue;
            }

            const auto &iter = children.find(g.first);
            if (iter != children.end()) {
                // All the stages belonging to a function are considered to be a
                // single child.
                set<string> child_groups;
                for (const FStage &s : iter->second) {
                    child_groups.insert(s.func.name());
                }

                int num_children = child_groups.size();
                // Only groups with a single child are considered for grouping
                // when grouping for computing in tiles.
                // TODO: The current scheduling model does not allow functions
                // to be computed at different points.
                if ((num_children == 1) && (level == Partitioner::Level::FastMem)) {
                    const string &prod_name = prod_f.name();
                    const string &cons_name = (*child_groups.begin());
                    cand.emplace_back(prod_name, cons_name);
                } else if ((level == Partitioner::Level::Inline) && prod_f.is_pure()) {
                    const string &prod_name = prod_f.name();
                    cand.emplace_back(prod_name, "");
                }
            }
        }

        debug(3) << "\n============================\n"
                 << "Current grouping candidates:\n"
                 << "============================\n";
        for (size_t i = 0; i < cand.size(); ++i) {
            debug(3) << "{" << cand[i].first << ", " << cand[i].second << "}\n";
        }

        vector<pair<GroupingChoice, GroupConfig>> best = choose_candidate_grouping(cand, level);
        if (best.empty()) {
            continue;
        } else {
            fixpoint = false;
        }

        // The following code makes the assumption that all the stages of a function
        // will be in the same group. 'choose_candidate_grouping' ensures that the
        // grouping choice being returned adheres to this constraint.
        const string &prod = best[0].first.prod;

        const Function &prod_f = get_element(dep_analysis.env, prod);
        size_t num_stages = prod_f.updates().size() + 1;

        FStage final_stage(prod_f, num_stages - 1);
        set<FStage> prod_group_children = get_element(children, final_stage);

        // Invalidate entries of the grouping cache
        set<GroupingChoice> invalid_keys;
        for (const auto &c : prod_group_children) {
            for (const auto &entry : grouping_cache) {
                if ((entry.first.prod == c.func.name()) || (entry.first.cons == c)) {
                    invalid_keys.insert(entry.first);
                }
            }
        }
        for (const auto &key : invalid_keys) {
            grouping_cache.erase(key);
        }

        for (const auto &group : best) {
            internal_assert(group.first.prod == prod);
            merge_groups(group.first, group.second, level);
        }

        for (size_t s = 0; s < num_stages; s++) {
            FStage prod_group(prod_f, s);
            groups.erase(prod_group);
            group_costs.erase(prod_group);

            // Update the children mapping
            children.erase(prod_group);
            for (auto &f : children) {
                set<FStage> &cons = f.second;
                auto iter = cons.find(prod_group);
                if (iter != cons.end()) {
                    cons.erase(iter);
                    // For a function with multiple stages, all the stages will
                    // be in the same group and the consumers of the function
                    // only depend on the last stage. Therefore, when the
                    // producer group has multiple stages, parents of the
                    // producers should point to the consumers of the last
                    // stage of the producer.
                    cons.insert(prod_group_children.begin(), prod_group_children.end());
                }
            }
        }

        Cost post_merge = get_pipeline_cost();
        if (debug::debug_level() >= 3) {
            disp_pipeline_costs();
        }
    }
}

DimBounds Partitioner::get_bounds(const FStage &s) {
    DimBounds bounds;

    const vector<string> &args = s.func.args();
    for (size_t d = 0; d < args.size(); d++) {
        bounds[args[d]] = get_element(pipeline_bounds, s.func.name())[d];
    }

    return get_stage_bounds(s.func, s.stage_num, bounds);
}

DimBounds Partitioner::get_bounds_from_tile_sizes(const FStage &s,
                                                  const map<string, Expr> &tile_sizes) {
    map<string, Interval> bounds;

    const map<string, Interval> &def_bounds = get_bounds(s);
    const vector<Dim> &dims = get_stage_dims(s.func, s.stage_num);

    for (int d = 0; d < (int)dims.size() - 1; d++) {
        string var = dims[d].var;
        const Interval &bound = get_element(def_bounds, var);
        const auto &iter = tile_sizes.find(var);
        if (iter != tile_sizes.end()) {
            const Expr &size = iter->second;
            // Check if the bounds allow for tiling with the given tile size,
            // i.e. ensure at least 2 tiles
            Expr extent = get_extent(bound);
            internal_assert(extent.defined());
            if (can_prove(extent >= 2 * size)) {
                // TODO: Maybe shift this to the center of the pipeline bound
                bounds[var] = Interval(0, simplify(size - 1));
            } else {
                // If the dimension is too small, do not tile it and set the
                // extent of the bounds to that of the dimension estimate
                bounds[var] = bound;
            }
        } else {
            bounds[var] = bound;
        }
    }

    return bounds;
}

Partitioner::GroupAnalysis Partitioner::analyze_group(const Group &g, bool show_analysis) {
    set<string> group_inputs;
    set<string> group_members;

    for (const auto &stg : g.members) {
        group_members.insert(stg.func.name());
        set<string> parents = get_parents(stg.func, stg.stage_num);
        for (const auto &c : parents) {
            bool is_member = false;
            for (const auto &m : g.members) {
                if (m.func.name() == c) {
                    is_member = true;
                    break;
                }
            }
            if (!is_member) {
                group_inputs.insert(c);
            }
        }
    }

    // Count the number of tiles
    Expr estimate_tiles = make_one(Int(64));
    Expr parallelism = make_one(Int(64));

    if (!g.output.func.has_extern_definition()) {
        // Get the definition corresponding to the group output
        Definition def = get_stage_definition(g.output.func, g.output.stage_num);
        const vector<Dim> &dims = def.schedule().dims();

        DimBounds stg_bounds = get_bounds(g.output);

        for (int d = 0; d < (int)dims.size() - 1; d++) {
            const string &var = dims[d].var;
            const auto &iter = g.tile_sizes.find(var);
            if (iter != g.tile_sizes.end()) {
                const Expr &size = iter->second;
                Expr extent = get_extent(get_element(stg_bounds, var));
                if (!extent.defined()) {
                    return GroupAnalysis();
                }

                Expr dim_tiles = simplify((extent + size - 1) / size);
                estimate_tiles *= dim_tiles;
                // Since all Vars are inherently parallelizable by construct, we
                // only need to take RVars into account for the analysis.
                if (can_parallelize_rvar(var, g.output.func.name(), def)) {
                    parallelism *= dim_tiles;
                }
            }
        }
    }

    // Get the regions of the pipeline required to compute a tile of the group
    DimBounds tile_bounds = get_bounds_from_tile_sizes(g.output, g.tile_sizes);

    map<string, Box> alloc_regions = dep_analysis.regions_required(
        g.output.func, g.output.stage_num, tile_bounds, group_members, false, &costs.input_estimates);

    map<string, Box> compute_regions = dep_analysis.regions_required(
        g.output.func, g.output.stage_num, tile_bounds, group_members, true, &costs.input_estimates);

    map<string, Box> group_reg, prod_reg, input_reg;

    // Separating into regions that computed within the group and regions that
    // are input to the group
    for (const auto &reg : compute_regions) {
        if ((group_members.find(reg.first) != group_members.end()) &&
            (reg.first != g.output.func.name())) {
            group_reg.emplace(reg.first, reg.second);
        } else if (group_inputs.find(reg.first) != group_inputs.end()) {
            if (dep_analysis.env.find(reg.first) != dep_analysis.env.end()) {
                prod_reg.emplace(reg.first, reg.second);
            } else {
                input_reg.emplace(reg.first, reg.second);
            }
        }
    }

    // Aggregate costs for intermediate functions in a tile and the
    // tile output
    Cost tile_cost = costs.region_cost(group_reg, g.inlined);
    if (!tile_cost.defined()) {
        return GroupAnalysis();
    }

    Cost out_cost = costs.stage_region_cost(g.output.func.name(),
                                            g.output.stage_num,
                                            tile_bounds, g.inlined);
    if (!out_cost.defined()) {
        return GroupAnalysis();
    }

    for (const auto &reg : alloc_regions) {
        if (!box_size(reg.second).defined()) {
            return GroupAnalysis();
        }
    }

    Cost group_cost(simplify(tile_cost.arith + out_cost.arith),
                    simplify(tile_cost.memory + out_cost.memory));

    // Detailed load costs for all the group intermediates
    map<string, Expr> group_load_costs =
        costs.detailed_load_costs(group_reg, g.inlined);

    map<string, Expr> out_load_costs =
        costs.stage_detailed_load_costs(g.output.func.name(),
                                        g.output.stage_num,
                                        tile_bounds, g.inlined);

    combine_load_costs(group_load_costs, out_load_costs);

    Box out_tile_extent;
    if (g.output.stage_num == 0) {
        const vector<string> &args = g.output.func.args();
        for (size_t d = 0; d < args.size(); d++) {
            const auto &iter = tile_bounds.find(args[d]);
            if (iter != tile_bounds.end()) {
                out_tile_extent.push_back(iter->second);
            } else {
                out_tile_extent.push_back(Interval());
            }
        }
    }

    Cost per_tile_cost(group_cost.arith, make_zero(Int(64)));

    // This is the old cost model; keeping it here for reference, for now.
    /*
    if (tile_inter_size > arch_params.l1_size) {
        // Conservative estimate of accesses to memory
        //per_tile_mem_cost = tile_inter_size;
        // Aggressive estimate of accesses to memory
        per_tile_mem_cost = tile_cost.second;
    } else {
        // The tile_input_size captures the region of the input
        // required to compute the tile. However, all of it many not be
        // accessed during the computation of the tile when the access
        // is sparse. A better estimate is given by the smaller of
        // the number of memory accesses and the region size
        per_tile_mem_cost = std::min(tile_input_size + tile_output_size,
                                     tile_cost.second);
    }*/

    // TODO: Use smooth step curve from Jon to better model cache behavior,
    // where each step corresponds to different cache level.
    //
    // The current cost model drops off linearly. Larger memory footprint is
    // penalized more than smaller memory footprint (since smaller one can fit
    // more in the cache). The cost is clamped at 'balance', which is roughly at
    // memory footprint equal to or larger than the last level cache size.

    // If 'model_reuse' is set, the cost model should take into account memory
    // reuse within the tile, e.g. matrix multiply reuses inputs multiple times.
    // TODO: Implement a better reuse model.
    bool model_reuse = false;

    // Linear dropoff
    float load_slope = arch_params.balance / arch_params.last_level_cache_size;
    for (const auto &f_load : group_load_costs) {
        internal_assert(g.inlined.find(f_load.first) == g.inlined.end())
            << "Intermediates of inlined pure function \"" << f_load.first
            << "\" should not have been in the group_load_costs\n";

        const auto &alloc_reg = get_element(alloc_regions, f_load.first);

        Expr footprint;
        bool is_group_member = (group_members.find(f_load.first) != group_members.end());
        bool is_output = (f_load.first == g.output.func.name());

        // We use allocated region as conservative estimate of the footprint since
        // the loads could be from any random locations of the allocated regions.

        if (!is_output && is_group_member) {
            footprint = costs.region_size(f_load.first, alloc_reg);
        } else {
            Expr initial_footprint;
            const auto &f_load_pipeline_bounds = get_element(pipeline_bounds, f_load.first);

            bool is_function = (dep_analysis.env.find(f_load.first) != dep_analysis.env.end());
            if (!is_function) {  // It is a load to some input buffer
                // Initial loads
                initial_footprint = costs.input_region_size(f_load.first, f_load_pipeline_bounds);
                // Subsequent loads
                footprint = costs.input_region_size(f_load.first, alloc_reg);
            } else if (is_output) {  // Load to the output function of the group
                internal_assert(is_group_member)
                    << "Output " << f_load.first << " should have been a group member\n";
                // Initial loads
                initial_footprint = costs.region_size(f_load.first, f_load_pipeline_bounds);
                // Subsequent loads
                footprint = costs.region_size(f_load.first, out_tile_extent);
            } else {  // Load to some non-member function (i.e. function from other groups)
                // Initial loads
                initial_footprint = costs.region_size(f_load.first, f_load_pipeline_bounds);
                // Subsequent loads
                footprint = costs.region_size(f_load.first, alloc_reg);
            }

            if (model_reuse) {
                Expr initial_factor =
                    cast<int64_t>(min(1 + initial_footprint * load_slope, arch_params.balance));
                per_tile_cost.memory += initial_factor * footprint;
            } else {
                footprint = initial_footprint;
            }

            if (!footprint.defined()) {
                return GroupAnalysis();
            }
        }

        Expr cost_factor = cast<int64_t>(min(1 + footprint * load_slope, arch_params.balance));
        per_tile_cost.memory += cost_factor * f_load.second;
    }

    if (show_analysis) {
        debug(0) << "\nDetailed loads:\n";
        for (const auto &f_load : group_load_costs) {
            debug(0) << "(" << f_load.first << "," << f_load.second << ")";
        }
        debug(0) << "\n";

        debug(0) << "\nPer tile memory cost:" << per_tile_cost.memory << "\n";
        debug(0) << "Per tile arith cost:" << per_tile_cost.arith << "\n";
    }

    GroupAnalysis g_analysis(
        Cost(per_tile_cost.arith * estimate_tiles, per_tile_cost.memory * estimate_tiles),
        parallelism);
    g_analysis.simplify();

    return g_analysis;
}

Partitioner::Group Partitioner::merge_groups(const Group &prod_group,
                                             const Group &cons_group) {
    vector<FStage> group_members;
    for (const auto &s : prod_group.members) {
        group_members.push_back(s);
    }
    for (const auto &s : cons_group.members) {
        group_members.push_back(s);
    }

    Group group(cons_group.output, group_members);

    for (const auto &f : prod_group.inlined) {
        group.inlined.insert(f);
    }
    for (const auto &f : cons_group.inlined) {
        group.inlined.insert(f);
    }

    return group;
}

void Partitioner::merge_groups(const GroupingChoice &choice, const GroupConfig &eval,
                               Partitioner::Level level) {
    const Function &prod_f = get_element(dep_analysis.env, choice.prod);
    size_t num_stages = prod_f.updates().size() + 1;

    const FStage &child = choice.cons;
    Group &child_group = get_element(groups, child);

    for (size_t s = 0; s < num_stages; s++) {
        FStage cand(prod_f, s);
        Group &cand_group = get_element(groups, cand);
        child_group.members.insert(child_group.members.end(),
                                   cand_group.members.begin(),
                                   cand_group.members.end());

        if (level == Partitioner::Level::Inline) {
            for (const auto &stg : cand_group.members) {
                child_group.inlined.insert(stg.func.name());
            }
        } else {
            for (const auto &in : cand_group.inlined) {
                child_group.inlined.insert(in);
            }
        }
    }

    child_group.tile_sizes = eval.tile_sizes;

    // Update group costs.
    // We could just reuse the analysis from 'eval' since it was computed
    // by assuming the merge had happened.
    group_costs[child] = eval.analysis;
}

Partitioner::GroupConfig Partitioner::evaluate_choice(const GroupingChoice &choice,
                                                      Partitioner::Level level) {
    // Create a group that reflects the grouping choice and evaluate the cost
    // of the group.
    const Function &prod_f = get_element(dep_analysis.env, choice.prod);
    int num_prod_stages = prod_f.updates().size() + 1;
    vector<Group> prod_groups;

    for (int s = 0; s < num_prod_stages; s++) {
        FStage prod_s(prod_f, s);
        prod_groups.push_back(get_element(groups, prod_s));
    }

    Group cons = get_element(groups, choice.cons);
    Group group = cons;
    for (const auto &prod_g : prod_groups) {
        group = merge_groups(prod_g, group);
    }

    GroupAnalysis group_analysis;
    map<string, Expr> best_tile_config;

    if (level == Partitioner::Level::Inline) {
        // Set the tile sizes to one along all dimensions of the consumer group
        map<string, Expr> tile_sizes;

        const Function &cons_f = cons.output.func;

        const vector<Dim> &dims = get_stage_dims(cons_f, cons.output.stage_num);
        for (int d = 0; d < (int)dims.size() - 1; d++) {
            tile_sizes[dims[d].var] = 1;
        }

        group.tile_sizes = tile_sizes;

        for (const auto &prod_g : prod_groups) {
            for (const FStage &s : prod_g.members) {
                group.inlined.insert(s.func.name());
            }
        }

        for (const string &f : cons.inlined) {
            group.inlined.insert(f);
        }

        group_analysis = analyze_group(group, false);
        best_tile_config = tile_sizes;

    } else {
        pair<map<string, Expr>, GroupAnalysis> config = find_best_tile_config(group);
        best_tile_config = config.first;
        group_analysis = config.second;
    }

    return GroupConfig(best_tile_config, group_analysis);
}

Expr Partitioner::estimate_benefit(const GroupAnalysis &old_grouping,
                                   const GroupAnalysis &new_grouping,
                                   bool no_redundant_work,
                                   bool ensure_parallelism) const {
    // TODO: Instead of having a hard parallelism constraint, it may be better
    // to consider other metric, such as arith_cost/parallelism
    if (ensure_parallelism &&
        (!new_grouping.parallelism.defined() ||
         !can_prove(new_grouping.parallelism >= arch_params.parallelism))) {
        return Expr();
    }

    if (!old_grouping.cost.defined() || !new_grouping.cost.defined()) {
        return Expr();
    }

    Expr arith_benefit = old_grouping.cost.arith - new_grouping.cost.arith;
    if (no_redundant_work && !can_prove(arith_benefit >= 0)) {
        return Expr();
    }
    Expr mem_benefit = old_grouping.cost.memory - new_grouping.cost.memory;
    return simplify(mem_benefit + arith_benefit);
}

Expr Partitioner::estimate_benefit(
    const vector<pair<GroupingChoice, GroupConfig>> &new_grouping,
    bool no_redundant_work, bool ensure_parallelism) const {

    set<FStage> old_groups;

    GroupAnalysis new_group_analysis(Cost(0, 0), Int(64).max());
    for (const auto &g : new_grouping) {
        const Function &prod_f = get_element(dep_analysis.env, g.first.prod);
        int num_prod_stages = prod_f.updates().size() + 1;
        for (int s = 0; s < num_prod_stages; s++) {
            FStage prod_s(prod_f, s);
            old_groups.insert(prod_s);
        }

        old_groups.insert(g.first.cons);

        GroupAnalysis analysisg = g.second.analysis;
        if (analysisg.defined()) {
            new_group_analysis.cost.arith += analysisg.cost.arith;
            new_group_analysis.cost.memory += analysisg.cost.memory;
            new_group_analysis.parallelism = min(new_group_analysis.parallelism,
                                                 analysisg.parallelism);
        } else {
            new_group_analysis.cost = Cost();
            new_group_analysis.parallelism = Expr();
            break;
        }
    }
    new_group_analysis.simplify();

    GroupAnalysis old_group_analysis(Cost(0, 0), Int(64).max());
    for (const auto &g : old_groups) {
        const auto &iter = group_costs.find(g);
        internal_assert(iter != group_costs.end());
        GroupAnalysis analysisg = iter->second;
        if (analysisg.defined()) {
            old_group_analysis.cost.arith += analysisg.cost.arith;
            old_group_analysis.cost.memory += analysisg.cost.memory;
            old_group_analysis.parallelism = min(old_group_analysis.parallelism,
                                                 analysisg.parallelism);
        } else {
            old_group_analysis.cost = Cost();
            old_group_analysis.parallelism = Expr();
            break;
        }
    }
    old_group_analysis.simplify();

    return estimate_benefit(old_group_analysis, new_group_analysis,
                            no_redundant_work, ensure_parallelism);
}

map<string, Expr> Partitioner::bounds_to_estimates(const DimBounds &bounds) {
    map<string, Expr> estimates;
    for (const auto &bound : bounds) {
        estimates.emplace(bound.first, get_extent(bound.second));
    }
    return estimates;
}

map<FStage, map<string, Box>> Partitioner::group_storage_bounds() {
    map<FStage, map<string, Box>> group_storage_bounds;
    for (const pair<const FStage, Group> &gpair : groups) {
        const Group &g = gpair.second;
        DimBounds bounds = get_bounds_from_tile_sizes(g.output, g.tile_sizes);

        set<string> prods;
        for (const FStage &s : g.members) {
            prods.insert(s.func.name());
        }

        map<string, Box> reg_alloc =
            dep_analysis.regions_required(g.output.func, g.output.stage_num,
                                          bounds, prods, false, &costs.input_estimates);
        map<string, Box> group_alloc;
        for (const FStage &s : g.members) {
            const auto &iter = reg_alloc.find(s.func.name());
            if ((iter != reg_alloc.end()) && (s.func.name() != g.output.func.name())) {
                group_alloc[s.func.name()] = iter->second;
            }
        }

        group_storage_bounds[gpair.first] = group_alloc;
    }

    return group_storage_bounds;
}

map<FStage, map<FStage, DimBounds>> Partitioner::group_loop_bounds() {
    map<FStage, map<FStage, DimBounds>> group_bounds;
    for (const pair<const FStage, Group> &gpair : groups) {
        Group g = gpair.second;
        map<FStage, DimBounds> mem_bounds;

        DimBounds bounds = get_bounds_from_tile_sizes(g.output, g.tile_sizes);

        set<string> prods;
        for (const FStage &s : g.members) {
            prods.insert(s.func.name());
        }

        map<string, Box> reg_computed =
            dep_analysis.regions_required(g.output.func, g.output.stage_num,
                                          bounds, prods, true, &costs.input_estimates);

        for (const FStage &s : g.members) {
            const auto &iter = reg_computed.find(s.func.name());
            if (iter != reg_computed.end()) {
                map<string, Expr> tile_sizes;
                const vector<string> &args = s.func.args();
                for (size_t arg = 0; arg < args.size(); arg++) {
                    tile_sizes[args[arg]] = get_extent(iter->second[arg]);
                }
                mem_bounds[s] = get_bounds_from_tile_sizes(s, tile_sizes);
            }
        }

        group_bounds[gpair.first] = mem_bounds;
    }

    return group_bounds;
}

// We need to get the base name of the dimension for scheduling (i.e. it
// can't have any dots). For example, in split case, if "x" is the starting
// dimension name, after split(x, x0, xi, ...), we will end up with something
// like "x.x0" and  "x.xi". If we want to later schedule "x.x0", we need to
// pass "x0" instead of "x.x0".
string get_base_name(string name) {
    size_t dot_pos = name.rfind('.');
    if (dot_pos != string::npos) {
        return name.substr(dot_pos + 1);
    }
    return name;
}

pair<VarOrRVar, VarOrRVar> Partitioner::split_dim(
    const Group &g, Stage f_handle, int stage_num, const Definition &def,
    bool is_group_output, const VarOrRVar &v, const Expr &factor, const string &in_suffix,
    const string &out_suffix, map<string, Expr> &estimates, AutoSchedule &sched) {
    // Create new variables for the split dimensions
    string arg_name = v.name();
    string inner_name = arg_name + in_suffix;
    string outer_name = arg_name + out_suffix;
    VarOrRVar inner(inner_name, v.is_rvar), outer(outer_name, v.is_rvar);

    {
        const auto &iter = sched.internal_vars.find(inner.name());
        if (iter == sched.internal_vars.end()) {
            sched.internal_vars.emplace(inner.name(), inner);
        } else {
            internal_assert(iter->second.is_rvar == inner.is_rvar);
        }
    }
    {
        const auto &iter = sched.internal_vars.find(outer.name());
        if (iter == sched.internal_vars.end()) {
            sched.internal_vars.emplace(outer.name(), outer);
        } else {
            internal_assert(iter->second.is_rvar == outer.is_rvar);
        }
    }

    // The default tail strategy is good enough for most use cases (see docs on
    // TailStrategy::Auto). However, the default of pure vars in update definitions
    // is RoundUp, which may introduces an out-of-bound error if it is an access
    // to inputs or outputs.
    //
    // We could have just used GuardWithIf when splitting pure vars in update
    // definition to ensure no out-of-bounds error. However, this is only
    // necessary, if the update definition involves accesses to inputs or outputs.
    // For other accesses, we could potentially use a more aggressive tail strategy
    // such as RoundUp or ShiftInwards. Note that if we use RoundUp or ShiftInwards,
    // any nested loops (generated by compute_at) will be affected as well. However,
    // since in the current auto-scheduler model, we always compute_at at the group
    // output, if the update definition is not the group output, we do not need to
    // care for the nested loops. If it is the update definition of the group output
    // however, we'd better make sure that no other member of the groups accesses
    // the inputs or outputs.

    TailStrategy strategy = TailStrategy::Auto;
    if ((stage_num > 0) && !v.is_rvar) {
        // TODO: It's safe to use RoundUp here if we know there are no
        // loads from any input, but at this point we've lost track of
        // loads from inputs that happen inside inlined Funcs.
        strategy = TailStrategy::GuardWithIf;
    }

    f_handle.split(v, outer, inner, factor, strategy);

    std::ostringstream oss;
    oss << "split(" << arg_name << ", " << outer_name << ", " << inner_name << ", " << factor;
    switch (strategy) {
    case TailStrategy::RoundUp:
        oss << ", TailStrategy::RoundUp)";
        break;
    case TailStrategy::GuardWithIf:
        oss << ", TailStrategy::GuardWithIf)";
        break;
    case TailStrategy::ShiftInwards:
        oss << ", TailStrategy::ShiftInwards)";
        break;
    case TailStrategy::Auto:
        oss << ")";
        break;
    default:
        internal_error;
    }
    sched.push_schedule(f_handle.name(), stage_num, oss.str(),
                        {arg_name, outer_name, inner_name});

    const Expr &est = get_element(estimates, arg_name);
    internal_assert(est.defined());

    estimates[inner_name] = factor;
    estimates[outer_name] = simplify((est + factor - 1) / factor);
    estimates.erase(arg_name);

    return make_pair(inner, outer);
}

void Partitioner::vectorize_stage(const Group &g, Stage f_handle, int stage_num,
                                  Definition def, const Function &func, bool is_group_output,
                                  const Target &t, set<string> &rvars,
                                  map<string, Expr> &estimates, AutoSchedule &sched) {
    vector<Dim> &dims = def.schedule().dims();
    int vec_dim_index = -1;

    // Set the vector length as the maximum of the natural vector size of all
    // values produced by the function.
    int vec_len = 0;
    for (const auto &type : func.output_types()) {
        vec_len = std::max(vec_len, t.natural_vector_size(type));
    }

    for (int d = 0; d < (int)dims.size() - 1; d++) {
        string dim_name = get_base_name(dims[d].var);
        bool can_vectorize = true;
        if (rvars.find(dim_name) != rvars.end()) {
            can_vectorize = can_parallelize_rvar(dim_name, func.name(), def);
        }
        const auto &iter = estimates.find(dim_name);
        if ((iter != estimates.end()) && iter->second.defined()) {
            if (can_vectorize && can_prove(iter->second >= vec_len)) {
                vec_dim_index = d;
                break;
            }
        }
    }

    if (vec_dim_index >= 0) {
        string vec_dim_name = get_base_name(dims[vec_dim_index].var);
        bool is_rvar = (rvars.find(vec_dim_name) != rvars.end());
        internal_assert(is_rvar == dims[vec_dim_index].is_rvar());

        VarOrRVar vec_var(vec_dim_name, is_rvar);
        pair<VarOrRVar, VarOrRVar> split_vars =
            split_dim(g, f_handle, stage_num, def, is_group_output, vec_var, vec_len,
                      "_vi", "_vo", estimates, sched);

        f_handle.vectorize(split_vars.first);
        sched.push_schedule(f_handle.name(), stage_num,
                            "vectorize(" + split_vars.first.name() + ")",
                            {split_vars.first.name()});

        if (is_rvar) {
            rvars.erase(vec_dim_name);
            rvars.insert(split_vars.first.name());
            rvars.insert(split_vars.second.name());
        }

        // TODO: Reorder vector dim to innermost if it is the innermost
        // storage dimension of the func.
        //
        // TODO: Check if the warning is necessary.
        if (vec_dim_index > 0) {
            user_warning << "Outer dim vectorization of var \"" << vec_dim_name
                         << "\" in function \"" << f_handle.name() << "\"\n";
        }
    }
}

// Return true if the vars/rvars in 'ordering' are in the same order as the
// dim list.
inline bool operator==(const vector<Dim> &dims, const vector<VarOrRVar> &ordering) {
    if (dims.size() != ordering.size() + 1) {  // The dim list also contains '__outermost'
        return false;
    }
    for (size_t i = 0; i < ordering.size(); ++i) {
        if (dims[i].var != ordering[i].name()) {
            return false;
        }
    }
    return true;
}

// Return true if the vars/rvars in 'ordering' are not in the same order as the
// dim list.
inline bool operator!=(const vector<Dim> &dims, const vector<VarOrRVar> &ordering) {
    return !(dims == ordering);
}

void Partitioner::reorder_dims(Stage f_handle, int stage_num, Definition def,
                               map<string, Expr> strides, AutoSchedule &sched) {
    vector<Dim> &dims = def.schedule().dims();
    internal_assert(dims.size() > 1);
    vector<pair<string, int>> order;

    for (int d = 0; d < (int)dims.size() - 1; d++) {
        internal_assert(strides.find(dims[d].var) != strides.end());
    }

    // Iterate until all the dimensions have been assigned an order
    while (!strides.empty()) {
        // Find the pure dimension (can be vars or rvars) with the smallest stride
        bool found_pure_dim = false;
        Expr min_pure_stride = Int(64).max();
        string min_pure_var;
        int min_pure_index = -1;
        for (int d = 0; d < (int)dims.size() - 1; d++) {
            string var_name = get_base_name(dims[d].var);
            const auto &iter = strides.find(var_name);
            if ((iter != strides.end()) && dims[d].is_pure()) {
                const Expr &dim_stride = iter->second;
                internal_assert(dim_stride.defined());
                if (can_prove(dim_stride < min_pure_stride)) {
                    min_pure_stride = dim_stride;
                    min_pure_var = var_name;
                    min_pure_index = d;
                }
                found_pure_dim = true;
            }
        }
        if (found_pure_dim && min_pure_var.empty()) {
            // Since none of the pure strides can be proven as the minimum, we
            // should break here otherwise it may cause infinite loop.
            return;
        }

        // Check if the stride of the pure dimension is smaller than
        // the first impure dimension that has not yet been assigned
        // an order
        Expr min_impure_stride = Int(64).max();
        string min_impure_var;
        int min_impure_index = -1;
        for (int d = 0; d < (int)dims.size() - 1; d++) {
            string var_name = get_base_name(dims[d].var);
            const auto &iter = strides.find(var_name);
            if ((iter != strides.end()) && !dims[d].is_pure()) {
                const Expr &dim_stride = iter->second;
                internal_assert(dim_stride.defined());
                if (can_prove(dim_stride < min_impure_stride)) {
                    min_impure_stride = dim_stride;
                    min_impure_var = var_name;
                    min_impure_index = d;
                    // Impure dimensions cannot be reordered relative to
                    // each other. Stop after encountering the first impure
                    // dimension.
                    break;
                }
            }
        }

        if (min_pure_var.empty() && min_impure_var.empty()) {
            // Since none of the pure and impure strides can be proven as the
            // minimum, we should break here otherwise it may cause infinite loop.
            return;
        }

        pair<string, int> curr_min_var;
        if (!min_impure_var.empty() && can_prove(min_impure_stride < min_pure_stride)) {
            curr_min_var.first = min_impure_var;
            curr_min_var.second = min_impure_index;
            internal_assert(dims[min_impure_index].is_rvar());
        } else {
            curr_min_var.first = min_pure_var;
            curr_min_var.second = min_pure_index;
        }

        order.push_back(curr_min_var);
        strides.erase(curr_min_var.first);
    }

    vector<VarOrRVar> ordering;
    for (const auto &o : order) {
        VarOrRVar o_var(o.first, dims[o.second].is_rvar());
        ordering.push_back(o_var);
    }

    internal_assert(!ordering.empty());
    set<string> var_list = {ordering[0].name()};
    string var_order = ordering[0].name();
    for (size_t o = 1; o < ordering.size(); o++) {
        var_order += ", " + ordering[o].name();
        var_list.insert(ordering[o].name());
    }

    if (dims != ordering) {
        f_handle.reorder(ordering);
        sched.push_schedule(f_handle.name(), stage_num, "reorder(" + var_order + ")", var_list);
    }
}

// Visitor to find all the variables the depend on a variable.
class FindVarsUsingVar : public IRVisitor {
    using IRVisitor::visit;

    void visit(const Let *let) override {
        if (expr_uses_vars(let->value, vars)) {
            vars.push(let->name);
        }
        let->value.accept(this);
        let->body.accept(this);
    }

public:
    Scope<> vars;

    FindVarsUsingVar(const string &var) {
        vars.push(var);
    }
};

void Partitioner::generate_group_cpu_schedule(
    const Group &g, const Target &t,
    const map<FStage, DimBounds> &group_loop_bounds,
    const map<string, Box> &group_storage_bounds,
    const set<string> &inlines,
    AutoSchedule &sched) {
    string out_f_name = g.output.func.name();
    Function g_out = g.output.func;

    debug(3) << "\n================\n"
             << "Scheduling group:\n"
             << "================\n"
             << g;

    if (g.output.func.has_extern_definition()) {
        internal_assert(g.members.size() == 1);
        Func(g_out).compute_root();
        sched.push_schedule(g_out.name(), g.output.stage_num, "compute_root()", {});
        return;
    }

    // Get the estimates for stage bounds
    DimBounds stg_bounds = get_bounds(g.output);
    map<string, Expr> stg_estimates = bounds_to_estimates(stg_bounds);

    Stage f_handle = Stage(Func(g_out));

    // Get a function handle for scheduling the stage
    if (g.output.stage_num > 0) {
        int stage_num = g.output.stage_num;
        f_handle = Func(g_out).update(stage_num - 1);
    } else {
        Func(g_out).compute_root();
        sched.push_schedule(f_handle.name(), g.output.stage_num, "compute_root()", {});
    }

    // Realize tiling and update the dimension estimates
    vector<VarOrRVar> outer_dims;
    vector<VarOrRVar> inner_dims;

    // Get the definition corresponding to the stage
    Definition def = get_stage_definition(g_out, g.output.stage_num);

    // 'dims' will get modified since we are going to apply the schedules
    // (e.g. tiling, reordering, etc.)
    vector<Dim> &dims = def.schedule().dims();

    // Keep track of the rvars
    set<string> rvars;
    for (int d = 0; d < (int)dims.size() - 1; d++) {
        if (dims[d].is_rvar()) {
            rvars.insert(get_base_name(dims[d].var));
        }
    }

    // Reorder the dimensions for better spatial locality (i.e. smallest stride
    // is innermost). If we only have one dimension (excluding __outermost),
    // there is nothing to reorder.
    if (dims.size() > 2) {
        map<string, Expr> strides =
            analyze_spatial_locality(g.output, group_storage_bounds, inlines);
        if (!strides.empty()) {
            reorder_dims(f_handle, g.output.stage_num, def, strides, sched);
        }
    }

    vector<string> dim_vars(dims.size() - 1);
    for (int d = 0; d < (int)dims.size() - 1; d++) {
        dim_vars[d] = get_base_name(dims[d].var);
    }

    // Apply tiling to output of the group
    for (const auto &var : dim_vars) {
        bool is_rvar = (rvars.find(var) != rvars.end());
        VarOrRVar v(var, is_rvar);

        const auto &iter = g.tile_sizes.find(var);
        if ((iter != g.tile_sizes.end()) &&
            get_element(stg_estimates, var).defined() &&
            can_prove(get_element(stg_estimates, var) > iter->second)) {
            const Expr &tile_size = iter->second;
            if (can_prove(tile_size == 1)) {
                outer_dims.push_back(v);
            } else {
                pair<VarOrRVar, VarOrRVar> tile_vars =
                    split_dim(g, f_handle, g.output.stage_num, def, true, v,
                              tile_size, "_i", "_o", stg_estimates, sched);

                inner_dims.push_back(tile_vars.first);
                outer_dims.push_back(tile_vars.second);

                if (is_rvar) {
                    rvars.erase(var);
                    rvars.insert(tile_vars.first.name());
                    rvars.insert(tile_vars.second.name());
                }
            }
        } else {
            inner_dims.push_back(v);
        }
    }

    // Reorder the tile dimensions
    if (!outer_dims.empty()) {

        vector<VarOrRVar> ordering;
        for (const auto &v : inner_dims) {
            ordering.push_back(v);
        }
        for (const auto &v : outer_dims) {
            ordering.push_back(v);
        }

        set<string> var_list;
        string var_order = ordering[0].name();
        for (size_t o = 1; o < ordering.size(); o++) {
            var_order += ", " + ordering[o].name();
            var_list.insert(ordering[o].name());
        }

        if (dims != ordering) {
            f_handle.reorder(ordering);
            sched.push_schedule(f_handle.name(), g.output.stage_num,
                                "reorder(" + var_order + ")", var_list);
        }
    }

    vectorize_stage(g, f_handle, g.output.stage_num, def, g_out, true, t,
                    rvars, stg_estimates, sched);

    // Parallelize definition
    Expr def_par = 1;
    // TODO: Investigate if it is better to pull one large dimension and
    // parallelize over it or to generate nested parallelism.
    //
    // Go from the outer to the innermost loop until sufficient parallelism
    // is achieved. Stop the search once we find a vectorized dimension since
    // it doesn't make any sense to have a parallelized inner loop within a
    // vectorized outer loop.
    bool nested_parallelism = true;
    if (nested_parallelism) {
        int dim_start = dims.size() - 2;
        string seq_var;
        for (int d = dim_start; d >= 0; d--) {
            if (dims[d].for_type == ForType::Vectorized) {
                break;
            }

            string var = get_base_name(dims[d].var);
            bool is_rvar = (rvars.find(var) != rvars.end());
            internal_assert(is_rvar == dims[d].is_rvar());
            VarOrRVar v(var, is_rvar);

            if (is_rvar && !can_parallelize_rvar(var, g_out.name(), def)) {
                if (seq_var.empty()) {
                    seq_var = var;
                }
                continue;
            }

            if (can_prove(def_par >= arch_params.parallelism)) {
                // Enough parallelism to saturate target machine
                break;
            }

            const auto &iter = stg_estimates.find(var);
            if ((iter != stg_estimates.end()) && iter->second.defined()) {
                if (!seq_var.empty()) {
                    VarOrRVar seq(seq_var, (rvars.find(seq_var) != rvars.end()));
                    f_handle.reorder(seq, v);
                    sched.push_schedule(f_handle.name(), g.output.stage_num,
                                        "reorder(" + seq_var + ", " + var + ")",
                                        {seq_var, var});
                }
                f_handle.parallel(v);
                sched.push_schedule(f_handle.name(), g.output.stage_num,
                                    "parallel(" + var + ")", {var});
                def_par = simplify(def_par * iter->second);
            } else {
                break;
            }
        }
    }

    if (can_prove(def_par < arch_params.parallelism)) {
        user_warning << "Insufficient parallelism for " << f_handle.name() << "\n";
    }

    // Find the level at which group members will be computed.
    int tile_inner_index = dims.size() - outer_dims.size() - 1;
    VarOrRVar tile_inner_var(Var::outermost());
    if (!outer_dims.empty()) {
        string var_name = get_base_name(dims[tile_inner_index].var);
        bool is_rvar = (rvars.find(var_name) != rvars.end());
        tile_inner_var = VarOrRVar(var_name, is_rvar);
    }

    for (const FStage &mem : g.members) {
        // Skip member stages that have been inlined or stage that is the
        // output stage of the group
        if ((g.inlined.find(mem.func.name()) != g.inlined.end()) ||
            (mem.func.name() == g_out.name())) {
            continue;
        }

        // Get the definition corresponding to the stage
        Definition mem_def = get_stage_definition(mem.func, mem.stage_num);

        // Get the estimates for the dimensions of the member stage
        map<string, Expr> mem_estimates =
            bounds_to_estimates(get_element(group_loop_bounds, mem));

        set<string> mem_rvars;
        vector<Dim> &mem_dims = mem_def.schedule().dims();
        for (int d = 0; d < (int)mem_dims.size() - 1; d++) {
            if (mem_dims[d].is_rvar()) {
                mem_rvars.insert(get_base_name(mem_dims[d].var));
            }
        }

        // Get a function handle for scheduling the stage
        Stage mem_handle = Stage(Func(mem.func));

        if (mem.stage_num > 0) {
            mem_handle = Func(mem.func).update(mem.stage_num - 1);
        } else {
            if (!outer_dims.empty()) {
                if (tile_inner_var.is_rvar) {
                    Func(mem.func).compute_at(Func(g_out), tile_inner_var.rvar);
                } else {
                    Func(mem.func).compute_at(Func(g_out), tile_inner_var.var);
                }
                string sanitized_g_out = get_sanitized_name(g_out.name());
                sched.push_schedule(mem_handle.name(), mem.stage_num,
                                    "compute_at(" + sanitized_g_out + ", " + tile_inner_var.name() + ")",
                                    {sanitized_g_out, tile_inner_var.name()});
            } else {
                user_warning << "Degenerate tiling. No dimensions are tiled"
                             << "\n";
                user_warning << "Computing \"" << mem.func.name() << "\" at root"
                             << "\n";
                Func(mem.func).compute_root();
                sched.push_schedule(mem_handle.name(), mem.stage_num, "compute_root()", {});
            }
        }

        // Reorder the dimensions for better spatial locality. If we only have
        // one dimension (excluding __outermost), there is nothing to reorder.
        if (dims.size() > 2) {
            map<string, Expr> mem_strides =
                analyze_spatial_locality(mem, group_storage_bounds, inlines);
            if (!mem_strides.empty()) {
                reorder_dims(mem_handle, mem.stage_num, mem_def, mem_strides, sched);
            }
        }

        vectorize_stage(g, mem_handle, mem.stage_num, mem_def, mem.func, false,
                        t, mem_rvars, mem_estimates, sched);
    }
}

void Partitioner::generate_cpu_schedule(const Target &t, AutoSchedule &sched) {
    // Grab the group bounds early as they rely on the dimensions of the group
    // outputs which will be altered by modifying schedules.
    map<FStage, map<FStage, DimBounds>> loop_bounds = group_loop_bounds();
    map<FStage, map<string, Box>> storage_bounds = group_storage_bounds();

    set<string> inlines;
    // Mark all functions that are inlined.
    for (const pair<const FStage, Group> &g : groups) {
        for (const string &inline_func : g.second.inlined) {
            inlines.insert(inline_func);
        }
    }

    // TODO: Inlining functions with update definitions has different
    // behavior than pure functions. They may need to be computed above
    // the innermost vector loop to avoid complications with varying
    // extents across different vector lanes.
    //
    // Since the default schedule is compute inline, we don't need to
    // explicitly call compute_inline() on the function.

    // Realize schedule for each group in the pipeline.
    for (const auto &g : groups) {
        generate_group_cpu_schedule(g.second, t, get_element(loop_bounds, g.first),
                                    get_element(storage_bounds, g.first), inlines, sched);
    }
}

Expr Partitioner::find_max_access_stride(const Scope<> &vars,
                                         const string &func_acc,
                                         const vector<Expr> &acc_exprs,
                                         const Box &buffer_bounds) {
    size_t num_storage_dims = 0;
    Expr bytes_per_ele = make_zero(Int(64));

    // Get the number of dimensions of the allocated storage and the
    // number of bytes required to store a single value of func_acc.
    const auto &iter = dep_analysis.env.find(func_acc);
    if (iter != dep_analysis.env.end()) {
        const Function &f = iter->second;
        for (const auto &e : f.values()) {
            bytes_per_ele += e.type().bytes();
        }
        num_storage_dims = f.schedule().storage_dims().size();
    } else {
        bytes_per_ele = get_element(costs.inputs, func_acc).bytes();
        num_storage_dims = buffer_bounds.size();
    }

    Expr curr_stride = bytes_per_ele;
    Expr stride = make_zero(Int(64));

    internal_assert(num_storage_dims <= acc_exprs.size());
    for (size_t sdim = 0; sdim < num_storage_dims; sdim++) {
        // Check if the access expression depends on any of the loop variables
        // in 'vars'. Expressions that do not involve the variable have stride 0.
        if (expr_uses_vars(acc_exprs[sdim], vars)) {
            stride = max(stride, curr_stride);
        }

        const Interval &dim_range = buffer_bounds[sdim];
        Expr dim_extent = get_extent(dim_range);
        if (!dim_extent.defined()) {
            return Expr();
        }
        curr_stride *= dim_extent;
    }

    return simplify(stride);
}

map<string, Expr>
Partitioner::analyze_spatial_locality(const FStage &stg,
                                      const map<string, Box> &allocation_bounds,
                                      const set<string> &inlines) {
    internal_assert(!stg.func.has_extern_definition());
    // Handle inlining. When a function is inlined into another, the stride of
    // the accesses should be computed on the expression post inlining.
    // For example:
    // f(x, y) = ...;
    // g(x, y) = f(y, x); // transpose
    // h(x, y) = g(y, x); // transpose
    //
    // If both g and f are inlined into h, then the resulting expression for h
    // will look like:
    // h(x, y) = f(x, y);
    //
    // Computing the stride of a loop over x in the function h will be incorrect
    // if inlining is not taken into account.

    // Get all the allocations accessed in the definition corresponding to 'stg'.
    FindAllCalls find;
    Definition def = get_stage_definition(stg.func, stg.stage_num);
    // Perform inlining on the all the values and the args in the stage.
    for (auto &val : def.values()) {
        val = perform_inline(val, dep_analysis.env, inlines, dep_analysis.order);
    }
    for (auto &arg : def.args()) {
        arg = perform_inline(arg, dep_analysis.env, inlines, dep_analysis.order);
    }
    def.accept(&find);

    // Arguments on the left hand side might themselves involve accesses
    // to allocations and thus need to be accounted for when computing the
    // strides along each dimension.
    vector<pair<string, vector<Expr>>> &call_args = find.call_args;
    // Account for the spatial locality of the store. Add the access on the
    // left hand side to call_args.
    call_args.emplace_back(stg.func.name(), def.args());

    // Map for holding the strides across each dimension
    map<string, Expr> var_strides;
    const vector<Dim> &dims = def.schedule().dims();

    for (int d = 0; d < (int)dims.size() - 1; d++) {
        // Get all the variables involving the dimension in the definition.
        FindVarsUsingVar dep_vars(dims[d].var);
        def.accept(&dep_vars);

        // Accumulate the stride of each access to a loop dimension.
        Expr total_stride = 0;
        for (const pair<string, vector<Expr>> &call : call_args) {
            Box call_alloc_reg;
            const auto &iter = allocation_bounds.find(call.first);
            if (iter != allocation_bounds.end()) {
                call_alloc_reg = iter->second;
            } else {
                call_alloc_reg = get_element(pipeline_bounds, call.first);
            }
            Expr current_stride = find_max_access_stride(dep_vars.vars, call.first,
                                                         call.second, call_alloc_reg);
            if (!current_stride.defined()) {
                return map<string, Expr>();
            }
            total_stride += current_stride;
        }
        var_strides.emplace(dims[d].var, simplify(total_stride));
    }

    return var_strides;
}

// Verify that function 'f' does not have partially specified schedules/bounds.
// The current auto scheduler cannots handle such cases.
void validate_no_partial_schedules(const Function &f, bool is_output) {
    if (f.has_extern_definition()) {
        return;
    }

    // Verify no compute_root or bounds are specified
    user_assert(f.schedule().compute_level().is_inlined())
        << "AutoSchedule: cannot auto-schedule function \"" << f.name()
        << "\" since it is scheduled to be computed at root\n";

    user_assert(is_output || f.schedule().bounds().empty())
        << "AutoSchedule: cannot auto-schedule function \"" << f.name()
        << "\" since it has partially specified bounds\n";

    int num_stages = f.updates().size() + 1;
    for (int stage = 0; stage < num_stages; ++stage) {
        const Definition &def = get_stage_definition(f, stage);
        const StageSchedule &schedule = def.schedule();

        // Verify no splits are specified
        user_assert(schedule.splits().empty())
            << "AutoSchedule: cannot auto-schedule function \"" << f.name()
            << "\" since it has partially specified schedules at stage " << stage << "\n";

        // Verify that none of the dimensions are scheduled to be parallelized or
        // vectorized, or unrolled.
        for (const auto &d : schedule.dims()) {
            user_assert(d.for_type == ForType::Serial)
                << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                << "\" since stage " << stage << " is not serial at dim " << d.var << "\n";
        }

        if (stage == 0) {
            // Since we can only specialize on a Func, we only need to check for no
            // specializations for the initial stage.
            user_assert(def.specializations().empty())
                << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                << "\" since it has specializations\n";

            // Verify that there is no loop reordering on the initial definition
            // (i.e. the Vars in the dim list should be in the same order as
            // the args in the LHS of the definition).
            internal_assert(schedule.dims().size() - 1 == def.args().size());
            for (size_t i = 0; i < def.args().size(); ++i) {
                const Variable *arg = def.args()[i].as<Variable>();
                internal_assert(arg);
                user_assert(arg->name == schedule.dims()[i].var)
                    << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                    << "\" since dim \"" << arg->name << "\" at stage " << stage
                    << " has been reordered\n";
            }
        } else {
            // Verify that there is no loop reordering on the update definition
            // (i.e. the Vars in the dim list should be in the same order as
            // the args in the LHS of the definition, the RVars in the dim list
            // should be in the same order as the RVars in the rvar list, and
            // all RVars should come before all Vars).

            const vector<Dim> &dims = schedule.dims();
            const vector<ReductionVariable> &rvars = schedule.rvars();
            const vector<Expr> &args = f.definition().args();
            internal_assert(dims.size() - 1 >= rvars.size());

            for (size_t i = 0; i < rvars.size(); ++i) {
                const Dim &d = dims[i];
                user_assert(d.is_rvar() && (d.var == rvars[i].var))
                    << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                    << "\" since dim \"" << i << "\" at stage " << stage
                    << " has been reordered\n";
            }

            internal_assert(dims.size() - rvars.size() - 1 <= args.size());
            int last_index = -1;
            for (int i = rvars.size(); i < (int)dims.size() - 1; ++i) {
                const Dim &d = dims[i];
                user_assert(!d.is_rvar())
                    << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                    << "\" since dim \"" << i << "\" at stage " << stage
                    << " has been reordered\n";

                const auto &iter =
                    std::find_if(args.begin(), args.end(),
                                 [&d](const Expr &arg) {
                                     const Variable *v = arg.as<Variable>();
                                     return (d.var == v->name);
                                 });
                internal_assert(iter != args.end());
                int current_index = iter - args.begin();
                user_assert(current_index > last_index)
                    << "AutoSchedule: cannot auto-schedule function \"" << f.name()
                    << "\" since dim \"" << i << "\" at stage " << stage
                    << " has been reordered\n";
                last_index = current_index;
            }
        }
    }
}

// Return true if 'f' is used by some extern Func.
bool used_by_extern_func(const map<string, Function> &env, const Function &f) {
    for (const auto &iter : env) {
        for (const ExternFuncArgument &arg : iter.second.extern_arguments()) {
            if (arg.is_func()) {
                if (Function(arg.func).name() == f.name()) {
                    return true;
                }
            }
        }
    }
    return false;
}

// If the bounds of a Func are undefined, then we should just inline the Func
// as long as it is legal to inline or used by some extern Func.
set<string> get_unbounded_functions(const map<string, Box> &pipeline_bounds,
                                    const map<string, Function> &env) {
    set<string> unbounded;
    for (const auto &iter : env) {
        if (!pipeline_bounds.count(iter.first)) {
            debug(5) << "...Skip checking function \"" << iter.first
                     << "\" since it does not have pipeline bounds\n";
            continue;
        }
        const Function &f = iter.second;
        if (!f.can_be_inlined() || used_by_extern_func(env, f)) {
            continue;
        }
        const Box &bound = get_element(pipeline_bounds, iter.first);
        if (is_box_unbounded(bound)) {
            unbounded.insert(iter.first);
        }
    }
    return unbounded;
}

bool inline_unbounded(const vector<Function> &outputs,
                      const vector<string> &order,
                      const map<string, Function> &env,
                      const set<string> &unbounded) {
    bool inlined = false;
    // The very last few functions in 'order' are the last to be realized in the
    // pipeline (the final producers) so there is no point in checking it.
    for (int i = 0; i < (int)order.size() - (int)outputs.size(); ++i) {
        const Function &f1 = env.at(order[i]);
        if (!unbounded.count(f1.name())) {
            continue;
        }
        inlined = true;
        debug(4) << "Function \"" << order[i] << "\" is unbounded\n";
        for (int j = i + 1; j < (int)order.size(); ++j) {
            internal_assert(order[i] != order[j]);
            const Function &f2 = env.at(order[j]);
            debug(5) << "Inline unbounded function \"" << f1.name()
                     << "\" inside \"" << f2.name() << "\"\n";
            inline_function(f2, f1);
        }
    }
    return inlined;
}

}  // anonymous namespace

// Generate schedules for all functions in the pipeline required to compute the
// outputs. This applies the schedules and returns a string representation of
// the schedules. The target architecture is specified by 'target'.
string generate_schedules(const vector<Function> &outputs, const Target &target,
                          const MachineParams &arch_params) {
    // Make an environment map which is used throughout the auto scheduling process.
    map<string, Function> env;
    for (const Function &f : outputs) {
        map<string, Function> more_funcs = find_transitive_calls(f);
        env.insert(more_funcs.begin(), more_funcs.end());
    }

    // Finalize all the LoopLevels
    for (auto &iter : env) {
        iter.second.lock_loop_levels();
    }

    // Compute the topological order, before any trivial inlining (i.e. before
    // we remove any functions from 'env'). We need the full topological
    // order to pass to get_func() when generating the string representation
    // of the schedule.
    debug(2) << "Computing topological order...\n";
    vector<string> top_order = topological_order(outputs, env);

    // Validate that none of the functions in the pipeline have partial schedules.
    debug(2) << "Validating no partial schedules...\n";
    for (const auto &iter : env) {
        bool is_output = false;
        for (const auto &o : outputs) {
            is_output |= iter.second.same_as(o);
        }
        validate_no_partial_schedules(iter.second, is_output);
    }

    // The auto scheduling algorithm requires estimates on the outputs of the
    // pipeline to get quantitative estimates of costs for computing functions
    // in the pipeline.
    debug(2) << "Checking estimates on outputs...\n";
    check_estimates_on_outputs(outputs);

    // Run a pre-pass that inline all trivial Funcs (i.e. if the cost of
    // computing a Func is about the same as calling that Func, we should
    // just inline it).
    debug(2) << "Inlining all trivial functions...\n";
    if (inline_all_trivial_functions(outputs, top_order, env)) {
        // If any of the Funcs is inlined, we need to recompute 'env', since some
        // of the Funcs are no longer used and need to be removed from 'env'.
        //
        // Instead of recomputing 'env', we could also remove the inlined Func
        // within inline_all_trivial_functions(); however, it is a bit tricky
        // to do when dealing with inlined tuple. Consider the following case:
        //   f(x, y) = x + y;
        //   g(x, y) = {x, f(x, y)};
        //   h(x, y) = g(x, y)[0];
        // When 'g' is inlined in 'h', no one uses 'f' anymore and it can
        // be removed from 'env'. However, to know this, we need to trace
        // all the function calls within the pipeline. Thus, we might as well
        // recompute the 'env' from scratch.
        env.clear();
        for (const Function &f : outputs) {
            map<string, Function> more_funcs = find_transitive_calls(f);
            env.insert(more_funcs.begin(), more_funcs.end());
        }
    }

    // Compute the realization order of the functions within the pipeline.
    vector<string> order = realization_order(outputs, env).first;

    // Run a pre-pass that inline all Funcs which values are accessed by
    // another single Func in element-wise manner. We need to do this
    // repeatedly since some inlining decisions may enable further inlining
    // that previously not possible. Consider the following case:
    //   f1(x) = x;
    //   f2(x) = f1(x) + 2;
    //   f3(x) = f1(x) * 2;
    //   f4(x) = f2(x) + f3(x);
    //   f5(x) = f4(x) + 3;
    // In the first iteration, we cannot inline 'f1' since it is used by two
    // functions: 'f2' and 'f3'. If 'f2' and 'f4' get inlined and 'f3' is only
    // used by 'f4', then 'f1' can now also be inlined.
    debug(2) << "Inlining all element-wise functions...\n";
    while (inline_all_element_wise_functions(outputs, order, env)) {
        // We need to recompute 'env' for the same reason as with
        // inline_all_trivial_functions
        env.clear();
        for (const Function &f : outputs) {
            map<string, Function> more_funcs = find_transitive_calls(f);
            env.insert(more_funcs.begin(), more_funcs.end());
        }
        order = realization_order(outputs, env).first;
    }

    // Compute the bounds of function values which are used for dependence analysis.
    debug(2) << "Computing function value bounds...\n";
    FuncValueBounds func_val_bounds = compute_function_value_bounds(order, env);

    // Initialize the cost model.
    // Compute the expression costs for each function in the pipeline.
    debug(2) << "Initializing region costs...\n";
    RegionCosts costs(env, order);
    if (debug::debug_level() >= 3) {
        costs.disp_func_costs();
    }

    debug(2) << "Initializing dependence analysis...\n";
    DependenceAnalysis dep_analysis(env, order, func_val_bounds);

    // Compute bounds of all functions in the pipeline given estimates on
    // outputs. Also report functions which bounds could not be inferred.
    debug(2) << "Computing pipeline bounds...\n";
    map<string, Box> pipeline_bounds =
        get_pipeline_bounds(dep_analysis, outputs, &costs.input_estimates);

    // Determine all unbounded functions that are not extern Func or
    // used by some extern Funcs.
    debug(2) << "Determining all unbounded functions...\n";
    set<string> unbounded = get_unbounded_functions(pipeline_bounds, env);
    if (!unbounded.empty()) {
        // If some functions are unbounded, we should inline those directly.
        // Also, we need to recompute 'env' and re-initialize 'costs' and
        // 'dep_analysis'
        debug(2) << "Inlining all unbounded functions...\n";
        internal_assert(inline_unbounded(outputs, order, env, unbounded));

        env.clear();
        for (const Function &f : outputs) {
            map<string, Function> more_funcs = find_transitive_calls(f);
            env.insert(more_funcs.begin(), more_funcs.end());
        }
        order = realization_order(outputs, env).first;

        debug(2) << "Re-computing function value bounds...\n";
        func_val_bounds = compute_function_value_bounds(order, env);
        debug(2) << "Re-initializing region costs...\n";
        RegionCosts costs(env, order);
        debug(2) << "Re-initializing dependence analysis...\n";
        dep_analysis = DependenceAnalysis(env, order, func_val_bounds);
        debug(2) << "Re-computing pipeline bounds...\n";
        pipeline_bounds = get_pipeline_bounds(dep_analysis, outputs, &costs.input_estimates);
    }

    debug(2) << "Initializing partitioner...\n";
    Partitioner part(pipeline_bounds, arch_params, outputs, dep_analysis, costs);

    // Compute and display reuse
    /* TODO: Use the reuse estimates to reorder loops
    for (const auto &f : env) {
        FindAllCalls find;
        f.second.accept(&find);
        int num_stages = f.second.updates().size() + 1;
        for (int s = 0; s < num_stages; s++) {
            FStage curr_s(f.second, s);
            map<string, Expr> reuse = part.evaluate_reuse(curr_s, find.funcs_called);
            debug(0) << curr_s << "\n";
            for (const auto &dir : reuse) {
                debug(0) << dir.first << " " << dir.second << ",";
            }
            debug(0) << "\n";
        }
    }*/

    // Display the current pipeline graph.
    // TODO: Output the graph in dot format.
    if (debug::debug_level() >= 3) {
        part.disp_pipeline_graph();
        part.disp_pipeline_bounds();
    }

    debug(2) << "Partitioner initializing groups...\n";
    part.initialize_groups();
    if (debug::debug_level() >= 3) {
        part.disp_pipeline_costs();
    }

    debug(2) << "Partitioner computing inline group...\n";
    part.group(Partitioner::Level::Inline);
    if (debug::debug_level() >= 3) {
        part.disp_grouping();
    }

    debug(2) << "Partitioner computing fast-mem group...\n";
    part.grouping_cache.clear();
    part.group(Partitioner::Level::FastMem);
    if (debug::debug_level() >= 3) {
        part.disp_pipeline_costs();
        part.disp_grouping();
        part.disp_pipeline_graph();
    }

    debug(2) << "Initializing AutoSchedule...\n";
    AutoSchedule sched(env, top_order);
    debug(2) << "Generating CPU schedule...\n";
    part.generate_cpu_schedule(target, sched);

    std::ostringstream oss;
    oss << sched;
    string sched_string = oss.str();

    debug(3) << "\n\n*******************************\nSchedule:\n"
             << "*******************************\n"
             << sched_string << "\n\n";

    // TODO: Unify both inlining and grouping for fast mem
    // TODO: GPU scheduling
    // TODO: Hierarchical tiling

    return sched_string;
}

struct Mullapudi2016 {
    void operator()(const Pipeline &pipeline, const Target &target, const MachineParams &arch_params, AutoSchedulerResults *outputs) {
        AutoSchedulerResults results;
        results.target = target;
        results.machine_params_string = arch_params.to_string();

        user_assert(target.arch == Target::X86 || target.arch == Target::ARM ||
                    target.arch == Target::POWERPC || target.arch == Target::MIPS)
            << "The Mullapudi2016 autoscheduler is not supported for the target: " << target.to_string();
        results.scheduler_name = "Mullapudi2016";
        std::vector<Function> pipeline_outputs;
        for (const Func &f : pipeline.outputs()) {
            pipeline_outputs.push_back(f.function());
        }
        results.schedule_source = generate_schedules(pipeline_outputs, target, arch_params);
        // this autoscheduler has no featurization

        *outputs = results;
    }
};

REGISTER_AUTOSCHEDULER(Mullapudi2016)

}  // namespace Internal

}  // namespace Halide
Browse the archive

https://github.com/halide/Halide