// Resnet50Generator.cpp (from https://github.com/halide/Halide)
#include "Halide.h"
#include <tuple>
#include <unordered_map>
namespace {
struct Tensor {
Halide::Func f;
std::vector<int> shape;
std::string name;
};
struct WeightShape {
int c; // output channels
int w;
int h;
int pad;
int stride;
};
// returns index of found value in array or -1 if not in array
int find_index(int value, const std::vector<int> &vec) {
auto it = std::find(vec.begin(), vec.end(), value);
if (it == vec.end())
return -1;
return static_cast<int>(std::distance(vec.begin(), it));
}
class Resnet50Generator : public Halide::Generator<Resnet50Generator> {
public:
Input<Buffer<float>> input{"input", 3};
/** parameter values for scaling layers **/
Input<Buffer<float>> conv1_gamma{"conv1_gamma", 1};
Input<Buffer<float>[4]> br1_gamma{"br1_gamma", 1};
Input<Buffer<float>[16]> br2a_gamma{"br2a_gamma", 1};
Input<Buffer<float>[16]> br2b_gamma{"br2b_gamma", 1};
Input<Buffer<float>[16]> br2c_gamma{"br2c_gamma", 1};
Input<Buffer<float>> conv1_beta{"conv1_beta", 1};
Input<Buffer<float>[4]> br1_beta{"br1_beta", 1};
Input<Buffer<float>[16]> br2a_beta{"br2a_beta", 1};
Input<Buffer<float>[16]> br2b_beta{"br2b_beta", 1};
Input<Buffer<float>[16]> br2c_beta{"br2c_beta", 1};
Input<Buffer<float>> conv1_mu{"conv1_mu", 1};
Input<Buffer<float>[4]> br1_mu{"br1_mu", 1};
Input<Buffer<float>[16]> br2a_mu{"br2a_mu", 1};
Input<Buffer<float>[16]> br2b_mu{"br2b_mu", 1};
Input<Buffer<float>[16]> br2c_mu{"br2c_mu", 1};
Input<Buffer<float>> conv1_sig{"conv1_sig", 1};
Input<Buffer<float>[4]> br1_sig{"br1_sig", 1};
Input<Buffer<float>[16]> br2a_sig{"br2a_sig", 1};
Input<Buffer<float>[16]> br2b_sig{"br2b_sig", 1};
Input<Buffer<float>[16]> br2c_sig{"br2c_sig", 1};
/** weights and biases for convolutions **/
Input<Buffer<float>> conv1_weights{"conv1_weights", 4};
Input<Buffer<float>[4]> br1_conv_weights{"br1_conv_weights", 4};
Input<Buffer<float>[16]> br2a_conv_weights{"br2a_conv_weights", 4};
Input<Buffer<float>[16]> br2b_conv_weights{"br2b_conv_weights", 4};
Input<Buffer<float>[16]> br2c_conv_weights{"br2c_conv_weights", 4};
Input<Buffer<float>> fc1000_weights{"fc1000_weights", 2};
Input<Buffer<float>> fc1000_bias{"fc1000_bias", 1};
Output<Buffer<float>> final_output{"final_output", 1};
/** list out the shapes of each layer's weights **/
// weight shapes: out channels, kernel_w, kernel_h, pad, stride. In channels are inferred from the input tensor shape.
const WeightShape conv1_ws = {64, 7, 7, 3, 2};
const WeightShape pool1_ws = {64, 3, 3, 1, 2};
const WeightShape pool5_ws = {2048, 7, 7, 0, 1};
const WeightShape fc1000_ws = {1000, 1, 1, 0, 1}; // 1x1 conv with 2048 input channels and 1000 output channels
// res2a, res2b, and res2c all have the same shapes
const WeightShape res2x_br2a_ws = {64, 1, 1, 0, 1};
const WeightShape res2a_br2b_ws = {64, 3, 3, 1, 1};
const WeightShape res2x_br2b_ws = {64, 3, 3, 1, 1};
const WeightShape res2x_br2c_ws = {256, 1, 1, 0, 1};
const WeightShape res2a_br1_ws = {256, 1, 1, 0, 1};
// res3x shapes are shared by most units; the first unit (res3a) downsamples with stride 2
const WeightShape res3x_br2a_ws = {128, 1, 1, 0, 1};
const WeightShape res3a_br2b_ws = {128, 3, 3, 1, 2};
const WeightShape res3x_br2b_ws = {128, 3, 3, 1, 1};
const WeightShape res3x_br2c_ws = {512, 1, 1, 0, 1};
const WeightShape res3a_br1_ws = {512, 1, 1, 0, 2};
const WeightShape res4x_br2a_ws = {256, 1, 1, 0, 1};
const WeightShape res4a_br2b_ws = {256, 3, 3, 1, 2};
const WeightShape res4x_br2b_ws = {256, 3, 3, 1, 1};
const WeightShape res4x_br2c_ws = {1024, 1, 1, 0, 1};
const WeightShape res4a_br1_ws = {1024, 1, 1, 0, 2};
const WeightShape res5x_br2a_ws = {512, 1, 1, 0, 1};
const WeightShape res5a_br2b_ws = {512, 3, 3, 1, 2};
const WeightShape res5x_br2b_ws = {512, 3, 3, 1, 1};
const WeightShape res5x_br2c_ws = {2048, 1, 1, 0, 1};
const WeightShape res5a_br1_ws = {2048, 1, 1, 0, 2};
const WeightShape br1_ws[4] = {res2a_br1_ws, res3a_br1_ws, res4a_br1_ws, res5a_br1_ws};
const WeightShape br2a_ws[16] = {res2x_br2a_ws, res2x_br2a_ws, res2x_br2a_ws,
res3x_br2a_ws, res3x_br2a_ws, res3x_br2a_ws, res3x_br2a_ws,
res4x_br2a_ws, res4x_br2a_ws, res4x_br2a_ws, res4x_br2a_ws, res4x_br2a_ws, res4x_br2a_ws,
res5x_br2a_ws, res5x_br2a_ws, res5x_br2a_ws};
const WeightShape br2b_ws[16] = {res2a_br2b_ws, res2x_br2b_ws, res2x_br2b_ws,
res3a_br2b_ws, res3x_br2b_ws, res3x_br2b_ws, res3x_br2b_ws,
res4a_br2b_ws, res4x_br2b_ws, res4x_br2b_ws, res4x_br2b_ws, res4x_br2b_ws, res4x_br2b_ws,
res5a_br2b_ws, res5x_br2b_ws, res5x_br2b_ws};
const WeightShape br2c_ws[16] = {res2x_br2c_ws, res2x_br2c_ws, res2x_br2c_ws,
res3x_br2c_ws, res3x_br2c_ws, res3x_br2c_ws, res3x_br2c_ws,
res4x_br2c_ws, res4x_br2c_ws, res4x_br2c_ws, res4x_br2c_ws, res4x_br2c_ws, res4x_br2c_ws,
res5x_br2c_ws, res5x_br2c_ws, res5x_br2c_ws};
Var c{"c"}, i{"i"}, j{"j"};
void generate() {
// Algorithm
/** Declare arrays of layer Tensors; one entry per residual unit **/
Tensor br1_conv[4];
Tensor br1_norm[4];
Tensor br1_scale[4];
Tensor br2a_conv[16];
Tensor br2a_norm[16];
Tensor br2a_scaled[16];
Tensor br2a_relu[16];
Tensor br2b_conv[16];
Tensor br2b_norm[16];
Tensor br2b_scaled[16];
Tensor br2b_relu[16];
Tensor br2c_conv[16];
Tensor br2c_norm[16];
Tensor br2c_scaled[16];
Tensor resunit_sum[16];
Tensor resunit_relu[16];
Tensor pool5;
Tensor fc1000;
Tensor softmax;
// these tensors are different depending on the block and must be conditionally assigned.
Tensor input_t;
std::vector<int> input_shape;
Tensor br2a_input;
Tensor resunit_sum_input;
// used only for block_id == 0
Tensor conv1, norm1, scaled1, relu1, pool1;
std::vector<int> branch1_indices{0, 3, 7, 13};
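// ResNet-50 has 16 residual units grouped into stages of 3, 4, 6, and 3.
// The first unit of each stage (indices 0, 3, 7, 13) has a projection
// shortcut (branch1); the remaining units use identity shortcuts.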
/** build all 16 residual units; block 0 is fed by the (stem) conv1 section **/
for (int block_id = 0; block_id < 16; ++block_id) {
if (block_id == 0) {
input_shape = {3, 224, 224};
input_t.f = input;
input_t.shape = input_shape;
conv1 = conv2D(input_t, conv1_ws, conv1_weights, "conv1");
norm1 = norm_layer(conv1, conv1_mu, conv1_sig, "norm1");
scaled1 = scale_layer(norm1, conv1_gamma, conv1_beta, "scale1");
relu1 = relu_layer(scaled1, "relu1");
pool1 = max_pool_layer(relu1, pool1_ws, "pool1");
br2a_input = pool1;
} else {
br2a_input = resunit_relu[block_id - 1];
}
// build branch1 if this section has branch1
int br1_i = find_index(block_id, branch1_indices);
if (br1_i >= 0) {
br1_conv[br1_i] = conv2D(br2a_input, br1_ws[br1_i], br1_conv_weights[br1_i], "br1_conv");
br1_norm[br1_i] = norm_layer(br1_conv[br1_i], br1_mu[br1_i], br1_sig[br1_i], "br1_norm");
br1_scale[br1_i] = scale_layer(br1_norm[br1_i], br1_gamma[br1_i], br1_beta[br1_i], "br1_scale");
resunit_sum_input = br1_scale[br1_i];
} else {
resunit_sum_input = resunit_relu[block_id - 1];
}
// branch2a
auto weights = br2a_conv_weights[block_id];
br2a_conv[block_id] = conv2D(br2a_input, br2a_ws[block_id], weights, "block" + std::to_string(block_id) + "_2a_conv");
br2a_norm[block_id] = norm_layer(br2a_conv[block_id], br2a_mu[block_id], br2a_sig[block_id], "block" + std::to_string(block_id) + "_2a_norm");
br2a_scaled[block_id] = scale_layer(br2a_norm[block_id], br2a_gamma[block_id], br2a_beta[block_id], "block" + std::to_string(block_id) + "_2a_scale");
br2a_relu[block_id] = relu_layer(br2a_scaled[block_id], "block" + std::to_string(block_id) + "_2a_relu");
// branch 2b
weights = br2b_conv_weights[block_id];
br2b_conv[block_id] = conv2D(br2a_relu[block_id], br2b_ws[block_id], weights, "block" + std::to_string(block_id) + "_2b_conv");
br2b_norm[block_id] = norm_layer(br2b_conv[block_id], br2b_mu[block_id], br2b_sig[block_id], "block" + std::to_string(block_id) + "_2b_norm");
br2b_scaled[block_id] = scale_layer(br2b_norm[block_id], br2b_gamma[block_id], br2b_beta[block_id], "block" + std::to_string(block_id) + "_2b_scale");
br2b_relu[block_id] = relu_layer(br2b_scaled[block_id], "block" + std::to_string(block_id) + "_2b_relu");
// branch 2c
weights = br2c_conv_weights[block_id];
br2c_conv[block_id] = conv2D(br2b_relu[block_id], br2c_ws[block_id], weights, "block" + std::to_string(block_id) + "_2c_conv");
br2c_norm[block_id] = norm_layer(br2c_conv[block_id], br2c_mu[block_id], br2c_sig[block_id], "block" + std::to_string(block_id) + "_2c_norm");
br2c_scaled[block_id] = scale_layer(br2c_norm[block_id], br2c_gamma[block_id], br2c_beta[block_id], "block" + std::to_string(block_id) + "_2c_scale");
// create residual unit
resunit_sum[block_id] = sum_layer(resunit_sum_input, br2c_scaled[block_id], "block" + std::to_string(block_id) + "_res_sum");
resunit_relu[block_id] = relu_layer(resunit_sum[block_id], "block" + std::to_string(block_id) + "_res_relu");
// create final 3 layers
if (block_id == 15) {
pool5 = avg_pool_layer(resunit_relu[block_id], pool5_ws, "pool5");
fc1000 = fc_layer(pool5, fc1000_ws, fc1000_weights, fc1000_bias, "fc");
softmax = softmax_layer(fc1000, 1000, "softmax");
final_output = softmax.f;
}
}
// TODO: Actually schedule this.
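// Interim schedule: materialize each stage at root, vectorizing across
// channels and parallelizing across image rows for the relu stages below.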
conv1.f.compute_root();
scaled1.f.compute_root();
relu1.f.compute_root();
pool1.f.compute_root();
for (int b = 0; b < 16; b++) {
br2a_relu[b].f.compute_root().vectorize(c, 8).parallel(j);
br2b_relu[b].f.compute_root().vectorize(c, 8).parallel(j);
resunit_relu[b].f.compute_root().vectorize(c, 8).parallel(j);
}
pool5.f.compute_root();
fc1000.f.compute_root();
softmax.f.compute_root();
}
private:
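// Zero-pads dimensions 1 and 2 (width and height). Dimension 0 (channels)
// is left unconstrained, so only the spatial extents get a boundary condition.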
Func pad(Func f, Expr width, Expr height) {
Halide::Region bounds(f.dimensions());
bounds[1].min = 0;
bounds[1].extent = width;
bounds[2].min = 0;
bounds[2].extent = height;
return Halide::BoundaryConditions::constant_exterior(f, 0.0f, bounds);
}
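// Output spatial extent of a convolution or pool:
//   out = ceil((in + 2 * pad - kernel + 1) / stride)
// computed below in integer arithmetic as (in + 2 * pad - kernel + stride) / stride.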
std::vector<int> compute_shape(const Tensor &in, const WeightShape &params) {
int w = (2 * params.pad + in.shape[1] - params.w + params.stride) / params.stride;
int h = (2 * params.pad + in.shape[2] - params.h + params.stride) / params.stride;
int c = params.c;
return {c, w, h};
}
Tensor conv2D(const Tensor &input, const WeightShape &weight_shape, const Func &weights, const std::string &name) {
int p = weight_shape.pad;
Func padded;
// pad input
if (p) {
padded = pad(input.f, input.shape[1], input.shape[2]);
} else {
padded = input.f;
}
RDom r(0, input.shape[0], 0, weight_shape.w, 0, weight_shape.h);
Func conv;
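// Accumulate over input channels (r.x) and the kernel window (r.y, r.z);
// subtracting p maps padded coordinates back to the unpadded input frame.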
conv(c, i, j) += weights(c, r.y, r.z, r.x) * padded(r.x, weight_shape.stride * i + r.y - p, weight_shape.stride * j + r.z - p);
Tensor output;
output.f = conv;
output.name = name;
output.shape = compute_shape(input, weight_shape);
return output;
}
// Assumes the input is 3D (c, w, h) with w == h == 1.
Tensor fc_layer(const Tensor &input, const WeightShape &weight_shape, const Func &weights, const Func &bias, const std::string &name) {
RDom r(0, input.shape[0]);
Func fc;
fc(c) = bias(c);
fc(c) += weights(c, r.x) * input.f(r.x, 0, 0);
Tensor output;
output.f = fc;
output.name = name;
output.shape = compute_shape(input, weight_shape);
return output;
}
Tensor relu_layer(const Tensor &input, const std::string &name) {
Func relu;
relu(c, i, j) = max(0.0f, input.f(c, i, j));
Tensor output;
output.f = relu;
output.shape = input.shape;
output.name = name;
return output;
}
Tensor max_pool_layer(const Tensor &input, const WeightShape &weight_shape, const std::string &name) {
int p = weight_shape.pad;
Func padded;
if (p) {
padded = pad(input.f, input.shape[1], input.shape[2]);
} else {
padded = input.f;
}
RDom r(0, weight_shape.w, 0, weight_shape.h);
Func pool;
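// maximum() is Halide's inline reduction; it takes the max over the RDom window.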
pool(c, i, j) = maximum(padded(c, weight_shape.stride * i + r.x - p, weight_shape.stride * j + r.y - p));
Tensor output;
output.f = pool;
output.name = name;
output.shape = compute_shape(input, weight_shape);
return output;
}
Tensor avg_pool_layer(const Tensor &input, const WeightShape &weight_shape, const std::string &name) {
int p = weight_shape.pad;
Func padded;
if (p) {
padded = pad(input.f, input.shape[1], input.shape[2]);
} else {
padded = input.f;
}
RDom r(0, weight_shape.w, 0, weight_shape.h);
Func pool;
float n = 1.0f / (weight_shape.w * weight_shape.h);
pool(c, i, j) += n * padded(c, weight_shape.stride * i + r.x - p, weight_shape.stride * j + r.y - p);
Tensor output;
output.f = pool;
output.name = name;
output.shape = compute_shape(input, weight_shape);
return output;
}
Tensor norm_layer(const Tensor &input, const Func &mu, const Func &sigma, const std::string &name) {
Func normed;
// Batch-norm style normalization; the 1e-5f epsilon guards against division by zero.
normed(c, i, j) = (input.f(c, i, j) - mu(c)) / sqrt(sigma(c) + 1e-5f);
Tensor output;
output.f = normed;
output.shape = input.shape;
output.name = name;
return output;
}
Tensor scale_layer(const Tensor &input, const Func &gamma, const Func &beta, const std::string &name) {
Func scaled;
scaled(c, i, j) = input.f(c, i, j) * gamma(c) + beta(c);
Tensor output;
output.f = scaled;
output.shape = input.shape;
output.name = name;
return output;
}
Tensor sum_layer(const Tensor &t1, const Tensor &t2, const std::string &name) {
assert(t1.shape == t2.shape);
Func summed;
summed(c, i, j) = t1.f(c, i, j) + t2.f(c, i, j);
Tensor output;
output.f = summed;
output.shape = t1.shape;
output.name = name;
return output;
}
Tensor softmax_layer(const Tensor &input, const int classes, const std::string &name) {
assert(input.shape[0] == classes);
RDom r(0, classes);
Func exp_vals;
// Note: production softmax implementations usually subtract max(input)
// before exp() for numerical stability; that is omitted here.
exp_vals(c) = exp(input.f(c));
Func softmax(name);
softmax(c) = exp_vals(c) / sum(exp_vals(r.x));
Tensor output;
output.f = softmax;
output.shape = input.shape;
output.name = name;
return output;
}
};
}  // namespace
HALIDE_REGISTER_GENERATOR(Resnet50Generator, resnet50)
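// Build sketch (illustrative paths, assuming the standard Halide generator
// workflow with tools/GenGen.cpp providing main()):
//   c++ -std=c++17 Resnet50Generator.cpp $HALIDE_ROOT/tools/GenGen.cpp \
//       -I $HALIDE_ROOT/include -L $HALIDE_ROOT/lib -lHalide -o resnet50.generator
//   ./resnet50.generator -g resnet50 -o . target=host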