#include <iostream>
#include <limits>
#include "CodeGen_C.h"
#include "CodeGen_Internal.h"
#include "Deinterleave.h"
#include "FindIntrinsics.h"
#include "IROperator.h"
#include "Lerp.h"
#include "Param.h"
#include "Simplify.h"
#include "Substitute.h"
#include "Type.h"
#include "Util.h"
#include "Var.h"
namespace Halide {
namespace Internal {
using std::map;
using std::ostream;
using std::ostringstream;
using std::string;
using std::vector;
extern "C" unsigned char halide_internal_initmod_inlined_c[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntime_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeCuda_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeHexagonHost_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeMetal_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenCL_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeQurt_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeD3D12Compute_h[];
namespace {
// HALIDE_MUST_USE_RESULT defined here is intended to exactly
// duplicate the definition in HalideRuntime.h (so that either or
// both can be present, in any order).
// NOTE: this string is emitted verbatim into generated sources; do not
// edit the string body without making the identical change in
// HalideRuntime.h, or the two definitions will conflict.
const char *const kDefineMustUseResult = R"INLINE_CODE(#ifndef HALIDE_MUST_USE_RESULT
#ifdef __has_attribute
#if __has_attribute(nodiscard)
#define HALIDE_MUST_USE_RESULT [[nodiscard]]
#elif __has_attribute(warn_unused_result)
#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result))
#else
#define HALIDE_MUST_USE_RESULT
#endif
#else
#define HALIDE_MUST_USE_RESULT
#endif
#endif
)INLINE_CODE";
// Standard-library includes (plus a C++11 version check) prepended to
// every generated C++ implementation file. Not emitted for header or
// extern-decl output kinds (see the CodeGen_C constructor).
const string headers = R"INLINE_CODE(
/* MACHINE GENERATED By Halide. */
#if !(__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
#error "This code requires C++11 (or later); please upgrade your compiler."
#endif
#include <assert.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <type_traits>
)INLINE_CODE";
// We now add definitions of things in the runtime which are
// intended to be inlined into every module but are only expressed
// in .ll. The redundancy is regrettable (FIXME).
// The string body below is emitted verbatim into every generated C++
// implementation file: math wrappers with Halide's mangled names,
// bit-twiddling helpers, and the RAII-style HalideFreeHelper used for
// freeing temporary allocations. Do not reformat the string contents —
// they are program output, not source style.
const string globals = R"INLINE_CODE(
extern "C" {
int64_t halide_current_time_ns(void *ctx);
void halide_profiler_pipeline_end(void *, void *);
}
#ifdef _WIN32
#ifndef _MT
__declspec(dllimport) float __cdecl roundf(float);
__declspec(dllimport) double __cdecl round(double);
#endif
#else
inline float asinh_f32(float x) {return asinhf(x);}
inline float acosh_f32(float x) {return acoshf(x);}
inline float atanh_f32(float x) {return atanhf(x);}
inline double asinh_f64(double x) {return asinh(x);}
inline double acosh_f64(double x) {return acosh(x);}
inline double atanh_f64(double x) {return atanh(x);}
#endif
inline float sqrt_f32(float x) {return sqrtf(x);}
inline float sin_f32(float x) {return sinf(x);}
inline float asin_f32(float x) {return asinf(x);}
inline float cos_f32(float x) {return cosf(x);}
inline float acos_f32(float x) {return acosf(x);}
inline float tan_f32(float x) {return tanf(x);}
inline float atan_f32(float x) {return atanf(x);}
inline float atan2_f32(float x, float y) {return atan2f(x, y);}
inline float sinh_f32(float x) {return sinhf(x);}
inline float cosh_f32(float x) {return coshf(x);}
inline float tanh_f32(float x) {return tanhf(x);}
inline float hypot_f32(float x, float y) {return hypotf(x, y);}
inline float exp_f32(float x) {return expf(x);}
inline float log_f32(float x) {return logf(x);}
inline float pow_f32(float x, float y) {return powf(x, y);}
inline float floor_f32(float x) {return floorf(x);}
inline float ceil_f32(float x) {return ceilf(x);}
inline float round_f32(float x) {return roundf(x);}
inline double sqrt_f64(double x) {return sqrt(x);}
inline double sin_f64(double x) {return sin(x);}
inline double asin_f64(double x) {return asin(x);}
inline double cos_f64(double x) {return cos(x);}
inline double acos_f64(double x) {return acos(x);}
inline double tan_f64(double x) {return tan(x);}
inline double atan_f64(double x) {return atan(x);}
inline double atan2_f64(double x, double y) {return atan2(x, y);}
inline double sinh_f64(double x) {return sinh(x);}
inline double cosh_f64(double x) {return cosh(x);}
inline double tanh_f64(double x) {return tanh(x);}
inline double hypot_f64(double x, double y) {return hypot(x, y);}
inline double exp_f64(double x) {return exp(x);}
inline double log_f64(double x) {return log(x);}
inline double pow_f64(double x, double y) {return pow(x, y);}
inline double floor_f64(double x) {return floor(x);}
inline double ceil_f64(double x) {return ceil(x);}
inline double round_f64(double x) {return round(x);}
inline float nan_f32() {return NAN;}
inline float neg_inf_f32() {return -INFINITY;}
inline float inf_f32() {return INFINITY;}
inline bool is_nan_f32(float x) {return isnan(x);}
inline bool is_nan_f64(double x) {return isnan(x);}
inline bool is_inf_f32(float x) {return isinf(x);}
inline bool is_inf_f64(double x) {return isinf(x);}
inline bool is_finite_f32(float x) {return isfinite(x);}
inline bool is_finite_f64(double x) {return isfinite(x);}
template<typename A, typename B>
inline A reinterpret(const B &b) {
static_assert(sizeof(A) == sizeof(B), "type size mismatch");
A a;
memcpy(&a, &b, sizeof(a));
return a;
}
inline float float_from_bits(uint32_t bits) {
return reinterpret<float, uint32_t>(bits);
}
template<typename T>
inline int halide_popcount(T a) {
int bits_set = 0;
while (a != 0) {
bits_set += a & 1;
a >>= 1;
}
return bits_set;
}
template<typename T>
inline int halide_count_leading_zeros(T a) {
int leading_zeros = 0;
int bit = sizeof(a) * 8 - 1;
while (bit >= 0 && (a & (((T)1) << bit)) == 0) {
leading_zeros++;
bit--;
}
return leading_zeros;
}
template<typename T>
inline int halide_count_trailing_zeros(T a) {
int trailing_zeros = 0;
constexpr int bits = sizeof(a) * 8;
int bit = 0;
while (bit < bits && (a & (((T)1) << bit)) == 0) {
trailing_zeros++;
bit++;
}
return trailing_zeros;
}
template<typename T>
inline T halide_cpp_max(const T &a, const T &b) {return (a > b) ? a : b;}
template<typename T>
inline T halide_cpp_min(const T &a, const T &b) {return (a < b) ? a : b;}
template<typename T>
inline void halide_unused(const T&) {}
template<typename A, typename B>
const B &return_second(const A &a, const B &b) {
halide_unused(a);
return b;
}
namespace {
class HalideFreeHelper {
typedef void (*FreeFunction)(void *user_context, void *p);
void * user_context;
void *p;
FreeFunction free_function;
public:
HalideFreeHelper(void *user_context, void *p, FreeFunction free_function)
: user_context(user_context), p(p), free_function(free_function) {}
~HalideFreeHelper() { free(); }
void free() {
if (p) {
// TODO: do all free_functions guarantee to ignore a nullptr?
free_function(user_context, p);
p = nullptr;
}
}
};
} // namespace
)INLINE_CODE";
// Walks a statement/expression graph collecting every vector type and
// every ForType encountered, so the backend knows which typedefs and
// loop constructs it must emit.
class TypeInfoGatherer : public IRGraphVisitor {
private:
    using IRGraphVisitor::include;
    using IRGraphVisitor::visit;

    // Record a vector type that the generated code will need a
    // declaration for. Scalars and handle vectors are ignored.
    void include_type(const Type &t) {
        if (!t.is_vector()) {
            return;
        }
        if (t.is_bool()) {
            // bool vectors are always emitted as uint8 in the C++ backend
            // TODO: on some architectures, we could do better by choosing
            // a bitwidth that matches the other vectors in use; EliminateBoolVectors
            // could be used for this with a bit of work.
            vector_types_used.insert(UInt(8).with_lanes(t.lanes()));
            return;
        }
        if (t.is_handle()) {
            // Vector-handle types can be seen when processing (e.g.)
            // require() statements that are vectorized, but they
            // will all be scalarized away prior to use, so don't emit
            // them.
            return;
        }
        vector_types_used.insert(t);
        if (t.is_int()) {
            // If we are including an int-vector type, also include
            // the same-width uint-vector type; there are various operations
            // that can use uint vectors for intermediate results (e.g. lerp(),
            // but also Mod, which can generate a call to abs() for int types,
            // which always produces uint results for int inputs in Halide);
            // it's easier to just err on the side of extra vectors we don't
            // use since they are just type declarations.
            vector_types_used.insert(t.with_code(halide_type_uint));
        }
    }

    // lower_lerp() can synthesize vectors of double the input width;
    // pre-register those for eligible int/uint vector arguments.
    void include_lerp_types(const Type &t) {
        const bool eligible =
            t.is_vector() && t.is_int_or_uint() && t.bits() >= 8 && t.bits() <= 32;
        if (eligible) {
            include_type(t.widen());
        }
    }

protected:
    void include(const Expr &e) override {
        include_type(e.type());
        IRGraphVisitor::include(e);
    }

    // GCC's __builtin_shuffle takes an integer vector of
    // the size of its input vector. Make sure this type exists.
    void visit(const Shuffle *op) override {
        vector_types_used.insert(Int(32, op->vectors[0].type().lanes()));
        IRGraphVisitor::visit(op);
    }

    void visit(const For *op) override {
        for_types_used.insert(op->for_type);
        IRGraphVisitor::visit(op);
    }

    void visit(const Ramp *op) override {
        include_type(op->type.with_lanes(op->lanes));
        IRGraphVisitor::visit(op);
    }

    void visit(const Broadcast *op) override {
        include_type(op->type.with_lanes(op->lanes));
        IRGraphVisitor::visit(op);
    }

    void visit(const Cast *op) override {
        include_type(op->type);
        IRGraphVisitor::visit(op);
    }

    void visit(const Call *op) override {
        include_type(op->type);
        if (op->is_intrinsic(Call::lerp)) {
            // lower_lerp() can synthesize wider vector types.
            for (const auto &a : op->args) {
                include_lerp_types(a.type());
            }
        } else if (op->is_intrinsic()) {
            Expr lowered = lower_intrinsic(op);
            if (lowered.defined()) {
                // Gather types from the lowered form instead of the
                // intrinsic call itself.
                lowered.accept(this);
                return;
            }
        }
        IRGraphVisitor::visit(op);
    }

public:
    std::set<ForType> for_types_used;
    std::set<Type> vector_types_used;
};
} // namespace
/** Construct a C/C++ code generator writing to the given stream.
 *
 * Depending on output_kind this emits one of three preambles:
 *  - header: an include guard (derived from 'guard') plus forward
 *    declarations of the interface types;
 *  - extern decl: minimal forward declarations only;
 *  - implementation: the standard headers, inline runtime globals,
 *    HalideRuntime.h contents, and the inlined-C runtime module.
 * All output kinds end with the HALIDE_MUST_USE_RESULT /
 * HALIDE_FUNCTION_ATTRS fallback definitions.
 */
CodeGen_C::CodeGen_C(ostream &s, const Target &t, OutputKind output_kind, const std::string &guard)
    : IRPrinter(s), id("$$ BAD ID $$"), target(t), output_kind(output_kind),
      extern_c_open(false), inside_atomic_mutex_node(false), emit_atomic_stores(false), using_vector_typedefs(false) {
    if (is_header()) {
        // If it's a header, emit an include guard.
        stream << "#ifndef HALIDE_" << c_print_name(guard) << "\n"
               << "#define HALIDE_" << c_print_name(guard) << "\n"
               << "#include <stdint.h>\n"
               << "\n"
               << "// Forward declarations of the types used in the interface\n"
               << "// to the Halide pipeline.\n"
               << "//\n";
        if (target.has_feature(Target::NoRuntime)) {
            stream << "// For the definitions of these structs, include HalideRuntime.h\n";
        } else {
            stream << "// Definitions for these structs are below.\n";
        }
        stream << "\n"
               << "// Halide's representation of a multi-dimensional array.\n"
               << "// Halide::Runtime::Buffer is a more user-friendly wrapper\n"
               << "// around this. Its declaration is in HalideBuffer.h\n"
               << "struct halide_buffer_t;\n"
               << "\n"
               << "// Metadata describing the arguments to the generated function.\n"
               << "// Used to construct calls to the _argv version of the function.\n"
               << "struct halide_filter_metadata_t;\n"
               << "\n";
        // We just forward declared the following types:
        forward_declared.insert(type_of<halide_buffer_t *>().handle_type);
        forward_declared.insert(type_of<halide_filter_metadata_t *>().handle_type);
    } else if (is_extern_decl()) {
        // Extern decls to be wrapped inside other code (eg python extensions);
        // emit the forward decls with a minimum of noise. Note that we never
        // mess with legacy buffer types in this case.
        stream << "struct halide_buffer_t;\n"
               << "struct halide_filter_metadata_t;\n"
               << "\n";
        forward_declared.insert(type_of<halide_buffer_t *>().handle_type);
        forward_declared.insert(type_of<halide_filter_metadata_t *>().handle_type);
    } else {
        // Include declarations of everything generated C source might want
        stream
            << headers
            << globals
            << halide_internal_runtime_header_HalideRuntime_h << "\n"
            << halide_internal_initmod_inlined_c << "\n";
        stream << "\n";
    }
    // Emitted for every output kind: a fallback HALIDE_MUST_USE_RESULT
    // that matches the definition in HalideRuntime.h.
    stream << kDefineMustUseResult << "\n";
    // Throw in a default (empty) definition of HALIDE_FUNCTION_ATTRS
    // (some hosts may define this to e.g. __attribute__((warn_unused_result)))
    stream << "#ifndef HALIDE_FUNCTION_ATTRS\n";
    stream << "#define HALIDE_FUNCTION_ATTRS\n";
    stream << "#endif\n";
}
// Finish the output: for headers, append the embedded runtime sources
// (unless NoRuntime) plus runtime headers for any enabled device APIs,
// then close the include guard opened by the constructor.
CodeGen_C::~CodeGen_C() {
    set_name_mangling_mode(NameMangling::Default);

    if (!is_header()) {
        return;
    }

    if (!target.has_feature(Target::NoRuntime)) {
        stream << "\n"
               << "// The generated object file that goes with this header\n"
               << "// includes a full copy of the Halide runtime so that it\n"
               << "// can be used standalone. Declarations for the functions\n"
               << "// in the Halide runtime are below.\n";
        const bool windows = (target.os == Target::Windows);
        if (windows) {
            stream
                << "//\n"
                << "// The inclusion of this runtime means that it is not legal\n"
                << "// to link multiple Halide-generated object files together.\n"
                << "// This problem is Windows-specific. On other platforms, we\n"
                << "// use weak linkage.\n";
        } else {
            stream
                << "//\n"
                << "// The runtime is defined using weak linkage, so it is legal\n"
                << "// to link multiple Halide-generated object files together,\n"
                << "// or to clobber any of these functions with your own\n"
                << "// definition.\n";
        }
        stream << "//\n"
               << "// To generate an object file without a full copy of the\n"
               << "// runtime, use the -no_runtime target flag. To generate a\n"
               << "// standalone Halide runtime to use with such object files\n"
               << "// use the -r flag with any Halide generator binary, e.g.:\n"
               << "// $ ./my_generator -r halide_runtime -o . target=host\n"
               << "\n"
               << halide_internal_runtime_header_HalideRuntime_h << "\n";

        // Append the runtime header for each device API the target enables,
        // in a fixed order.
        struct DeviceHeader {
            Target::Feature feature;
            const unsigned char *source;
        };
        const DeviceHeader device_headers[] = {
            {Target::CUDA, halide_internal_runtime_header_HalideRuntimeCuda_h},
            {Target::HVX, halide_internal_runtime_header_HalideRuntimeHexagonHost_h},
            {Target::Metal, halide_internal_runtime_header_HalideRuntimeMetal_h},
            {Target::OpenCL, halide_internal_runtime_header_HalideRuntimeOpenCL_h},
            {Target::OpenGLCompute, halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h},
            {Target::D3D12Compute, halide_internal_runtime_header_HalideRuntimeD3D12Compute_h},
        };
        for (const DeviceHeader &dh : device_headers) {
            if (target.has_feature(dh.feature)) {
                stream << dh.source << "\n";
            }
        }
    }

    // Close the include guard opened in the constructor.
    stream << "#endif\n";
}
void CodeGen_C::add_vector_typedefs(const std::set<Type> &vector_types) {
if (!vector_types.empty()) {
// MSVC has a limit of ~16k for string literals, so split
// up these declarations accordingly
const char *cpp_vector_decl = R"INLINE_CODE(
#if !defined(__has_attribute)
#define __has_attribute(x) 0
#endif
#if !defined(__has_builtin)
#define __has_builtin(x) 0
#endif
namespace {
// We can't use std::array because that has its own overload of operator<, etc,
// which will interfere with ours.
template <typename ElementType, size_t Lanes>
struct CppVector {
ElementType elements[Lanes];
HALIDE_ALWAYS_INLINE
ElementType& operator[](size_t i) {
return elements[i];
}
HALIDE_ALWAYS_INLINE
const ElementType operator[](size_t i) const {
return elements[i];
}
HALIDE_ALWAYS_INLINE
ElementType *data() {
return elements;
}
HALIDE_ALWAYS_INLINE
const ElementType *data() const {
return elements;
}
};
template <typename ElementType_, size_t Lanes_>
class CppVectorOps {
public:
using ElementType = ElementType_ ;
static constexpr size_t Lanes = Lanes_;
using Vec = CppVector<ElementType, Lanes>;
using Mask = CppVector<uint8_t, Lanes>;
CppVectorOps() = delete;
static Vec broadcast(const ElementType v) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = v;
}
return r;
}
static Vec ramp(const ElementType base, const ElementType stride) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = base + stride * i;
}
return r;
}
static Vec load(const void *base, int32_t offset) {
Vec r;
memcpy(r.data(), ((const ElementType*)base + offset), sizeof(ElementType) * Lanes);
return r;
}
static Vec load_gather(const void *base, const CppVector<int32_t, Lanes> &offset) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = ((const ElementType*)base)[offset[i]];
}
return r;
}
static Vec load_predicated(const void *base, const CppVector<int32_t, Lanes> &offset, const Mask &predicate) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
if (predicate[i]) {
r[i] = ((const ElementType*)base)[offset[i]];
}
}
return r;
}
static void store(const Vec &v, void *base, int32_t offset) {
memcpy(((ElementType*)base + offset), v.data(), sizeof(ElementType) * Lanes);
}
static void store_scatter(const Vec &v, void *base, const CppVector<int32_t, Lanes> &offset) {
for (size_t i = 0; i < Lanes; i++) {
((ElementType*)base)[offset[i]] = v[i];
}
}
static void store_predicated(const Vec &v, void *base, const CppVector<int32_t, Lanes> &offset, const Mask &predicate) {
for (size_t i = 0; i < Lanes; i++) {
if (predicate[i]) {
((ElementType*)base)[offset[i]] = v[i];
}
}
}
template<int... Indices>
static Vec shuffle(const Vec &a) {
static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
Vec r = { a[Indices]... };
return r;
}
static Vec replace(const Vec &v, size_t i, const ElementType b) {
Vec r = v;
r[i] = b;
return r;
}
template <typename OtherVec>
static Vec convert_from(const OtherVec &src) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = static_cast<ElementType>(src[i]);
}
return r;
}
static Vec max(const Vec &a, const Vec &b) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = ::halide_cpp_max(a[i], b[i]);
}
return r;
}
static Vec min(const Vec &a, const Vec &b) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = ::halide_cpp_min(a[i], b[i]);
}
return r;
}
static Vec select(const Mask &cond, const Vec &true_value, const Vec &false_value) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = cond[i] ? true_value[i] : false_value[i];
}
return r;
}
static Mask logical_or(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] || b[i] ? 0xff : 0x00;
}
return r;
}
static Mask logical_and(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] && b[i] ? 0xff : 0x00;
}
return r;
}
static Mask lt(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] < b[i] ? 0xff : 0x00;
}
return r;
}
static Mask le(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] <= b[i] ? 0xff : 0x00;
}
return r;
}
static Mask gt(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] > b[i] ? 0xff : 0x00;
}
return r;
}
static Mask ge(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] >= b[i] ? 0xff : 0x00;
}
return r;
}
static Mask eq(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] == b[i] ? 0xff : 0x00;
}
return r;
}
static Mask ne(const Vec &a, const Vec &b) {
CppVector<uint8_t, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] != b[i] ? 0xff : 0x00;
}
return r;
}
};
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator~(const CppVector<ElementType, Lanes> &v) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = ~v[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator!(const CppVector<ElementType, Lanes> &v) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = !v[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator+(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] + b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator-(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] - b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator*(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] * b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator/(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] / b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator%(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] % b[i];
}
return r;
}
template <typename ElementType, size_t Lanes, typename OtherElementType>
CppVector<ElementType, Lanes> operator<<(const CppVector<ElementType, Lanes> &a, const CppVector<OtherElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] << b[i];
}
return r;
}
template <typename ElementType, size_t Lanes, typename OtherElementType>
CppVector<ElementType, Lanes> operator>>(const CppVector<ElementType, Lanes> &a, const CppVector<OtherElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] >> b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator&(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] & b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator|(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] | b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator^(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] ^ b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator+(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] + b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator-(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] - b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator*(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] * b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator/(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] / b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator%(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] % b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator>>(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] >> b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator<<(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] << b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator&(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] & b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator|(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] | b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator^(const CppVector<ElementType, Lanes> &a, const ElementType b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] ^ b;
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator+(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a + b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator-(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a - b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator*(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a * b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator/(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a / b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator%(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a % b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator>>(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a >> b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator<<(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a << b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator&(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a & b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator|(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a | b[i];
}
return r;
}
template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator^(const ElementType a, const CppVector<ElementType, Lanes> &b) {
CppVector<ElementType, Lanes> r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a ^ b[i];
}
return r;
}
} // namespace
)INLINE_CODE";
const char *native_vector_decl = R"INLINE_CODE(
namespace {
#if __has_attribute(ext_vector_type) || __has_attribute(vector_size)
#if __has_attribute(ext_vector_type)
// Clang
template<typename ElementType, size_t Lanes>
using NativeVector __attribute__((ext_vector_type(Lanes), aligned(sizeof(ElementType)))) = ElementType;
#elif __has_attribute(vector_size) || defined(__GNUC__)
// GCC
template<typename ElementType, size_t Lanes>
using NativeVector __attribute__((vector_size(Lanes * sizeof(ElementType)), aligned(sizeof(ElementType)))) = ElementType;
#else
#error
#endif
template<typename T>
struct NativeVectorComparisonType {
using type = void;
};
template<>
struct NativeVectorComparisonType<int8_t> { using type = char; };
template<>
struct NativeVectorComparisonType<int16_t> { using type = int16_t; };
template<>
struct NativeVectorComparisonType<int32_t> { using type = int32_t; };
template<>
struct NativeVectorComparisonType<int64_t> { using type = int64_t; };
template<>
struct NativeVectorComparisonType<uint8_t> { using type = char; };
template<>
struct NativeVectorComparisonType<uint16_t> { using type = int16_t; };
template<>
struct NativeVectorComparisonType<uint32_t> { using type = int32_t; };
template<>
struct NativeVectorComparisonType<uint64_t> { using type = int64_t; };
template<>
struct NativeVectorComparisonType<float> { using type = int32_t; };
template<>
struct NativeVectorComparisonType<double> { using type = int64_t; };
template <typename ElementType_, size_t Lanes_>
class NativeVectorOps {
public:
using ElementType = ElementType_ ;
static constexpr size_t Lanes = Lanes_;
using Vec = NativeVector<ElementType, Lanes>;
using Mask = NativeVector<uint8_t, Lanes>;
NativeVectorOps() = delete;
static Vec broadcast(const ElementType v) {
const Vec zero = {}; // Zero-initialized native vector.
return v - zero;
}
static Vec ramp(const ElementType base, const ElementType stride) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = base + stride * i;
}
return r;
}
static Vec load(const void *base, int32_t offset) {
Vec r;
// We only require Vec to be element-aligned, so we can't safely just read
// directly from memory (might segfault). Use memcpy for safety.
//
// If Vec is a non-power-of-two (e.g. uint8x48), the actual implementation
// might be larger (e.g. it might really be a uint8x64). Only copy the amount
// that is in the logical type, to avoid possible overreads.
memcpy(&r, ((const ElementType*)base + offset), sizeof(ElementType) * Lanes);
return r;
}
static Vec load_gather(const void *base, const NativeVector<int32_t, Lanes> offset) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = ((const ElementType*)base)[offset[i]];
}
return r;
}
static Vec load_predicated(const void *base, const NativeVector<int32_t, Lanes> offset, const NativeVector<uint8_t, Lanes> predicate) {
Vec r;
for (size_t i = 0; i < Lanes; i++) {
if (predicate[i]) {
r[i] = ((const ElementType*)base)[offset[i]];
}
}
return r;
}
static void store(const Vec v, void *base, int32_t offset) {
// We only require Vec to be element-aligned, so we can't safely just write
// directly from memory (might segfault). Use memcpy for safety.
//
// If Vec is a non-power-of-two (e.g. uint8x48), the actual implementation
// might be larger (e.g. it might really be a uint8x64). Only copy the amount
// that is in the logical type, to avoid possible overreads.
memcpy(((ElementType*)base + offset), &v, sizeof(ElementType) * Lanes);
}
static void store_scatter(const Vec v, void *base, const NativeVector<int32_t, Lanes> offset) {
for (size_t i = 0; i < Lanes; i++) {
((ElementType*)base)[offset[i]] = v[i];
}
}
static void store_predicated(const Vec v, void *base, const NativeVector<int32_t, Lanes> offset, const NativeVector<uint8_t, Lanes> predicate) {
for (size_t i = 0; i < Lanes; i++) {
if (predicate[i]) {
((ElementType*)base)[offset[i]] = v[i];
}
}
}
template<int... Indices>
static Vec shuffle(const Vec a) {
static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
#if __has_builtin(__builtin_shufflevector)
// Clang
return __builtin_shufflevector(a, a, Indices...);
#elif __has_builtin(__builtin_shuffle) || defined(__GNUC__)
// GCC
return __builtin_shuffle(a, NativeVector<int, sizeof...(Indices)>{Indices...});
#else
Vec r = { a[Indices]... };
return r;
#endif
}
static Vec replace(Vec v, size_t i, const ElementType b) {
v[i] = b;
return v;
}
template <typename OtherVec>
static Vec convert_from(const OtherVec src) {
#if __has_builtin(__builtin_convertvector)
// Don't use __builtin_convertvector for float->int: it appears to have
// different float->int rounding behavior in at least some situations;
// for now we'll use the much-slower-but-correct explicit C++ code.
// (https://github.com/halide/Halide/issues/2080)
constexpr bool is_float_to_int = std::is_floating_point<OtherVec>::value &&
std::is_integral<Vec>::value;
if (!is_float_to_int) {
return __builtin_convertvector(src, Vec);
}
#endif
// Fallthru for float->int, or degenerate compilers that support native vectors
// but not __builtin_convertvector (Intel?)
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = static_cast<ElementType>(src[i]);
}
return r;
}
static Vec max(const Vec a, const Vec b) {
#if defined(__GNUC__) && !defined(__clang__)
// TODO: GCC doesn't seem to recognize this pattern, and scalarizes instead
return a > b ? a : b;
#else
// Clang doesn't do ternary operator for vectors, but recognizes this pattern
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] > b[i] ? a[i] : b[i];
}
return r;
#endif
}
static Vec min(const Vec a, const Vec b) {
#if defined(__GNUC__) && !defined(__clang__)
// TODO: GCC doesn't seem to recognize this pattern, and scalarizes instead
return a < b ? a : b;
#else
// Clang doesn't do ternary operator for vectors, but recognizes this pattern
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = a[i] < b[i] ? a[i] : b[i];
}
return r;
#endif
}
static Vec select(const Mask cond, const Vec true_value, const Vec false_value) {
#if defined(__GNUC__) && !defined(__clang__)
// This should do the correct lane-wise select.
using T = typename NativeVectorComparisonType<ElementType>::type;
auto b = NativeVectorOps<T, Lanes>::convert_from(cond);
return b ? true_value : false_value;
#else
// Clang doesn't do ternary operator for vectors, but recognizes this pattern
Vec r;
for (size_t i = 0; i < Lanes; i++) {
r[i] = cond[i] ? true_value[i] : false_value[i];
}
return r;
#endif
}
// The relational operators produce signed-int of same width as input; our codegen expects uint8.
static Mask logical_or(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
const NativeVector<T, Lanes> r = a || b;
return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
}
static Mask logical_and(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
const NativeVector<T, Lanes> r = a && b;
return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
}
static Mask lt(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
const NativeVector<T, Lanes> r = a < b;
return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
}
static Mask le(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
const NativeVector<T, Lanes> r = a <= b;
return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
}
static Mask gt(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
const NativeVector<T, Lanes> r = a > b;
return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
}
static Mask ge(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
const NativeVector<T, Lanes> r = a >= b;
return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
}
static Mask eq(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
const NativeVector<T, Lanes> r = a == b;
return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
}
static Mask ne(const Vec a, const Vec b) {
using T = typename NativeVectorComparisonType<ElementType>::type;
const NativeVector<T, Lanes> r = a != b;
return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
}
};
#endif // __has_attribute(ext_vector_type) || __has_attribute(vector_size)
} // namespace
)INLINE_CODE";
const char *vector_selection_decl = R"INLINE_CODE(
// Dec. 1, 2018: Apparently emscripten compilation runs with the __has_attribute true,
// then fails to handle the vector intrinsics later.
#if !defined(__EMSCRIPTEN__) && (__has_attribute(ext_vector_type) || __has_attribute(vector_size))
#if __GNUC__ && !__clang__
// GCC only allows powers-of-two; fall back to CppVector for other widths
#define halide_cpp_use_native_vector(type, lanes) ((lanes & (lanes - 1)) == 0)
#else
#define halide_cpp_use_native_vector(type, lanes) (true)
#endif
#else
// No NativeVector available
#define halide_cpp_use_native_vector(type, lanes) (false)
#endif // __has_attribute(ext_vector_type) || __has_attribute(vector_size)
// Failsafe to allow forcing non-native vectors in case of unruly compilers
#if HALIDE_CPP_ALWAYS_USE_CPP_VECTORS
#undef halide_cpp_use_native_vector
#define halide_cpp_use_native_vector(type, lanes) (false)
#endif
)INLINE_CODE";
// Vodoo fix: on at least one config (our arm32 buildbot running gcc 5.4),
// emitting this long text string was regularly garbled in a predictable pattern;
// flushing the stream before or after heals it. Since C++ codegen is rarely
// on a compilation critical path, we'll just band-aid it in this way.
stream << std::flush;
stream << cpp_vector_decl << native_vector_decl << vector_selection_decl;
stream << std::flush;
for (const auto &t : vector_types) {
string name = print_type(t, DoNotAppendSpace);
string scalar_name = print_type(t.element_of(), DoNotAppendSpace);
stream << "#if halide_cpp_use_native_vector(" << scalar_name << ", " << t.lanes() << ")\n";
stream << "using " << name << " = NativeVector<" << scalar_name << ", " << t.lanes() << ">;\n";
stream << "using " << name << "_ops = NativeVectorOps<" << scalar_name << ", " << t.lanes() << ">;\n";
// Useful for debugging which Vector implementation is being selected
// stream << "#pragma message \"using NativeVector for " << t << "\"\n";
stream << "#else\n";
stream << "using " << name << " = CppVector<" << scalar_name << ", " << t.lanes() << ">;\n";
stream << "using " << name << "_ops = CppVectorOps<" << scalar_name << ", " << t.lanes() << ">;\n";
// Useful for debugging which Vector implementation is being selected
// stream << "#pragma message \"using CppVector for " << t << "\"\n";
stream << "#endif\n";
}
}
using_vector_typedefs = true;
}
void CodeGen_C::set_name_mangling_mode(NameMangling mode) {
    // Maintain the invariant that exactly one `extern "C" {` guard is open
    // while C-mangled declarations are being emitted, and none otherwise.
    const bool want_extern_c = (mode == NameMangling::C);
    if (extern_c_open == want_extern_c) {
        // Already in the requested state; nothing to emit.
        return;
    }
    if (want_extern_c) {
        stream << "\n#ifdef __cplusplus\n";
        stream << "extern \"C\" {\n";
        stream << "#endif\n\n";
    } else {
        stream << "\n#ifdef __cplusplus\n";
        stream << "} // extern \"C\"\n";
        stream << "#endif\n\n";
    }
    extern_c_open = want_extern_c;
}
string CodeGen_C::print_type(Type type, AppendSpaceIfNeeded space_option) {
    // Thin wrapper over the shared helper; the bool selects whether a
    // trailing space is appended to the emitted type name.
    const bool append_space = (space_option == AppendSpace);
    return type_to_c_type(type, append_space);
}
string CodeGen_C::print_reinterpret(Type type, const Expr &e) {
    // Although Halide declares a "Handle" to always be 64 bits, the source
    // "handle" might actually be a 32-bit pointer (e.g. from a function
    // parameter), so reinterpret<> (which just memcpy's) would produce
    // garbage. Use a plain C-style cast whenever a handle is involved.
    string prefix;
    if (type.is_handle() || e.type().is_handle()) {
        prefix = "(" + print_type(type) + ")";
    } else {
        prefix = "reinterpret<" + print_type(type) + ">";
    }
    return prefix + "(" + print_expr(e) + ")";
}
string CodeGen_C::print_name(const string &name) {
    // Sanitize a Halide name into a legal C identifier (shared helper).
    return c_print_name(name);
}
namespace {
// IR visitor that collects every extern / extern-C++ call and destructor
// function referenced by lowered function bodies, so compile(Module) can
// emit prototypes for all of them up front.
class ExternCallPrototypes : public IRGraphVisitor {
    // One node of the namespace tree used to group C++ externs: either a
    // sub-namespace (call == nullptr, children in `names`) or a leaf call.
    struct NamespaceOrCall {
        const Call *call;  // nullptr if this is a subnamespace
        std::map<string, NamespaceOrCall> names;
        NamespaceOrCall(const Call *call = nullptr)
            : call(call) {
        }
    };
    // C++-mangled externs, grouped into a tree by namespace.
    std::map<string, NamespaceOrCall> c_plus_plus_externs;
    // C-linkage externs, keyed by name.
    std::map<string, const Call *> c_externs;
    // Names already declared (or pre-declared by the runtime headers); skipped.
    std::set<std::string> processed;
    // Functions that must be declared `static`.
    std::set<std::string> internal_linkage;
    // Destructor/free functions; all share the signature void(void *, void *).
    std::set<std::string> destructors;

    using IRGraphVisitor::visit;

    void visit(const Call *op) override {
        IRGraphVisitor::visit(op);
        if (!processed.count(op->name)) {
            if (op->call_type == Call::Extern || op->call_type == Call::PureExtern) {
                c_externs.insert({op->name, op});
            } else if (op->call_type == Call::ExternCPlusPlus) {
                // Split "ns1::ns2::fn" and file the call under its namespaces.
                std::vector<std::string> namespaces;
                std::string name = extract_namespaces(op->name, namespaces);
                std::map<string, NamespaceOrCall> *namespace_map = &c_plus_plus_externs;
                for (const auto &ns : namespaces) {
                    auto insertion = namespace_map->insert({ns, NamespaceOrCall()});
                    namespace_map = &insertion.first->second.names;
                }
                namespace_map->insert({name, NamespaceOrCall(op)});
            }
            processed.insert(op->name);
        }
        if (op->is_intrinsic(Call::register_destructor)) {
            // The destructor's name arrives as a StringImm in arg 0.
            internal_assert(op->args.size() == 2);
            const StringImm *fn = op->args[0].as<StringImm>();
            internal_assert(fn);
            destructors.insert(fn->value);
        }
    }

    void visit(const Allocate *op) override {
        IRGraphVisitor::visit(op);
        // Custom free functions also need a prototype.
        if (!op->free_function.empty()) {
            destructors.insert(op->free_function);
        }
    }

    // Emit a single prototype. `name` may be an unqualified leaf name when
    // emitted inside a namespace block; op->name is always fully qualified.
    void emit_function_decl(ostream &stream, const Call *op, const std::string &name) const {
        // op->name (rather than the name arg) since we need the fully-qualified C++ name
        if (internal_linkage.count(op->name)) {
            stream << "static ";
        }
        stream << type_to_c_type(op->type, /* append_space */ true) << name << "(";
        if (function_takes_user_context(name)) {
            // Runtime functions with an implicit leading user-context arg.
            stream << "void *";
            if (!op->args.empty()) {
                stream << ", ";
            }
        }
        for (size_t i = 0; i < op->args.size(); i++) {
            if (i > 0) {
                stream << ", ";
            }
            if (op->args[i].as<StringImm>()) {
                stream << "const char *";
            } else {
                stream << type_to_c_type(op->args[i].type(), true);
            }
        }
        stream << ");\n";
    }

    // Recursively print a namespace subtree (or a leaf prototype).
    void emit_namespace_or_call(ostream &stream, const NamespaceOrCall &ns_or_call, const std::string &name) const {
        if (ns_or_call.call == nullptr) {
            stream << "namespace " << name << " {\n";
            for (const auto &ns_or_call_inner : ns_or_call.names) {
                emit_namespace_or_call(stream, ns_or_call_inner.second, ns_or_call_inner.first);
            }
            stream << "} // namespace " << name << "\n";
        } else {
            emit_function_decl(stream, ns_or_call.call, name);
        }
    }

public:
    ExternCallPrototypes() {
        // Make sure we don't catch calls that are already in the global declarations
        const char *strs[] = {globals.c_str(),
                              (const char *)halide_internal_runtime_header_HalideRuntime_h,
                              (const char *)halide_internal_initmod_inlined_c};
        for (const char *str : strs) {
            // Crude scan: any identifier-like run immediately followed by '('
            // is treated as an already-declared function name.
            // `j` tracks the index of the last non-identifier character seen.
            size_t j = 0;
            for (size_t i = 0; str[i]; i++) {
                char c = str[i];
                if (c == '(' && i > j + 1) {
                    // Could be the end of a function_name.
                    string name(str + j + 1, i - j - 1);
                    processed.insert(name);
                }
                if (('A' <= c && c <= 'Z') ||
                    ('a' <= c && c <= 'z') ||
                    c == '_' ||
                    ('0' <= c && c <= '9')) {
                    // Could be part of a function name.
                } else {
                    j = i;
                }
            }
        }
    }

    // Mark a function as internal so its prototype is emitted `static`.
    void set_internal_linkage(const std::string &name) {
        internal_linkage.insert(name);
    }

    bool has_c_declarations() const {
        return !c_externs.empty();
    }

    bool has_c_plus_plus_declarations() const {
        return !c_plus_plus_externs.empty();
    }

    // Emit all C-linkage prototypes plus destructor prototypes.
    void emit_c_declarations(ostream &stream) const {
        for (const auto &call : c_externs) {
            emit_function_decl(stream, call.second, call.first);
        }
        for (const auto &d : destructors) {
            stream << "void " << d << "(void *, void *);\n";
        }
        stream << "\n";
    }

    // Emit all C++-mangled prototypes, nested in their namespaces.
    void emit_c_plus_plus_declarations(ostream &stream) const {
        for (const auto &ns_or_call : c_plus_plus_externs) {
            emit_namespace_or_call(stream, ns_or_call.second, ns_or_call.first);
        }
        stream << "\n";
    }
};
}  // namespace
void CodeGen_C::forward_declare_type_if_needed(const Type &t) {
    // Only non-simple C++ handle types need a forward declaration, and each
    // distinct handle type needs it exactly once.
    if (!t.handle_type ||
        forward_declared.count(t.handle_type) ||
        t.handle_type->inner_name.cpp_type_type == halide_cplusplus_type_name::Simple) {
        return;
    }
    const auto &inner = t.handle_type->inner_name;
    // Open every enclosing namespace on a single line.
    for (const auto &ns : t.handle_type->namespaces) {
        stream << "namespace " << ns << " { ";
    }
    // Pick the declaration keyword matching the type's kind.
    const char *keyword = nullptr;
    switch (inner.cpp_type_type) {
    case halide_cplusplus_type_name::Simple:
        // Unreachable: filtered out by the early return above.
        break;
    case halide_cplusplus_type_name::Struct:
        keyword = "struct";
        break;
    case halide_cplusplus_type_name::Class:
        keyword = "class";
        break;
    case halide_cplusplus_type_name::Union:
        keyword = "union";
        break;
    case halide_cplusplus_type_name::Enum:
        internal_error << "Passing pointers to enums is unsupported\n";
        break;
    }
    if (keyword) {
        stream << keyword << " " << inner.name << ";";
    }
    // Close the namespaces (all closers are identical, so order is moot).
    for (const auto &ns : t.handle_type->namespaces) {
        (void)ns;
        stream << " }";
    }
    stream << "\n";
    forward_declared.insert(t.handle_type);
}
void CodeGen_C::compile(const Module &input) {
    // Pass 1: gather the vector types and ForTypes used anywhere in the
    // module, so we know which typedefs to emit and whether GPU loops occur.
    TypeInfoGatherer type_info;
    for (const auto &f : input.functions()) {
        if (f.body.defined()) {
            f.body.accept(&type_info);
        }
    }
    uses_gpu_for_loops = (type_info.for_types_used.count(ForType::GPUBlock) ||
                          type_info.for_types_used.count(ForType::GPUThread) ||
                          type_info.for_types_used.count(ForType::GPULane));

    // Forward-declare all the types we need; this needs to happen before
    // we emit function prototypes, since those may need the types.
    stream << "\n";
    for (const auto &f : input.functions()) {
        for (const auto &arg : f.args) {
            forward_declare_type_if_needed(arg.type);
        }
    }
    stream << "\n";

    if (!is_header_or_extern_decl()) {
        // Emit any external-code blobs that are C++.
        for (const ExternalCode &code_blob : input.external_code()) {
            if (code_blob.is_c_plus_plus_source()) {
                stream << "\n";
                stream << "// Begin External Code: " << code_blob.name() << "\n";
                stream.write((const char *)code_blob.contents().data(), code_blob.contents().size());
                stream << "\n";
                stream << "// End External Code: " << code_blob.name() << "\n";
                stream << "\n";
            }
        }

        // Emit the vector typedefs used by the bodies (implementation only).
        add_vector_typedefs(type_info.vector_types_used);

        // Emit prototypes for all external and internal-only functions.
        // Gather them up and do them all up front, to reduce duplicates,
        // and to make it simpler to get internal-linkage functions correct.
        ExternCallPrototypes e;
        for (const auto &f : input.functions()) {
            f.body.accept(&e);
            if (f.linkage == LinkageType::Internal) {
                // We can't tell at the call site if a LoweredFunc is intended to be internal
                // or not, so mark them explicitly.
                e.set_internal_linkage(f.name);
            }
        }
        // C++ prototypes go out first, then C ones; set_name_mangling_mode
        // opens/closes the extern "C" guard as needed between the groups.
        if (e.has_c_plus_plus_declarations()) {
            set_name_mangling_mode(NameMangling::CPlusPlus);
            e.emit_c_plus_plus_declarations(stream);
        }
        if (e.has_c_declarations()) {
            set_name_mangling_mode(NameMangling::C);
            e.emit_c_declarations(stream);
        }
    }

    // Embedded buffers precede the functions that reference them.
    for (const auto &b : input.buffers()) {
        compile(b);
    }
    for (const auto &f : input.functions()) {
        compile(f);
    }
}
void CodeGen_C::compile(const LoweredFunc &f) {
    // Don't put non-external function declarations in headers.
    if (is_header_or_extern_decl() && f.linkage == LinkageType::Internal) {
        return;
    }

    const std::vector<LoweredArgument> &args = f.args;

    // Detect the magic __user_context argument; it changes what gets passed
    // to runtime calls in the emitted body.
    have_user_context = false;
    for (size_t i = 0; i < args.size(); i++) {
        // TODO: check that its type is void *?
        have_user_context |= (args[i].name == "__user_context");
    }

    // Resolve Default mangling from the target, then switch the surrounding
    // extern "C" state to match.
    NameMangling name_mangling = f.name_mangling;
    if (name_mangling == NameMangling::Default) {
        name_mangling = (target.has_feature(Target::CPlusPlusMangling) ? NameMangling::CPlusPlus : NameMangling::C);
    }
    set_name_mangling_mode(name_mangling);

    std::vector<std::string> namespaces;
    std::string simple_name = extract_namespaces(f.name, namespaces);
    if (!is_c_plus_plus_interface()) {
        user_assert(namespaces.empty()) << "Namespace qualifiers not allowed on function name if not compiling with Target::CPlusPlusNameMangling.\n";
    }

    if (!namespaces.empty()) {
        for (const auto &ns : namespaces) {
            stream << "namespace " << ns << " {\n";
        }
        stream << "\n";
    }

    // Emit the function prototype
    if (f.linkage == LinkageType::Internal) {
        // If the function isn't public, mark it static.
        stream << "static ";
    }
    stream << "HALIDE_FUNCTION_ATTRS\n";
    stream << "int " << simple_name << "(";
    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer()) {
            // Buffer args arrive as halide_buffer_t pointers, suffixed _buffer.
            stream << "struct halide_buffer_t *"
                   << print_name(args[i].name)
                   << "_buffer";
        } else {
            stream << print_type(args[i].type, AppendSpace)
                   << print_name(args[i].name);
        }
        if (i < args.size() - 1) {
            stream << ", ";
        }
    }

    if (is_header_or_extern_decl()) {
        // Declaration only.
        stream << ");\n";
    } else {
        stream << ") {\n";
        indent += 1;

        if (uses_gpu_for_loops) {
            // This backend can't execute GPU loops; emit a body that reports
            // the failure at runtime rather than generating incorrect code.
            stream << get_indent() << "halide_error("
                   << (have_user_context ? "const_cast<void *>(__user_context)" : "nullptr")
                   << ", \"C++ Backend does not support gpu_blocks() or gpu_threads() yet, "
                   << "this function will always fail at runtime\");\n";
            stream << get_indent() << "return halide_error_code_device_malloc_failed;\n";
        } else {
            // Emit a local user_context we can pass in all cases, either
            // aliasing __user_context or nullptr.
            stream << get_indent() << "void * const _ucon = "
                   << (have_user_context ? "const_cast<void *>(__user_context)" : "nullptr")
                   << ";\n";
            if (target.has_feature(Target::NoAsserts)) {
                // NOTE(review): no trailing '\n' here, so the next emitted
                // statement lands on the same output line -- confirm intended.
                stream << get_indent() << "halide_unused(_ucon);";
            }
            // Emit the body
            print(f.body);
            // Return success.
            stream << get_indent() << "return 0;\n";
        }

        indent -= 1;
        stream << "}\n";
    }

    if (is_header_or_extern_decl() && f.linkage == LinkageType::ExternalPlusMetadata) {
        // Emit the argv version
        stream << "\nHALIDE_FUNCTION_ATTRS\nint " << simple_name << "_argv(void **args);\n";
        // And also the metadata.
        stream << "\nHALIDE_FUNCTION_ATTRS\nconst struct halide_filter_metadata_t *" << simple_name << "_metadata();\n";
    }

    // Close the namespaces in reverse order of opening.
    if (!namespaces.empty()) {
        stream << "\n";
        for (size_t i = namespaces.size(); i > 0; i--) {
            stream << "} // namespace " << namespaces[i - 1] << "\n";
        }
        stream << "\n";
    }
}
void CodeGen_C::compile(const Buffer<> &buffer) {
    // Emits an embedded buffer as static data plus a halide_buffer_t that
    // wraps it, and a global `<name>_buffer` pointer for pipelines to use.
    // Don't define buffers in headers or extern decls.
    if (is_header_or_extern_decl()) {
        return;
    }
    string name = print_name(buffer.name());
    halide_buffer_t b = *(buffer.raw_buffer());

    // Embedding requires host-side data, and that data must be current.
    // (Fixed: the second message was missing the space before "because".)
    user_assert(b.host) << "Can't embed image: " << buffer.name() << " because it has a null host pointer\n";
    user_assert(!b.device_dirty()) << "Can't embed image: " << buffer.name() << " because it has a dirty device pointer\n";

    // Figure out the offset of the last pixel.
    // (Assumes non-negative strides -- TODO confirm callers guarantee this.)
    size_t num_elems = 1;
    for (int d = 0; d < b.dimensions; d++) {
        num_elems += b.dim[d].stride * (size_t)(b.dim[d].extent - 1);
    }

    // For now, we assume buffers that aren't scalar are constant,
    // while scalars can be mutated. This accommodates all our existing
    // use cases, which is that all buffers are constant, except those
    // used to store stateful module information in offloading runtimes.
    bool is_constant = buffer.dimensions() != 0;

    // If it is an GPU source kernel, we would like to see the actual output, not the
    // uint8 representation. We use a string literal for this.
    if (ends_with(name, "gpu_source_kernels")) {
        stream << "static const char *" << name << "_string = R\"BUFCHARSOURCE(";
        stream.write((char *)b.host, num_elems);
        stream << ")BUFCHARSOURCE\";\n";
        stream << "static const uint8_t *" << name << "_data HALIDE_ATTRIBUTE_ALIGN(32) = (const uint8_t *) "
               << name << "_string;\n";
    } else {
        // Emit the data as a comma-separated byte list, 16 bytes per line.
        stream << "static " << (is_constant ? "const" : "") << " uint8_t " << name << "_data[] HALIDE_ATTRIBUTE_ALIGN(32) = {\n";
        stream << get_indent();
        for (size_t i = 0; i < num_elems * b.type.bytes(); i++) {
            if (i > 0) {
                stream << ",";
                if (i % 16 == 0) {
                    stream << "\n";
                    stream << get_indent();
                } else {
                    stream << " ";
                }
            }
            stream << (int)(b.host[i]);
        }
        stream << "\n};\n";
    }

    // Emit the shape (constant even for scalar buffers)
    stream << "static const halide_dimension_t " << name << "_buffer_shape[] = {";
    for (int i = 0; i < buffer.dimensions(); i++) {
        stream << "halide_dimension_t(" << buffer.dim(i).min()
               << ", " << buffer.dim(i).extent()
               << ", " << buffer.dim(i).stride() << ")";
        if (i < buffer.dimensions() - 1) {
            stream << ", ";
        }
    }
    stream << "};\n";

    Type t = buffer.type();

    // Emit the buffer struct. Note that although our shape and (usually) our host
    // data is const, the buffer itself isn't: embedded buffers in one pipeline
    // can be passed to another pipeline (e.g. for an extern stage), in which
    // case the buffer objects need to be non-const, because the constness
    // (from the POV of the extern stage) is a runtime property.
    stream << "static halide_buffer_t " << name << "_buffer_ = {"
           << "0, "        // device
           << "nullptr, "  // device_interface
           << "const_cast<uint8_t*>(&" << name << "_data[0]), "  // host
           << "0, "        // flags
           << "halide_type_t((halide_type_code_t)(" << (int)t.code() << "), " << t.bits() << ", " << t.lanes() << "), "
           << buffer.dimensions() << ", "
           << "const_cast<halide_dimension_t*>(" << name << "_buffer_shape)};\n";

    // Make a global pointer to it.
    stream << "static halide_buffer_t * const " << name << "_buffer = &" << name << "_buffer_;\n";
}
string CodeGen_C::print_expr(const Expr &e) {
    // Poison `id` first, so a visitor that forgets to set it is easy to
    // spot in the generated output.
    id = "$$ BAD ID $$";
    e.accept(this);
    return id;
}
string CodeGen_C::print_cast_expr(const Type &t, const Expr &e) {
    // Same-lane-count vector-to-vector casts go through the _ops helper's
    // convert_from; everything else is a plain C-style cast.
    const string value = print_expr(e);
    const string dest = print_type(t);
    const bool vector_convert = t.is_vector() &&
                                t.lanes() == e.type().lanes() &&
                                t != e.type();
    if (!vector_convert) {
        return print_assignment(t, "(" + dest + ")(" + value + ")");
    }
    return print_assignment(t, dest + "_ops::convert_from<" + print_type(e.type()) + ">(" + value + ")");
}
void CodeGen_C::print_stmt(const Stmt &s) {
    // Statements write directly to `stream`; there is no id to return.
    s.accept(this);
}
string CodeGen_C::print_assignment(Type t, const std::string &rhs) {
    // Common-subexpression reuse: identical rhs text within the current
    // scope maps to the same emitted local.
    const auto hit = cache.find(rhs);
    if (hit != cache.end()) {
        id = hit->second;
        return id;
    }
    id = unique_name('_');
    // Locals are const in C++ output; plain C output omits the qualifier.
    const bool emit_const = (output_kind == CPlusPlusImplementation);
    stream << get_indent() << print_type(t, AppendSpace)
           << (emit_const ? "const " : "") << id << " = " << rhs << ";\n";
    cache[rhs] = id;
    return id;
}
void CodeGen_C::open_scope() {
    // Entering a new C scope invalidates cached subexpression ids.
    cache.clear();
    stream << get_indent() << "{\n";
    indent++;
}
void CodeGen_C::close_scope(const std::string &comment) {
    // Leaving the scope also invalidates cached subexpression ids.
    cache.clear();
    indent--;
    if (comment.empty()) {
        stream << get_indent() << "}\n";
    } else {
        stream << get_indent() << "} // " << comment << "\n";
    }
}
void CodeGen_C::visit(const Variable *op) {
    // A variable reference is just its sanitized name; nothing is emitted.
    id = print_name(op->name);
}
void CodeGen_C::visit(const Cast *op) {
    // Delegates: vector casts use _ops::convert_from, scalars a C-style cast.
    id = print_cast_expr(op->type, op->value);
}
void CodeGen_C::visit_binop(Type t, const Expr &a, const Expr &b, const char *op) {
    // Emit both operands first (in order), then the combined expression as a
    // cached assignment.
    const string lhs = print_expr(a);
    const string rhs = print_expr(b);
    print_assignment(t, lhs + " " + op + " " + rhs);
}
void CodeGen_C::visit(const Add *op) {
    // Emit "a + b" as a cached assignment.
    visit_binop(op->type, op->a, op->b, "+");
}
void CodeGen_C::visit(const Sub *op) {
    // Emit "a - b" as a cached assignment.
    visit_binop(op->type, op->a, op->b, "-");
}
void CodeGen_C::visit(const Mul *op) {
    // Emit "a * b" as a cached assignment.
    visit_binop(op->type, op->a, op->b, "*");
}
void CodeGen_C::visit(const Div *op) {
    int bits;
    if (is_const_power_of_two_integer(op->b, &bits)) {
        // x / 2^n compiles to a shift right by n. For signed types this
        // relies on arithmetic shift (implementation-defined in C, but
        // universal on the compilers targeted here).
        visit_binop(op->type, op->a, make_const(op->a.type(), bits), ">>");
    } else if (op->type.is_int()) {
        // C's '/' truncates toward zero; lower to Halide's Euclidean
        // division semantics instead.
        print_expr(lower_euclidean_div(op->a, op->b));
    } else {
        // Unsigned and float division agree with C's '/'.
        visit_binop(op->type, op->a, op->b, "/");
    }
}
void CodeGen_C::visit(const Mod *op) {
int bits;
if (is_const_power_of_two_integer(op->b, &bits)) {
visit_binop(op->type, op->a, make_const(op->a.type(), (1 << bits) - 1), "&");
} else if (op->type.is_int()) {
print_expr(lower_euclidean_mod(op->a, op->b));
} else if (op->type.is_float()) {
string arg0 = print_expr(op->a);
string arg1 = print_expr(op->b);
ostringstream rhs;
rhs << "fmod(" << arg0 << ", " << arg1 << ")";
print_assignment(op->type, rhs.str());
} else {
visit_binop(op->type, op->a, op->b, "%");
}
}
void CodeGen_C::visit(const Max *op) {
// clang doesn't support the ternary operator on OpenCL style vectors.
// See: https://bugs.llvm.org/show_bug.cgi?id=33103
if (op->type.is_scalar()) {
print_expr(Call::make(op->type, "::halide_cpp_max", {op->a, op->b}, Call::Extern));
} else {
ostringstream rhs;
rhs << print_type(op->type) << "_ops::max(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
print_assignment(op->type, rhs.str());
}
}
void CodeGen_C::visit(const Min *op) {
// clang doesn't support the ternary operator on OpenCL style vectors.
// See: https://bugs.llvm.org/show_bug.cgi?id=33103
if (op->type.is_scalar()) {
print_expr(Call::make(op->type, "::halide_cpp_min", {op->a, op->b}, Call::Extern));
} else {
ostringstream rhs;
rhs << print_type(op->type) << "_ops::min(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
print_assignment(op->type, rhs.str());
}
}
void CodeGen_C::visit_relop(Type t, const Expr &a, const Expr &b, const char *scalar_op, const char *vector_op) {
    // Without the vector typedefs in scope we can only emit plain C
    // operators; scalars always use them.
    if (t.is_scalar() || !using_vector_typedefs) {
        visit_binop(t, a, b, scalar_op);
        return;
    }
    internal_assert(a.type() == b.type());
    const string lhs = print_expr(a);
    const string rhs = print_expr(b);
    // Vector compares go through the _ops helper, which yields a uint8 mask.
    print_assignment(t, print_type(a.type()) + "_ops::" + vector_op + "(" + lhs + ", " + rhs + ")");
}
void CodeGen_C::visit(const EQ *op) {
    // "==" for scalars; vector compares go through _ops::eq (uint8 mask).
    visit_relop(op->type, op->a, op->b, "==", "eq");
}
void CodeGen_C::visit(const NE *op) {
    // "!=" for scalars; vector compares go through _ops::ne (uint8 mask).
    visit_relop(op->type, op->a, op->b, "!=", "ne");
}
void CodeGen_C::visit(const LT *op) {
    // "<" for scalars; vector compares go through _ops::lt (uint8 mask).
    visit_relop(op->type, op->a, op->b, "<", "lt");
}
void CodeGen_C::visit(const LE *op) {
    // "<=" for scalars; vector compares go through _ops::le (uint8 mask).
    visit_relop(op->type, op->a, op->b, "<=", "le");
}
void CodeGen_C::visit(const GT *op) {
    // ">" for scalars; vector compares go through _ops::gt (uint8 mask).
    visit_relop(op->type, op->a, op->b, ">", "gt");
}
void CodeGen_C::visit(const GE *op) {
    // ">=" for scalars; vector compares go through _ops::ge (uint8 mask).
    visit_relop(op->type, op->a, op->b, ">=", "ge");
}
void CodeGen_C::visit(const Or *op) {
    // "||" for scalars; vectors use _ops::logical_or (uint8 mask).
    visit_relop(op->type, op->a, op->b, "||", "logical_or");
}
void CodeGen_C::visit(const And *op) {
    // "&&" for scalars; vectors use _ops::logical_and (uint8 mask).
    visit_relop(op->type, op->a, op->b, "&&", "logical_and");
}
void CodeGen_C::visit(const Not *op) {
    // Emit "!(a)" as a cached assignment.
    print_assignment(op->type, "!(" + print_expr(op->a) + ")");
}
void CodeGen_C::visit(const IntImm *op) {
    if (op->type == Int(32)) {
        // int32 literals are emitted bare, and are not cached/assigned.
        id = std::to_string(op->value);
    } else {
        // Literal suffix table, indexed by integer_suffix_style; keep the
        // order in sync with that enum.
        static const char *const suffixes[3] = {
            "ll",  // PlainC
            "l",   // OpenCL
            "",    // HLSL
        };
        // Other widths: explicit cast of a suffixed 64-bit literal.
        print_assignment(op->type, "(" + print_type(op->type) + ")(" + std::to_string(op->value) + suffixes[(int)integer_suffix_style] + ")");
    }
}
void CodeGen_C::visit(const UIntImm *op) {
    // Literal suffix table, indexed by integer_suffix_style; keep the order
    // in sync with that enum.
    static const char *const suffixes[3] = {
        "ull",  // PlainC
        "ul",   // OpenCL
        "",     // HLSL
    };
    // Unsigned imms are always emitted as a cast of a suffixed literal.
    print_assignment(op->type, "(" + print_type(op->type) + ")(" + std::to_string(op->value) + suffixes[(int)integer_suffix_style] + ")");
}
void CodeGen_C::visit(const StringImm *op) {
    // Rely on Expr's stream operator to produce a quoted/escaped string
    // literal; use it directly as the id (no assignment emitted).
    ostringstream oss;
    oss << Expr(op);
    id = oss.str();
}
// NaN is the only float/double for which this is true... and
// surprisingly, there doesn't seem to be a portable isnan function
// (dsharlet).
template<typename T>
static bool isnan(T x) {
    // NaN is the only value that compares unequal to itself.
    const bool self_equal = (x == x);
    return !self_equal;
}
template<typename T>
static bool isinf(T x) {
    // Types without an infinity (e.g. integers) can never report true.
    if (!std::numeric_limits<T>::has_infinity) {
        return false;
    }
    const T pos_inf = std::numeric_limits<T>::infinity();
    return x == pos_inf || x == -pos_inf;
}
void CodeGen_C::visit(const FloatImm *op) {
    // Special values map onto runtime helpers; everything else is emitted
    // bit-exactly via float_from_bits.
    if (isnan(op->value)) {
        id = "nan_f32()";
    } else if (isinf(op->value)) {
        if (op->value > 0) {
            id = "inf_f32()";
        } else {
            id = "neg_inf_f32()";
        }
    } else {
        // Write the constant as reinterpreted uint to avoid any bits lost
        // in conversion. (Union type punning is technically UB in C++ but
        // well-supported by the compilers targeted here.)
        union {
            uint32_t as_uint;
            float as_float;
        } u;
        u.as_float = op->value;

        ostringstream oss;
        if (op->type.bits() == 64) {
            // NOTE(review): the value is squeezed through float32 bits even
            // for f64 constants, so precision beyond float appears to be
            // lost here -- confirm whether this is intended.
            oss << "(double) ";
        }
        oss << "float_from_bits(" << u.as_uint << " /* " << u.as_float << " */)";
        print_assignment(op->type, oss.str());
    }
}
void CodeGen_C::visit(const Call *op) {
internal_assert(op->is_extern() || op->is_intrinsic())
<< "Can only codegen extern calls and intrinsics\n";
ostringstream rhs;
// Handle intrinsics first
if (op->is_intrinsic(Call::debug_to_file)) {
internal_assert(op->args.size() == 3);
const StringImm *string_imm = op->args[0].as<StringImm>();
internal_assert(string_imm);
string filename = string_imm->value;
string typecode = print_expr(op->args[1]);
string buffer = print_name(print_expr(op->args[2]));
rhs << "halide_debug_to_file(_ucon, "
<< "\"" << filename << "\", "
<< typecode
<< ", (struct halide_buffer_t *)" << buffer << ")";
} else if (op->is_intrinsic(Call::bitwise_and)) {
internal_assert(op->args.size() == 2);
string a0 = print_expr(op->args[0]);
string a1 = print_expr(op->args[1]);
rhs << a0 << " & " << a1;
} else if (op->is_intrinsic(Call::bitwise_xor)) {
internal_assert(op->args.size() == 2);
string a0 = print_expr(op->args[0]);
string a1 = print_expr(op->args[1]);
rhs << a0 << " ^ " << a1;
} else if (op->is_intrinsic(Call::bitwise_or)) {
internal_assert(op->args.size() == 2);
string a0 = print_expr(op->args[0]);
string a1 = print_expr(op->args[1]);
rhs << a0 << " | " << a1;
} else if (op->is_intrinsic(Call::bitwise_not)) {
internal_assert(op->args.size() == 1);
rhs << "~" << print_expr(op->args[0]);
} else if (op->is_intrinsic(Call::reinterpret)) {
internal_assert(op->args.size() == 1);
rhs << print_reinterpret(op->type, op->args[0]);
} else if (op->is_intrinsic(Call::shift_left)) {
internal_assert(op->args.size() == 2);
string a0 = print_expr(op->args[0]);
string a1 = print_expr(op->args[1]);
rhs << a0 << " << " << a1;
} else if (op->is_intrinsic(Call::shift_right)) {
internal_assert(op->args.size() == 2);
string a0 = print_expr(op->args[0]);
string a1 = print_expr(op->args[1]);
rhs << a0 << " >> " << a1;
} else if (op->is_intrinsic(Call::count_leading_zeros) ||
op->is_intrinsic(Call::count_trailing_zeros) ||
op->is_intrinsic(Call::popcount)) {
internal_assert(op->args.size() == 1);
if (op->args[0].type().is_vector()) {
rhs << print_scalarized_expr(op);
} else {
string a0 = print_expr(op->args[0]);
rhs << "halide_" << op->name << "(" << a0 << ")";
}
} else if (op->is_intrinsic(Call::lerp)) {
internal_assert(op->args.size() == 3);
Expr e = lower_lerp(op->args[0], op->args[1], op->args[2]);
rhs << print_expr(e);
} else if (op->is_intrinsic(Call::absd)) {
internal_assert(op->args.size() == 2);
Expr a = op->args[0];
Expr b = op->args[1];
Expr e = cast(op->type, select(a < b, b - a, a - b));
rhs << print_expr(e);
} else if (op->is_intrinsic(Call::return_second)) {
internal_assert(op->args.size() == 2);
string arg0 = print_expr(op->args[0]);
string arg1 = print_expr(op->args[1]);
rhs << "return_second(" << arg0 << ", " << arg1 << ")";
} else if (op->is_intrinsic(Call::if_then_else)) {
internal_assert(op->args.size() == 3);
string result_id = unique_name('_');
stream << get_indent() << print_type(op->args[1].type(), AppendSpace)
<< result_id << ";\n";
string cond_id = print_expr(op->args[0]);
stream << get_indent() << "if (" << cond_id << ")\n";
open_scope();
string true_case = print_expr(op->args[1]);
stream << get_indent() << result_id << " = " << true_case << ";\n";
close_scope("if " + cond_id);
stream << get_indent() << "else\n";
open_scope();
string false_case = print_expr(op->args[2]);
stream << get_indent() << result_id << " = " << false_case << ";\n";
close_scope("if " + cond_id + " else");
rhs << result_id;
} else if (op->is_intrinsic(Call::require)) {
internal_assert(op->args.size() == 3);
if (op->args[0].type().is_vector()) {
rhs << print_scalarized_expr(op);
} else {
create_assertion(op->args[0], op->args[2]);
rhs << print_expr(op->args[1]);
}
} else if (op->is_intrinsic(Call::abs)) {
internal_assert(op->args.size() == 1);
Expr a0 = op->args[0];
rhs << print_expr(cast(op->type, select(a0 > 0, a0, -a0)));
} else if (op->is_intrinsic(Call::memoize_expr)) {
internal_assert(!op->args.empty());
string arg = print_expr(op->args[0]);
rhs << "(" << arg << ")";
} else if (op->is_intrinsic(Call::alloca)) {
internal_assert(op->args.size() == 1);
internal_assert(op->type.is_handle());
if (op->type == type_of<struct halide_buffer_t *>() &&
Call::as_intrinsic(op->args[0], {Call::size_of_halide_buffer_t})) {
stream << get_indent();
string buf_name = unique_name('b');
stream << "halide_buffer_t " << buf_name << ";\n";
rhs << "&" << buf_name;
} else {
// Make a stack of uint64_ts
string size = print_expr(simplify((op->args[0] + 7) / 8));
stream << get_indent();
string array_name = unique_name('a');
stream << "uint64_t " << array_name << "[" << size << "];";
rhs << "(" << print_type(op->type) << ")(&" << array_name << ")";
}
} else if (op->is_intrinsic(Call::make_struct)) {
if (op->args.empty()) {
internal_assert(op->type.handle_type);
// Add explicit cast so that different structs can't cache to the same value
rhs << "(" << print_type(op->type) << ")(NULL)";
} else if (op->type == type_of<halide_dimension_t *>()) {
// Emit a shape
// Get the args
vector<string> values;
for (size_t i = 0; i < op->args.size(); i++) {
values.push_back(print_expr(op->args[i]));
}
static_assert(sizeof(halide_dimension_t) == 4 * sizeof(int32_t),
"CodeGen_C assumes a halide_dimension_t is four densely-packed int32_ts");
internal_assert(values.size() % 4 == 0);
int dimension = values.size() / 4;
string shape_name = unique_name('s');
stream
<< get_indent() << "struct halide_dimension_t " << shape_name
<< "[" << dimension << "] = {\n";
indent++;
for (int i = 0; i < dimension; i++) {
stream
<< get_indent() << "{"
<< values[i * 4 + 0] << ", "
<< values[i * 4 + 1] << ", "
<< values[i * 4 + 2] << ", "
<< values[i * 4 + 3] << "},\n";
}
indent--;
stream << get_indent() << "};\n";
rhs << shape_name;
} else {
// Emit a declaration like:
// struct {const int f_0, const char f_1, const int f_2} foo = {3, 'c', 4};
// Get the args
vector<string> values;
for (size_t i = 0; i < op->args.size(); i++) {
values.push_back(print_expr(op->args[i]));
}
stream << get_indent() << "struct {\n";
// List the types.
indent++;
for (size_t i = 0; i < op->args.size(); i++) {
stream << get_indent() << "const " << print_type(op->args[i].type()) << " f_" << i << ";\n";
}
indent--;
string struct_name = unique_name('s');
stream << get_indent() << "} " << struct_name << " = {\n";
// List the values.
indent++;
for (size_t i = 0; i < op->args.size(); i++) {
stream << get_indent() << values[i];
if (i < op->args.size() - 1) {
stream << ",";
}
stream << "\n";
}
indent--;
stream << get_indent() << "};\n";
// Return a pointer to it of the appropriate type
// TODO: This is dubious type-punning. We really need to
// find a better way to do this. We dodge the problem for
// the specific case of buffer shapes in the case above.
if (op->type.handle_type) {
rhs << "(" << print_type(op->type) << ")";
}
rhs << "(&" << struct_name << ")";
}
} else if (op->is_intrinsic(Call::stringify)) {
// Rewrite to an snprintf
vector<string> printf_args;
string format_string = "";
for (size_t i = 0; i < op->args.size(); i++) {
Type t = op->args[i].type();
printf_args.push_back(print_expr(op->args[i]));
if (t.is_int()) {
format_string += "%lld";
printf_args[i] = "(long long)(" + printf_args[i] + ")";
} else if (t.is_uint()) {
format_string += "%llu";
printf_args[i] = "(long long unsigned)(" + printf_args[i] + ")";
} else if (t.is_float()) {
if (t.bits() == 32) {
format_string += "%f";
} else {
format_string += "%e";
}
} else if (op->args[i].as<StringImm>()) {
format_string += "%s";
} else {
internal_assert(t.is_handle());
format_string += "%p";
}
}
string buf_name = unique_name('b');
stream << get_indent() << "char " << buf_name << "[1024];\n";
stream << get_indent() << "snprintf(" << buf_name << ", 1024, \"" << format_string << "\", " << with_commas(printf_args) << ");\n";
rhs << buf_name;
} else if (op->is_intrinsic(Call::register_destructor)) {
internal_assert(op->args.size() == 2);
const StringImm *fn = op->args[0].as<StringImm>();
internal_assert(fn);
string arg = print_expr(op->args[1]);
stream << get_indent();
// Make a struct on the stack that calls the given function as a destructor
string struct_name = unique_name('s');
string instance_name = unique_name('d');
stream << "struct " << struct_name << " { "
<< "void * const ucon; "
<< "void * const arg; "
<< "" << struct_name << "(void *ucon, void *a) : ucon(ucon), arg((void *)a) {} "
<< "~" << struct_name << "() { " << fn->value + "(ucon, arg); } "
<< "} " << instance_name << "(_ucon, " << arg << ");\n";
rhs << print_expr(0);
} else if (op->is_intrinsic(Call::div_round_to_zero)) {
rhs << print_expr(op->args[0]) << " / " << print_expr(op->args[1]);
} else if (op->is_intrinsic(Call::mod_round_to_zero)) {
rhs << print_expr(op->args[0]) << " % " << print_expr(op->args[1]);
} else if (op->is_intrinsic(Call::mux)) {
rhs << print_expr(lower_mux(op));
} else if (op->is_intrinsic(Call::signed_integer_overflow)) {
user_error << "Signed integer overflow occurred during constant-folding. Signed"
" integer overflow for int32 and int64 is undefined behavior in"
" Halide.\n";
} else if (op->is_intrinsic(Call::prefetch)) {
user_assert((op->args.size() == 4) && is_const_one(op->args[2]))
<< "Only prefetch of 1 cache line is supported in C backend.\n";
const Variable *base = op->args[0].as<Variable>();
internal_assert(base && base->type.is_handle());
rhs << "__builtin_prefetch("
<< "((" << print_type(op->type) << " *)" << print_name(base->name)
<< " + " << print_expr(op->args[1]) << "), 1)";
} else if (op->is_intrinsic(Call::size_of_halide_buffer_t)) {
rhs << "(sizeof(halide_buffer_t))";
} else if (op->is_intrinsic(Call::strict_float)) {
internal_assert(op->args.size() == 1);
string arg0 = print_expr(op->args[0]);
rhs << "(" << arg0 << ")";
} else if (op->is_intrinsic()) {
Expr lowered = lower_intrinsic(op);
if (lowered.defined()) {
rhs << print_expr(lowered);
} else {
// TODO: other intrinsics
internal_error << "Unhandled intrinsic in C backend: " << op->name << "\n";
}
} else {
// Generic extern calls
rhs << print_extern_call(op);
}
// Special-case halide_print, which has IR that returns int, but really return void.
// The clean thing to do would be to change the definition of halide_print() to return
// an ignored int, but as halide_print() has many overrides downstream (and in third-party
// consumers), this is arguably a simpler fix for allowing halide_print() to work in the C++ backend.
if (op->name == "halide_print") {
stream << get_indent() << rhs.str() << ";\n";
// Make an innocuous assignment value for our caller (probably an Evaluate node) to ignore.
print_assignment(op->type, "0");
} else {
print_assignment(op->type, rhs.str());
}
}
string CodeGen_C::print_scalarized_expr(const Expr &e) {
    // Emit a vector-typed expression one lane at a time: declare an
    // uninitialized vector temporary, then fold each scalar lane into it
    // via successive .replace() calls. Returns the id of the final value.
    const Type vec_type = e.type();
    internal_assert(vec_type.is_vector());
    string result = unique_name('_');
    stream << get_indent() << print_type(vec_type, AppendSpace) << result << ";\n";
    for (int i = 0; i < vec_type.lanes(); i++) {
        // Print the lane's scalar value first so its statements precede
        // the replace() assignment that consumes it.
        const string lane_value = print_expr(extract_lane(e, i));
        ostringstream replace_call;
        replace_call << result << ".replace(" << i << ", " << lane_value << ")";
        result = print_assignment(vec_type, replace_call.str());
    }
    return result;
}
string CodeGen_C::print_extern_call(const Call *op) {
    // Vector-valued extern calls have no direct C equivalent, so emit one
    // scalar call per lane instead.
    if (op->type.is_vector()) {
        return print_scalarized_expr(op);
    }
    vector<string> printed_args;
    printed_args.reserve(op->args.size() + 1);
    for (const Expr &arg : op->args) {
        string a = print_expr(arg);
        // This substitution ensures const correctness for all calls:
        // refer to the user context through the const-correct local alias.
        if (a == "__user_context") {
            a = "_ucon";
        }
        printed_args.push_back(a);
    }
    // Runtime functions that take a user context get it as a leading arg.
    if (function_takes_user_context(op->name)) {
        printed_args.insert(printed_args.begin(), "_ucon");
    }
    ostringstream call;
    call << op->name << "(" << with_commas(printed_args) << ")";
    return call.str();
}
void CodeGen_C::visit(const Load *op) {
    // Emit a cached assignment for a Load. Three shapes are handled:
    // a dense ramp index becomes a vector load, a general vector index
    // becomes a gather (optionally predicated), and a scalar index
    // becomes plain array indexing.
    // TODO: We could replicate the logic in the llvm codegen which decides whether
    // the vector access can be aligned. Doing so would also require introducing
    // aligned type equivalents for all the vector types.
    ostringstream rhs;
    Type t = op->type;
    string name = print_name(op->name);
    // If we're loading a contiguous ramp into a vector, just load the vector
    Expr dense_ramp_base = strided_ramp_base(op->index, 1);
    if (dense_ramp_base.defined() && is_const_one(op->predicate)) {
        internal_assert(t.is_vector());
        string id_ramp_base = print_expr(dense_ramp_base);
        rhs << print_type(t) + "_ops::load(" << name << ", " << id_ramp_base << ")";
    } else if (op->index.type().is_vector()) {
        // If index is a vector, gather vector elements.
        internal_assert(t.is_vector());
        string id_index = print_expr(op->index);
        // An all-true predicate uses the plain gather; otherwise lanes are
        // only loaded where the predicate lane is true.
        if (is_const_one(op->predicate)) {
            rhs << print_type(t) + "_ops::load_gather(" << name << ", " << id_index << ")";
        } else {
            string id_predicate = print_expr(op->predicate);
            rhs << print_type(t) + "_ops::load_predicated(" << name << ", " << id_index << ", " << id_predicate << ")";
        }
    } else {
        // Scalar load.
        user_assert(is_const_one(op->predicate)) << "Predicated scalar load is not supported by C backend.\n";
        string id_index = print_expr(op->index);
        // Cast the buffer pointer if it wasn't declared in this scope with
        // a matching element type.
        bool type_cast_needed = !(allocations.contains(op->name) &&
                                  allocations.get(op->name).type.element_of() == t.element_of());
        if (type_cast_needed) {
            // In pure C++ output, input buffer pointers may be const-qualified.
            const char *const_flag = output_kind == CPlusPlusImplementation ? "const " : "";
            rhs << "((" << const_flag << print_type(t.element_of()) << " *)" << name << ")";
        } else {
            rhs << name;
        }
        rhs << "[" << id_index << "]";
    }
    print_assignment(t, rhs.str());
}
void CodeGen_C::visit(const Store *op) {
    // Emit a store. Three shapes are handled: a dense ramp index becomes
    // a vector store, a general vector index becomes a scatter (optionally
    // predicated), and a scalar index becomes plain array assignment.
    Type t = op->value.type();
    if (inside_atomic_mutex_node) {
        // Fix: the original message was missing the space before the store
        // name, producing e.g. "...for the storebuf is lowered...".
        user_assert(t.is_scalar())
            << "The vectorized atomic operation for the store " << op->name
            << " is lowered into a mutex lock, which does not support vectorization.\n";
    }
    // Issue atomic store if we are in the designated producer.
    if (emit_atomic_stores) {
        stream << "#if defined(_OPENMP)\n";
        stream << "#pragma omp atomic\n";
        stream << "#else\n";
        stream << "#error \"Atomic stores in the C backend are only supported in compilers that support OpenMP.\"\n";
        stream << "#endif\n";
    }
    string id_value = print_expr(op->value);
    string name = print_name(op->name);
    // TODO: We could replicate the logic in the llvm codegen which decides whether
    // the vector access can be aligned. Doing so would also require introducing
    // aligned type equivalents for all the vector types.
    // If we're writing a contiguous ramp, just store the vector.
    Expr dense_ramp_base = strided_ramp_base(op->index, 1);
    if (dense_ramp_base.defined() && is_const_one(op->predicate)) {
        internal_assert(op->value.type().is_vector());
        string id_ramp_base = print_expr(dense_ramp_base);
        stream << get_indent() << print_type(t) + "_ops::store(" << id_value << ", " << name << ", " << id_ramp_base << ");\n";
    } else if (op->index.type().is_vector()) {
        // If index is a vector, scatter vector elements.
        internal_assert(t.is_vector());
        string id_index = print_expr(op->index);
        if (is_const_one(op->predicate)) {
            stream << get_indent() << print_type(t) + "_ops::store_scatter(" << id_value << ", " << name << ", " << id_index << ");\n";
        } else {
            string id_predicate = print_expr(op->predicate);
            stream << get_indent() << print_type(t) + "_ops::store_predicated(" << id_value << ", " << name << ", " << id_index << ", " << id_predicate << ");\n";
        }
    } else {
        // Scalar store.
        user_assert(is_const_one(op->predicate)) << "Predicated scalar store is not supported by C backend.\n";
        // Cast the destination pointer unless it was declared in this scope
        // with exactly the stored type.
        bool type_cast_needed =
            t.is_handle() ||
            !allocations.contains(op->name) ||
            allocations.get(op->name).type != t;
        string id_index = print_expr(op->index);
        stream << get_indent();
        if (type_cast_needed) {
            stream << "((" << print_type(t) << " *)" << name << ")";
        } else {
            stream << name;
        }
        stream << "[" << id_index << "] = " << id_value << ";\n";
    }
    // A store may invalidate any previously cached expression ids.
    cache.clear();
}
void CodeGen_C::visit(const Let *op) {
string id_value = print_expr(op->value);
Expr body = op->body;
if (op->value.type().is_handle()) {
// The body might contain a Load that references this directly
// by name, so we can't rewrite the name.
stream << get_indent() << print_type(op->value.type())
<< " " << print_name(op->name)
<< " = " << id_value << ";\n";
} else {
Expr new_var = Variable::make(op->value.type(), id_value);
body = substitute(op->name, new_var, body);
}
print_expr(body);
}
void CodeGen_C::visit(const Select *op) {
ostringstream rhs;
string type = print_type(op->type);
string true_val = print_expr(op->true_value);
string false_val = print_expr(op->false_value);
string cond = print_expr(op->condition);
// clang doesn't support the ternary operator on OpenCL style vectors.
// See: https://bugs.llvm.org/show_bug.cgi?id=33103
if (op->condition.type().is_scalar()) {
rhs << "(" << type << ")"
<< "(" << cond
<< " ? " << true_val
<< " : " << false_val
<< ")";
} else {
rhs << type << "_ops::select(" << cond << ", " << true_val << ", " << false_val << ")";
}
print_assignment(op->type, rhs.str());
}
void CodeGen_C::visit(const LetStmt *op) {
string id_value = print_expr(op->value);
Stmt body = op->body;
if (op->value.type().is_handle()) {
// The body might contain a Load or Store that references this
// directly by name, so we can't rewrite the name.
stream << get_indent() << print_type(op->value.type())
<< " " << print_name(op->name)
<< " = " << id_value << ";\n";
} else {
Expr new_var = Variable::make(op->value.type(), id_value);
body = substitute(op->name, new_var, body);
}
body.accept(this);
}
// Halide asserts have different semantics to C asserts. They're
// supposed to clean up and make the containing function return
// -1, so we can't use the C version of assert. Instead we convert
// to an if statement.
void CodeGen_C::create_assertion(const string &id_cond, const Expr &message) {
    // `message` is the Int(32) error-code expression whose value the
    // generated function returns when the condition fails.
    internal_assert(!message.defined() || message.type() == Int(32))
        << "Assertion result is not an int: " << message;
    if (target.has_feature(Target::NoAsserts)) {
        // Asserts are disabled: still reference the condition id so the
        // host C compiler doesn't warn about an unused variable.
        stream << get_indent() << "halide_unused(" << id_cond << ");\n";
        return;
    }
    stream << get_indent() << "if (!" << id_cond << ")\n";
    open_scope();
    // Print the message inside the scope so that any statements it emits
    // (e.g. error-call assignments) land within the if-block, before the
    // return that consumes the resulting id.
    string id_msg = print_expr(message);
    stream << get_indent() << "return " << id_msg << ";\n";
    close_scope("");
}
void CodeGen_C::create_assertion(const Expr &cond, const Expr &message) {
create_assertion(print_expr(cond), message);
}
void CodeGen_C::visit(const AssertStmt *op) {
    // Lower the assert to an if-statement that returns the error code
    // (see create_assertion above).
    create_assertion(op->condition, op->message);
}
void CodeGen_C::visit(const ProducerConsumer *op) {
    // Producer/consumer markers carry no semantics in emitted C; write a
    // comment for readability and recurse into the body.
    const char *phase = op->is_producer ? "produce" : "consume";
    stream << get_indent() << "// " << phase << " " << op->name << "\n";
    print_stmt(op->body);
}
void CodeGen_C::visit(const Fork *op) {
// TODO: This doesn't actually work with nested tasks
stream << get_indent() << "#pragma omp parallel\n";
open_scope();
stream << get_indent() << "#pragma omp single\n";
open_scope();
stream << get_indent() << "#pragma omp task\n";
open_scope();
print_stmt(op->first);
close_scope("");
stream << get_indent() << "#pragma omp task\n";
open_scope();
print_stmt(op->rest);
close_scope("");
stream << get_indent() << "#pragma omp taskwait\n";
close_scope("");
close_scope("");
}
void CodeGen_C::visit(const Acquire *op) {
string id_sem = print_expr(op->semaphore);
string id_count = print_expr(op->count);
open_scope();
stream << get_indent() << "while (!halide_semaphore_try_acquire(" << id_sem << ", " << id_count << "))\n";
open_scope();
stream << get_indent() << "#pragma omp taskyield\n";
close_scope("");
op->body.accept(this);
close_scope("");
}
void CodeGen_C::visit(const Atomic *op) {
    // Atomic nodes either protect the body with a named mutex (in which
    // case vectorized stores inside are rejected — see visit(Store)), or
    // request that the stores within be emitted as atomic operations.
    if (!op->mutex_name.empty()) {
        // Fix: grammatical error in the original message ("might causes").
        internal_assert(!inside_atomic_mutex_node)
            << "Nested atomic mutex locks detected. This might cause a deadlock.\n";
        ScopedValue<bool> old_inside_atomic_mutex_node(inside_atomic_mutex_node, true);
        op->body.accept(this);
    } else {
        // Issue atomic stores.
        ScopedValue<bool> old_emit_atomic_stores(emit_atomic_stores, true);
        op->body.accept(this);
    }
}
void CodeGen_C::visit(const For *op) {
string id_min = print_expr(op->min);
string id_extent = print_expr(op->extent);
if (op->for_type == ForType::Parallel) {
stream << get_indent() << "#pragma omp parallel for\n";
} else {
internal_assert(op->for_type == ForType::Serial)
<< "Can only emit serial or parallel for loops to C\n";
}
stream << get_indent() << "for (int "
<< print_name(op->name)
<< " = " << id_min
<< "; "
<< print_name(op->name)
<< " < " << id_min
<< " + " << id_extent
<< "; "
<< print_name(op->name)
<< "++)\n";
open_scope();
op->body.accept(this);
close_scope("for " + print_name(op->name));
}
void CodeGen_C::visit(const Ramp *op) {
Type vector_type = op->type.with_lanes(op->lanes);
string id_base = print_expr(op->base);
string id_stride = print_expr(op->stride);
print_assignment(vector_type, print_type(vector_type) + "_ops::ramp(" + id_base + ", " + id_stride + ")");
}
void CodeGen_C::visit(const Broadcast *op) {
Type vector_type = op->type.with_lanes(op->lanes);
string id_value = print_expr(op->value);
string rhs;
if (op->lanes > 1) {
rhs = print_type(vector_type) + "_ops::broadcast(" + id_value + ")";
} else {
rhs = id_value;
}
print_assignment(vector_type, rhs);
}
void CodeGen_C::visit(const Provide *op) {
    // Provide nodes are expected to be lowered away before codegen;
    // reaching one here is a compiler bug.
    internal_error << "Cannot emit Provide statements as C\n";
}
void CodeGen_C::visit(const Allocate *op) {
    // Emit an allocation using one of three strategies:
    //  - op->new_expr provided: cast the user-supplied pointer;
    //  - small constant size: declare a stack array;
    //  - otherwise: halide_malloc with a 64-bit overflow-checked size and
    //    a free-helper object emitted into the generated code.
    open_scope();
    string op_name = print_name(op->name);
    string op_type = print_type(op->type, AppendSpace);
    // For sizes less than 8k, do a stack allocation
    bool on_stack = false;
    int32_t constant_size;
    string size_id;
    Type size_id_type;
    if (op->new_expr.defined()) {
        // Caller supplied the memory; just cast it to the right type.
        // It is still tracked as a heap allocation so it gets freed.
        Allocation alloc;
        alloc.type = op->type;
        allocations.push(op->name, alloc);
        heap_allocations.push(op->name);
        string new_e = print_expr(op->new_expr);
        stream << get_indent() << op_type << " *" << op_name << " = (" << op_type << "*)" << new_e << ";\n";
    } else {
        constant_size = op->constant_allocation_size();
        if (constant_size > 0) {
            int64_t stack_bytes = (int64_t)constant_size * op->type.bytes();
            if (stack_bytes > ((int64_t(1) << 31) - 1)) {
                user_error << "Total size for allocation "
                           << op->name << " is constant but exceeds 2^31 - 1.\n";
            } else {
                size_id_type = Int(32);
                size_id = print_expr(make_const(size_id_type, constant_size));
                // Stack placement when requested explicitly or when an
                // Auto allocation is small enough to fit.
                if (op->memory_type == MemoryType::Stack ||
                    op->memory_type == MemoryType::Register ||
                    (op->memory_type == MemoryType::Auto &&
                     can_allocation_fit_on_stack(stack_bytes))) {
                    on_stack = true;
                }
            }
        } else {
            // Check that the allocation is not scalar (if it were scalar
            // it would have constant size).
            internal_assert(!op->extents.empty());
            // Fold the extents into a 64-bit size, guarding each step
            // against 32-bit overflow (the sticky > 2^31-1 value survives
            // subsequent multiplies and trips the check below).
            size_id = print_assignment(Int(64), print_expr(op->extents[0]));
            size_id_type = Int(64);
            for (size_t i = 1; i < op->extents.size(); i++) {
                // Make the code a little less cluttered for two-dimensional case
                string new_size_id_rhs;
                string next_extent = print_expr(op->extents[i]);
                if (i > 1) {
                    new_size_id_rhs = "(" + size_id + " > ((int64_t(1) << 31) - 1)) ? " + size_id + " : (" + size_id + " * " + next_extent + ")";
                } else {
                    new_size_id_rhs = size_id + " * " + next_extent;
                }
                size_id = print_assignment(Int(64), new_size_id_rhs);
            }
            stream << get_indent() << "if (("
                   << size_id << " > ((int64_t(1) << 31) - 1)) || (("
                   << size_id << " * sizeof("
                   << op_type << ")) > ((int64_t(1) << 31) - 1)))\n";
            open_scope();
            stream << get_indent();
            // TODO: call halide_error_buffer_allocation_too_large() here instead
            // TODO: call create_assertion() so that NoAssertions works
            stream << "halide_error(_ucon, "
                   << "\"32-bit signed overflow computing size of allocation " << op->name << "\\n\");\n";
            stream << get_indent() << "return -1;\n";
            close_scope("overflow test " + op->name);
        }
        // Check the condition to see if this allocation should actually be created.
        // If the allocation is on the stack, the only condition we can respect is
        // unconditional false (otherwise a non-constant-sized array declaration
        // will be generated).
        if (!on_stack || is_const_zero(op->condition)) {
            Expr conditional_size = Select::make(op->condition,
                                                Variable::make(size_id_type, size_id),
                                                make_const(size_id_type, 0));
            conditional_size = simplify(conditional_size);
            size_id = print_assignment(Int(64), print_expr(conditional_size));
        }
        Allocation alloc;
        alloc.type = op->type;
        allocations.push(op->name, alloc);
        stream << get_indent() << op_type;
        if (on_stack) {
            // Stack path: a plain array declaration.
            stream << op_name
                   << "[" << size_id << "];\n";
        } else {
            // Heap path: halide_malloc sized in elements * element size.
            stream << "*"
                   << op_name
                   << " = ("
                   << op_type
                   << " *)halide_malloc(_ucon, sizeof("
                   << op_type
                   << ")*" << size_id << ");\n";
            heap_allocations.push(op->name);
        }
    }
    if (!on_stack) {
        // Heap (or new_expr) path: fail with out-of-memory on a null
        // pointer, and register a helper so the buffer is freed when the
        // generated scope exits.
        create_assertion(op_name, Call::make(Int(32), "halide_error_out_of_memory", {}, Call::Extern));
        stream << get_indent();
        string free_function = op->free_function.empty() ? "halide_free" : op->free_function;
        stream << "HalideFreeHelper " << op_name << "_free(_ucon, "
               << op_name << ", " << free_function << ");\n";
    }
    op->body.accept(this);
    // Free the memory if it was allocated on the heap and there is no matching
    // Free node.
    print_heap_free(op->name);
    if (allocations.contains(op->name)) {
        allocations.pop(op->name);
    }
    close_scope("alloc " + print_name(op->name));
}
void CodeGen_C::print_heap_free(const std::string &alloc_name) {
    // Only heap allocations registered a HalideFreeHelper; emit a call to
    // its free() and forget the allocation so it isn't freed twice.
    if (!heap_allocations.contains(alloc_name)) {
        return;
    }
    stream << get_indent() << print_name(alloc_name) << "_free.free();\n";
    heap_allocations.pop(alloc_name);
}
void CodeGen_C::visit(const Free *op) {
    // Emit the free for a heap allocation (no-op for stack allocations)
    // and remove the allocation from the in-scope set.
    print_heap_free(op->name);
    allocations.pop(op->name);
}
void CodeGen_C::visit(const Realize *op) {
    // Realize nodes are expected to be lowered away before codegen;
    // reaching one here is a compiler bug.
    internal_error << "Cannot emit realize statements to C\n";
}
void CodeGen_C::visit(const Prefetch *op) {
    // Prefetch statements are expected to be lowered (to prefetch
    // intrinsic calls) before codegen; reaching one here is a compiler bug.
    internal_error << "Cannot emit prefetch statements to C\n";
}
void CodeGen_C::visit(const IfThenElse *op) {
string cond_id = print_expr(op->condition);
stream << get_indent() << "if (" << cond_id << ")\n";
open_scope();
op->then_case.accept(this);
close_scope("if " + cond_id);
if (op->else_case.defined()) {
stream << get_indent() << "else\n";
open_scope();
op->else_case.accept(this);
close_scope("if " + cond_id + " else");
}
}
void CodeGen_C::visit(const Evaluate *op) {
if (is_const(op->value)) {
return;
}
string id = print_expr(op->value);
stream << get_indent() << "halide_unused(" << id << ");\n";
}
void CodeGen_C::visit(const Shuffle *op) {
    // Emit a shuffle: a scalar result indexes directly into one source
    // vector; a vector result goes through the _ops::shuffle helper,
    // concatenating multiple sources via a union first.
    internal_assert(!op->vectors.empty());
    internal_assert(op->vectors[0].type().is_vector());
    for (size_t i = 1; i < op->vectors.size(); i++) {
        internal_assert(op->vectors[0].type() == op->vectors[i].type());
    }
    internal_assert(op->type.lanes() == (int)op->indices.size());
    const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
    for (int i : op->indices) {
        // An index of -1 means "don't care" for that output lane.
        internal_assert(i >= -1 && i < max_index);
    }
    std::vector<string> vecs;
    for (const Expr &v : op->vectors) {
        vecs.push_back(print_expr(v));
    }
    ostringstream rhs;
    if (op->type.is_scalar()) {
        // Deduce which vector we need. Apparently it's not required
        // that all vectors have identical lanes, so a loop is required.
        // Since idx of -1 means "don't care", we'll treat it as 0 to simplify.
        int idx = std::max(0, op->indices[0]);
        for (size_t vec_idx = 0; vec_idx < op->vectors.size(); vec_idx++) {
            const int vec_lanes = op->vectors[vec_idx].type().lanes();
            if (idx < vec_lanes) {
                rhs << vecs[vec_idx] << "[" << idx << "]";
                break;
            }
            idx -= vec_lanes;
        }
        internal_assert(!rhs.str().empty());
    } else {
        string src = vecs[0];
        if (op->vectors.size() > 1) {
            // This code has always assumed/required that all the vectors
            // have identical types, so let's verify
            const Type t0 = op->vectors[0].type();
            for (const auto &v : op->vectors) {
                internal_assert(t0 == v.type());
            }
            // Fix: removed a dead local `ostringstream rhs;` that was
            // declared here and shadowed the result stream above — it was
            // never used, and risked silently swallowing output intended
            // for the outer rhs.
            string storage_name = unique_name('_');
            // Combine them into one vector. Clang emits excellent code via this
            // union approach (typically without going thru memory) for both x64 and arm64.
            stream << get_indent() << "union { "
                   << print_type(t0) << " src[" << vecs.size() << "]; "
                   << print_type(op->type) << " dst; } "
                   << storage_name << " = {{ " << with_commas(vecs) << " }};\n";
            src = storage_name + ".dst";
        }
        rhs << print_type(op->type) << "_ops::shuffle<" << with_commas(op->indices) << ">(" << src << ")";
    }
    print_assignment(op->type, rhs.str());
}
void CodeGen_C::test() {
    // Self-test: lower a small hand-built pipeline to C and compare the
    // emitted source byte-for-byte against a golden string, reporting the
    // first differing line on mismatch.
    LoweredArgument buffer_arg("buf", Argument::OutputBuffer, Int(32), 3, ArgumentEstimates{});
    LoweredArgument float_arg("alpha", Argument::InputScalar, Float(32), 0, ArgumentEstimates{});
    LoweredArgument int_arg("beta", Argument::InputScalar, Int(32), 0, ArgumentEstimates{});
    LoweredArgument user_context_arg("__user_context", Argument::InputScalar, type_of<const void *>(), 0, ArgumentEstimates{});
    vector<LoweredArgument> args = {buffer_arg, float_arg, int_arg, user_context_arg};
    Var x("x");
    Param<float> alpha("alpha");
    Param<int> beta("beta");
    // The stored value exercises Select, print_when, and float comparison.
    Expr e = Select::make(alpha > 4.0f, print_when(x < 1, 3), 2);
    Stmt s = Store::make("buf", e, x, Parameter(), const_true(), ModulusRemainder());
    s = LetStmt::make("x", beta + 1, s);
    s = Block::make(s, Free::make("tmp.stack"));
    // One stack and one heap allocation to cover both Allocate code paths.
    s = Allocate::make("tmp.stack", Int(32), MemoryType::Stack, {127}, const_true(), s);
    s = Allocate::make("tmp.heap", Int(32), MemoryType::Heap, {43, beta}, const_true(), s);
    Expr buf = Variable::make(Handle(), "buf.buffer");
    s = LetStmt::make("buf", Call::make(Handle(), Call::buffer_get_host, {buf}, Call::Extern), s);
    Module m("", get_host_target());
    m.append(LoweredFunc("test1", args, s, LinkageType::External));
    ostringstream source;
    ostringstream macros;
    {
        CodeGen_C cg(source, Target("host"), CodeGen_C::CImplementation);
        cg.compile(m);
    }
    string src = source.str();
    // Expected output: runtime headers and preamble, then the golden body.
    string correct_source =
        headers +
        globals +
        string((const char *)halide_internal_runtime_header_HalideRuntime_h) + '\n' +
        string((const char *)halide_internal_initmod_inlined_c) + '\n' +
        macros.str() + '\n' + kDefineMustUseResult + R"GOLDEN_CODE(
#ifndef HALIDE_FUNCTION_ATTRS
#define HALIDE_FUNCTION_ATTRS
#endif
#ifdef __cplusplus
extern "C" {
#endif
HALIDE_FUNCTION_ATTRS
int test1(struct halide_buffer_t *_buf_buffer, float _alpha, int32_t _beta, void const *__user_context) {
void * const _ucon = const_cast<void *>(__user_context);
void *_0 = _halide_buffer_get_host(_buf_buffer);
void * _buf = _0;
{
int64_t _1 = 43;
int64_t _2 = _1 * _beta;
if ((_2 > ((int64_t(1) << 31) - 1)) || ((_2 * sizeof(int32_t )) > ((int64_t(1) << 31) - 1)))
{
halide_error(_ucon, "32-bit signed overflow computing size of allocation tmp.heap\n");
return -1;
} // overflow test tmp.heap
int64_t _3 = _2;
int32_t *_tmp_heap = (int32_t *)halide_malloc(_ucon, sizeof(int32_t )*_3);
if (!_tmp_heap)
{
int32_t _4 = halide_error_out_of_memory(_ucon);
return _4;
}
HalideFreeHelper _tmp_heap_free(_ucon, _tmp_heap, halide_free);
{
int32_t _tmp_stack[127];
int32_t _5 = _beta + 1;
int32_t _6;
bool _7 = _5 < 1;
if (_7)
{
char b0[1024];
snprintf(b0, 1024, "%lld%s", (long long)(3), "\n");
char const *_8 = b0;
halide_print(_ucon, _8);
int32_t _9 = 0;
int32_t _10 = return_second(_9, 3);
_6 = _10;
} // if _7
else
{
_6 = 3;
} // if _7 else
int32_t _11 = _6;
float _12 = float_from_bits(1082130432 /* 4 */);
bool _13 = _alpha > _12;
int32_t _14 = (int32_t)(_13 ? _11 : 2);
((int32_t *)_buf)[_5] = _14;
} // alloc _tmp_stack
_tmp_heap_free.free();
} // alloc _tmp_heap
return 0;
}
#ifdef __cplusplus
} // extern "C"
#endif
)GOLDEN_CODE";
    if (src != correct_source) {
        // Locate the first differing character, then widen the range to
        // whole-line boundaries for a readable error report.
        int diff = 0;
        while (src[diff] == correct_source[diff]) {
            diff++;
        }
        int diff_end = diff + 1;
        while (diff > 0 && src[diff] != '\n') {
            diff--;
        }
        while (diff_end < (int)src.size() && src[diff_end] != '\n') {
            diff_end++;
        }
        internal_error
            << "Correct source code:\n"
            << correct_source
            << "Actual source code:\n"
            << src
            << "Difference starts at:\n"
            << "Correct: " << correct_source.substr(diff, diff_end - diff) << "\n"
            << "Actual: " << src.substr(diff, diff_end - diff) << "\n";
    }
    std::cout << "CodeGen_C test passed\n";
}
} // namespace Internal
} // namespace Halide