// CodeGen_C.cpp, from https://github.com/halide/Halide
// Tip revision: b5e8217dfbe905ef1f30f7bd584a83dfb9e2260e, authored by Steven Johnson,
// 19 January 2023: "Remove the watchdog timer from generator_main(). It was intended
// to kill pathologically slow builds, but in the environment it was added for (Google
// build servers), it ended up being redundant to existing mechanisms, and removing it
// allows us to remove a dependency on threading libraries in libHalide."
#include <array>
#include <iostream>
#include <limits>

#include "CodeGen_C.h"
#include "CodeGen_Internal.h"
#include "Deinterleave.h"
#include "FindIntrinsics.h"
#include "IROperator.h"
#include "Lerp.h"
#include "Param.h"
#include "Simplify.h"
#include "Substitute.h"
#include "Type.h"
#include "Util.h"
#include "Var.h"

namespace Halide {
namespace Internal {

using std::map;
using std::ostream;
using std::ostringstream;
using std::string;
using std::vector;

extern "C" unsigned char halide_internal_initmod_inlined_c[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntime_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeCuda_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeHexagonHost_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeMetal_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenCL_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeQurt_h[];
extern "C" unsigned char halide_internal_runtime_header_HalideRuntimeD3D12Compute_h[];

namespace {

// HALIDE_MUST_USE_RESULT defined here is intended to exactly
// duplicate the definition in HalideRuntime.h (so that either or
// both can be present, in any order).
const char *const kDefineMustUseResult = R"INLINE_CODE(#ifndef HALIDE_MUST_USE_RESULT
#ifdef __has_attribute
#if __has_attribute(nodiscard)
#define HALIDE_MUST_USE_RESULT [[nodiscard]]
#elif __has_attribute(warn_unused_result)
#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result))
#else
#define HALIDE_MUST_USE_RESULT
#endif
#else
#define HALIDE_MUST_USE_RESULT
#endif
#endif
)INLINE_CODE";

const string headers = R"INLINE_CODE(
/* MACHINE GENERATED By Halide. */

#if !(__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
#error "This code requires C++11 (or later); please upgrade your compiler."
#endif

#include <assert.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <type_traits>
#include <fenv.h>
)INLINE_CODE";

// We now add definitions of things in the runtime which are
// intended to be inlined into every module but are only expressed
// in .ll. The redundancy is regrettable (FIXME).
const string globals = R"INLINE_CODE(
extern "C" {
int64_t halide_current_time_ns(void *ctx);
void halide_profiler_pipeline_end(void *, void *);
struct halide_buffer_t;
char *halide_buffer_to_string(char *, char *, const halide_buffer_t *);
}

#ifdef _WIN32
#ifndef _MT
__declspec(dllimport) float __cdecl roundf(float);
__declspec(dllimport) double __cdecl round(double);
#endif
#else
inline float asinh_f32(float x) {return asinhf(x);}
inline float acosh_f32(float x) {return acoshf(x);}
inline float atanh_f32(float x) {return atanhf(x);}
inline double asinh_f64(double x) {return asinh(x);}
inline double acosh_f64(double x) {return acosh(x);}
inline double atanh_f64(double x) {return atanh(x);}
#endif
inline float sqrt_f32(float x) {return sqrtf(x);}
inline float sin_f32(float x) {return sinf(x);}
inline float asin_f32(float x) {return asinf(x);}
inline float cos_f32(float x) {return cosf(x);}
inline float acos_f32(float x) {return acosf(x);}
inline float tan_f32(float x) {return tanf(x);}
inline float atan_f32(float x) {return atanf(x);}
inline float atan2_f32(float x, float y) {return atan2f(x, y);}
inline float sinh_f32(float x) {return sinhf(x);}
inline float cosh_f32(float x) {return coshf(x);}
inline float tanh_f32(float x) {return tanhf(x);}
inline float hypot_f32(float x, float y) {return hypotf(x, y);}
inline float exp_f32(float x) {return expf(x);}
inline float log_f32(float x) {return logf(x);}
inline float pow_f32(float x, float y) {return powf(x, y);}
inline float floor_f32(float x) {return floorf(x);}
inline float ceil_f32(float x) {return ceilf(x);}

inline double sqrt_f64(double x) {return sqrt(x);}
inline double sin_f64(double x) {return sin(x);}
inline double asin_f64(double x) {return asin(x);}
inline double cos_f64(double x) {return cos(x);}
inline double acos_f64(double x) {return acos(x);}
inline double tan_f64(double x) {return tan(x);}
inline double atan_f64(double x) {return atan(x);}
inline double atan2_f64(double x, double y) {return atan2(x, y);}
inline double sinh_f64(double x) {return sinh(x);}
inline double cosh_f64(double x) {return cosh(x);}
inline double tanh_f64(double x) {return tanh(x);}
inline double hypot_f64(double x, double y) {return hypot(x, y);}
inline double exp_f64(double x) {return exp(x);}
inline double log_f64(double x) {return log(x);}
inline double pow_f64(double x, double y) {return pow(x, y);}
inline double floor_f64(double x) {return floor(x);}
inline double ceil_f64(double x) {return ceil(x);}

inline float nan_f32() {return NAN;}
inline float neg_inf_f32() {return -INFINITY;}
inline float inf_f32() {return INFINITY;}
inline bool is_nan_f32(float x) {return isnan(x);}
inline bool is_nan_f64(double x) {return isnan(x);}
inline bool is_inf_f32(float x) {return isinf(x);}
inline bool is_inf_f64(double x) {return isinf(x);}
inline bool is_finite_f32(float x) {return isfinite(x);}
inline bool is_finite_f64(double x) {return isfinite(x);}

template<typename A, typename B>
inline A reinterpret(const B &b) {
    static_assert(sizeof(A) == sizeof(B), "type size mismatch");
    A a;
    memcpy(&a, &b, sizeof(a));
    return a;
}
inline float float_from_bits(uint32_t bits) {
    return reinterpret<float, uint32_t>(bits);
}
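// For example, float_from_bits(0x3f800000) reconstructs 1.0f from its IEEE-754
// bit pattern; the memcpy inside reinterpret<> avoids the undefined behavior of
// a type-punning pointer cast.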

template<typename T>
inline int halide_popcount(T a) {
    int bits_set = 0;
    while (a != 0) {
        bits_set += a & 1;
        a >>= 1;
    }
    return bits_set;
}

template<typename T>
inline int halide_count_leading_zeros(T a) {
    int leading_zeros = 0;
    int bit = sizeof(a) * 8 - 1;
    while (bit >= 0 && (a & (((T)1) << bit)) == 0) {
        leading_zeros++;
        bit--;
    }
    return leading_zeros;
}

template<typename T>
inline int halide_count_trailing_zeros(T a) {
    int trailing_zeros = 0;
    constexpr int bits = sizeof(a) * 8;
    int bit = 0;
    while (bit < bits && (a & (((T)1) << bit)) == 0) {
        trailing_zeros++;
        bit++;
    }
    return trailing_zeros;
}
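// For example, with 8-bit operands: halide_popcount<uint8_t>(0x0b) == 3,
// halide_count_leading_zeros<uint8_t>(1) == 7, and
// halide_count_trailing_zeros<uint8_t>(8) == 3.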

template<typename T>
inline T halide_cpp_max(const T &a, const T &b) {return (a > b) ? a : b;}

template<typename T>
inline T halide_cpp_min(const T &a, const T &b) {return (a < b) ? a : b;}

template<typename T>
inline void halide_maybe_unused(const T&) {}

template<typename A, typename B>
const B &return_second(const A &a, const B &b) {
    halide_maybe_unused(a);
    return b;
}

namespace {
class HalideFreeHelper {
    typedef void (*FreeFunction)(void *user_context, void *p);
    void *user_context;
    void *p;
    FreeFunction free_function;
public:
    HalideFreeHelper(void *user_context, void *p, FreeFunction free_function)
        : user_context(user_context), p(p), free_function(free_function) {}
    ~HalideFreeHelper() { free(); }
    void free() {
        if (p) {
            // TODO: do all free_functions guarantee to ignore a nullptr?
            free_function(user_context, p);
            p = nullptr;
        }
    }
};
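// Generated pipelines typically construct a HalideFreeHelper on the stack right
// after an allocation succeeds, so the allocation is released on every exit path;
// free() may also be called earlier by generated code, and the nullptr check
// above makes that early call safe.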
} // namespace
)INLINE_CODE";

const char *const constexpr_argument_info_docs = R"INLINE_CODE(
/**
 * This function returns a constexpr array of information about a Halide-generated
 * function's argument signature (e.g., number of arguments, type of each, etc).
 * While this is a subset of the information provided by the existing _metadata
 * function, it has the distinct advantage of allowing one to use the information
 * at compile time (rather than at runtime). This can be quite useful for producing
 * e.g. automatic call wrappers, etc.
 *
 * For instance, to compute the number of Buffers in a Function, one could do something
 * like:
 *
 *      using namespace HalideFunctionInfo;
 *
 *      template<size_t arg_count>
 *      constexpr size_t count_buffers(const std::array<ArgumentInfo, arg_count> args) {
 *          size_t buffer_count = 0;
 *          for (const auto a : args) {
 *              if (a.kind == InputBuffer || a.kind == OutputBuffer) {
 *                  buffer_count++;
 *              }
 *          }
 *          return buffer_count;
 *      }
 *
 *      constexpr size_t count = count_buffers(metadata_tester_argument_info());
 *
 * The value of `count` will be computed entirely at compile-time, with no runtime
 * impact aside from the numerical value of the constant.
 */

)INLINE_CODE";

class TypeInfoGatherer : public IRGraphVisitor {
private:
    using IRGraphVisitor::include;
    using IRGraphVisitor::visit;

    void include_type(const Type &t) {
        if (t.is_vector()) {
            if (t.is_bool()) {
                // bool vectors are always emitted as uint8 in the C++ backend
                // TODO: on some architectures, we could do better by choosing
                // a bitwidth that matches the other vectors in use; EliminateBoolVectors
                // could be used for this with a bit of work.
                vector_types_used.insert(UInt(8).with_lanes(t.lanes()));
            } else if (!t.is_handle()) {
                // Vector-handle types can be seen when processing (e.g.)
                // require() statements that are vectorized, but they
                // will all be scalarized away prior to use, so don't emit
                // them.
                vector_types_used.insert(t);
                if (t.is_int()) {
                    // If we are including an int-vector type, also include
                    // the same-width uint-vector type; there are various operations
                    // that can use uint vectors for intermediate results (e.g. lerp(),
                    // but also Mod, which can generate a call to abs() for int types,
                    // which always produces uint results for int inputs in Halide);
                    // it's easier to just err on the side of extra vectors we don't
                    // use since they are just type declarations.
                    vector_types_used.insert(t.with_code(halide_type_uint));
                }
            }
        }
    }

    void include_lerp_types(const Type &t) {
        if (t.is_vector() && t.is_int_or_uint() && (t.bits() >= 8 && t.bits() <= 32)) {
            include_type(t.widen());
        }
    }

protected:
    void include(const Expr &e) override {
        include_type(e.type());
        IRGraphVisitor::include(e);
    }

    // GCC's __builtin_shuffle takes an integer vector of
    // the size of its input vector. Make sure this type exists.
    void visit(const Shuffle *op) override {
        vector_types_used.insert(Int(32, op->vectors[0].type().lanes()));
        IRGraphVisitor::visit(op);
    }

    void visit(const For *op) override {
        for_types_used.insert(op->for_type);
        IRGraphVisitor::visit(op);
    }

    void visit(const Ramp *op) override {
        include_type(op->type.with_lanes(op->lanes));
        IRGraphVisitor::visit(op);
    }

    void visit(const Broadcast *op) override {
        include_type(op->type.with_lanes(op->lanes));
        IRGraphVisitor::visit(op);
    }

    void visit(const Cast *op) override {
        include_type(op->type);
        IRGraphVisitor::visit(op);
    }

    void visit(const Call *op) override {
        include_type(op->type);
        if (op->is_intrinsic(Call::lerp)) {
            // lower_lerp() can synthesize wider vector types.
            for (const auto &a : op->args) {
                include_lerp_types(a.type());
            }
        } else if (op->is_intrinsic()) {
            Expr lowered = lower_intrinsic(op);
            if (lowered.defined()) {
                lowered.accept(this);
                return;
            }
        }

        IRGraphVisitor::visit(op);
    }

public:
    std::set<ForType> for_types_used;
    std::set<Type> vector_types_used;
};

}  // namespace

CodeGen_C::CodeGen_C(ostream &s, const Target &t, OutputKind output_kind, const std::string &guard)
    : IRPrinter(s), id("$$ BAD ID $$"), target(t), output_kind(output_kind),
      extern_c_open(false), inside_atomic_mutex_node(false), emit_atomic_stores(false), using_vector_typedefs(false) {

    if (output_kind == CPlusPlusFunctionInfoHeader) {
        // If it's a header, emit an include guard.
        stream << "#ifndef HALIDE_FUNCTION_INFO_" << c_print_name(guard) << "\n"
               << "#define HALIDE_FUNCTION_INFO_" << c_print_name(guard) << "\n";
        stream << R"INLINE_CODE(
/* MACHINE GENERATED By Halide. */

#if !(__cplusplus >= 201703L || _MSVC_LANG >= 201703L)
#error "This file requires C++17 or later; please upgrade your compiler."
#endif

#include "HalideRuntime.h"

)INLINE_CODE";

        return;
    }

    if (is_header()) {
        // If it's a header, emit an include guard.
        stream << "#ifndef HALIDE_" << c_print_name(guard) << "\n"
               << "#define HALIDE_" << c_print_name(guard) << "\n"
               << "#include <stdint.h>\n"
               << "\n"
               << "// Forward declarations of the types used in the interface\n"
               << "// to the Halide pipeline.\n"
               << "//\n";
        if (target.has_feature(Target::NoRuntime)) {
            stream << "// For the definitions of these structs, include HalideRuntime.h\n";
        } else {
            stream << "// Definitions for these structs are below.\n";
        }
        stream << "\n"
               << "// Halide's representation of a multi-dimensional array.\n"
               << "// Halide::Runtime::Buffer is a more user-friendly wrapper\n"
               << "// around this. Its declaration is in HalideBuffer.h\n"
               << "struct halide_buffer_t;\n"
               << "\n"
               << "// Metadata describing the arguments to the generated function.\n"
               << "// Used to construct calls to the _argv version of the function.\n"
               << "struct halide_filter_metadata_t;\n"
               << "\n";
        // We just forward declared the following types:
        forward_declared.insert(type_of<halide_buffer_t *>().handle_type);
        forward_declared.insert(type_of<halide_filter_metadata_t *>().handle_type);
    } else if (is_extern_decl()) {
        // Extern decls to be wrapped inside other code (e.g. Python extensions);
        // emit the forward decls with a minimum of noise. Note that we never
        // mess with legacy buffer types in this case.
        stream << "struct halide_buffer_t;\n"
               << "struct halide_filter_metadata_t;\n"
               << "\n";
        forward_declared.insert(type_of<halide_buffer_t *>().handle_type);
        forward_declared.insert(type_of<halide_filter_metadata_t *>().handle_type);
    } else {
        // Include declarations of everything generated C source might want
        stream
            << headers
            << globals
            << halide_internal_runtime_header_HalideRuntime_h << "\n"
            << halide_internal_initmod_inlined_c << "\n";
        stream << "\n";
    }

    stream << kDefineMustUseResult << "\n";

    // Throw in a default (empty) definition of HALIDE_FUNCTION_ATTRS
    // (some hosts may define this to e.g. __attribute__((warn_unused_result)))
    stream << "#ifndef HALIDE_FUNCTION_ATTRS\n";
    stream << "#define HALIDE_FUNCTION_ATTRS\n";
    stream << "#endif\n";
}

CodeGen_C::~CodeGen_C() {
    set_name_mangling_mode(NameMangling::Default);

    if (is_header()) {
        if (!target.has_feature(Target::NoRuntime)) {
            stream << "\n"
                   << "// The generated object file that goes with this header\n"
                   << "// includes a full copy of the Halide runtime so that it\n"
                   << "// can be used standalone. Declarations for the functions\n"
                   << "// in the Halide runtime are below.\n";
            if (target.os == Target::Windows) {
                stream
                    << "//\n"
                    << "// The inclusion of this runtime means that it is not legal\n"
                    << "// to link multiple Halide-generated object files together.\n"
                    << "// This problem is Windows-specific. On other platforms, we\n"
                    << "// use weak linkage.\n";
            } else {
                stream
                    << "//\n"
                    << "// The runtime is defined using weak linkage, so it is legal\n"
                    << "// to link multiple Halide-generated object files together,\n"
                    << "// or to clobber any of these functions with your own\n"
                    << "// definition.\n";
            }
            stream << "//\n"
                   << "// To generate an object file without a full copy of the\n"
                   << "// runtime, use the -no_runtime target flag. To generate a\n"
                   << "// standalone Halide runtime to use with such object files\n"
                   << "// use the -r flag with any Halide generator binary, e.g.:\n"
                   << "// $ ./my_generator -r halide_runtime -o . target=host\n"
                   << "\n"
                   << halide_internal_runtime_header_HalideRuntime_h << "\n";
            if (target.has_feature(Target::CUDA)) {
                stream << halide_internal_runtime_header_HalideRuntimeCuda_h << "\n";
            }
            if (target.has_feature(Target::HVX)) {
                stream << halide_internal_runtime_header_HalideRuntimeHexagonHost_h << "\n";
            }
            if (target.has_feature(Target::Metal)) {
                stream << halide_internal_runtime_header_HalideRuntimeMetal_h << "\n";
            }
            if (target.has_feature(Target::OpenCL)) {
                stream << halide_internal_runtime_header_HalideRuntimeOpenCL_h << "\n";
            }
            if (target.has_feature(Target::OpenGLCompute)) {
                stream << halide_internal_runtime_header_HalideRuntimeOpenGLCompute_h << "\n";
            }
            if (target.has_feature(Target::D3D12Compute)) {
                stream << halide_internal_runtime_header_HalideRuntimeD3D12Compute_h << "\n";
            }
        }
        stream << "#endif\n";
    }
}

void CodeGen_C::add_vector_typedefs(const std::set<Type> &vector_types) {
    if (!vector_types.empty()) {
        // MSVC has a limit of ~16k for string literals, so split
        // up these declarations accordingly
        const char *cpp_vector_decl = R"INLINE_CODE(
#if !defined(__has_attribute)
    #define __has_attribute(x) 0
#endif

#if !defined(__has_builtin)
    #define __has_builtin(x) 0
#endif

namespace {

// We can't use std::array because that has its own overloads of operator<, etc.,
// which will interfere with ours.
template <typename ElementType, size_t Lanes>
struct CppVector {
    ElementType elements[Lanes];

    HALIDE_ALWAYS_INLINE
    ElementType& operator[](size_t i) {
        return elements[i];
    }

    HALIDE_ALWAYS_INLINE
    const ElementType operator[](size_t i) const {
        return elements[i];
    }

    HALIDE_ALWAYS_INLINE
    ElementType *data() {
        return elements;
    }

    HALIDE_ALWAYS_INLINE
    const ElementType *data() const {
        return elements;
    }
};

template <typename ElementType_, size_t Lanes_>
class CppVectorOps {
public:
    using ElementType = ElementType_;
    static constexpr size_t Lanes = Lanes_;

    using Vec = CppVector<ElementType, Lanes>;
    using Mask = CppVector<uint8_t, Lanes>;

    CppVectorOps() = delete;

    static Vec broadcast(const ElementType v) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = v;
        }
        return r;
    }

    static Vec ramp(const ElementType base, const ElementType stride) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = base + stride * i;
        }
        return r;
    }

    static Vec load(const void *base, int32_t offset) {
        Vec r;
        memcpy(r.data(), ((const ElementType*)base + offset), sizeof(ElementType) * Lanes);
        return r;
    }

    static Vec load_gather(const void *base, const CppVector<int32_t, Lanes> &offset) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = ((const ElementType*)base)[offset[i]];
        }
        return r;
    }

    static Vec load_predicated(const void *base, const CppVector<int32_t, Lanes> &offset, const Mask &predicate) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            if (predicate[i]) {
                r[i] = ((const ElementType*)base)[offset[i]];
            }
        }
        return r;
    }

    static void store(const Vec &v, void *base, int32_t offset) {
        memcpy(((ElementType*)base + offset), v.data(), sizeof(ElementType) * Lanes);
    }

    static void store_scatter(const Vec &v, void *base, const CppVector<int32_t, Lanes> &offset) {
        for (size_t i = 0; i < Lanes; i++) {
            ((ElementType*)base)[offset[i]] = v[i];
        }
    }

    static void store_predicated(const Vec &v, void *base, const CppVector<int32_t, Lanes> &offset, const Mask &predicate) {
        for (size_t i = 0; i < Lanes; i++) {
            if (predicate[i]) {
                ((ElementType*)base)[offset[i]] = v[i];
            }
        }
    }

    template<int... Indices, typename InputVec>
    static Vec shuffle(const InputVec &a) {
        static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
        Vec r = { a[Indices]... };
        return r;
    }

    static Vec replace(const Vec &v, size_t i, const ElementType b) {
        Vec r = v;
        r[i] = b;
        return r;
    }

    template <typename OtherVec>
    static Vec convert_from(const OtherVec &src) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = static_cast<ElementType>(src[i]);
        }
        return r;
    }

    static Vec max(const Vec &a, const Vec &b) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = ::halide_cpp_max(a[i], b[i]);
        }
        return r;
    }

    static Vec min(const Vec &a, const Vec &b) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = ::halide_cpp_min(a[i], b[i]);
        }
        return r;
    }

    static Vec select(const Mask &cond, const Vec &true_value, const Vec &false_value) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = cond[i] ? true_value[i] : false_value[i];
        }
        return r;
    }

    static Mask logical_or(const Vec &a, const Vec &b) {
        CppVector<uint8_t, Lanes> r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] || b[i] ? 0xff : 0x00;
        }
        return r;
    }

    static Mask logical_and(const Vec &a, const Vec &b) {
        CppVector<uint8_t, Lanes> r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] && b[i] ? 0xff : 0x00;
        }
        return r;
    }

    static Mask lt(const Vec &a, const Vec &b) {
        CppVector<uint8_t, Lanes> r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] < b[i] ? 0xff : 0x00;
        }
        return r;
    }

    static Mask le(const Vec &a, const Vec &b) {
        CppVector<uint8_t, Lanes> r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] <= b[i] ? 0xff : 0x00;
        }
        return r;
    }

    static Mask gt(const Vec &a, const Vec &b) {
        CppVector<uint8_t, Lanes> r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] > b[i] ? 0xff : 0x00;
        }
        return r;
    }

    static Mask ge(const Vec &a, const Vec &b) {
        CppVector<uint8_t, Lanes> r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] >= b[i] ? 0xff : 0x00;
        }
        return r;
    }

    static Mask eq(const Vec &a, const Vec &b) {
        CppVector<uint8_t, Lanes> r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] == b[i] ? 0xff : 0x00;
        }
        return r;
    }

    static Mask ne(const Vec &a, const Vec &b) {
        CppVector<uint8_t, Lanes> r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] != b[i] ? 0xff : 0x00;
        }
        return r;
    }
};

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator~(const CppVector<ElementType, Lanes> &v) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = ~v[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator!(const CppVector<ElementType, Lanes> &v) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = !v[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator+(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] + b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator-(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] - b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator*(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] * b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator/(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] / b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator%(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] % b[i];
    }
    return r;
}

template <typename ElementType, size_t Lanes, typename OtherElementType>
CppVector<ElementType, Lanes> operator<<(const CppVector<ElementType, Lanes> &a, const CppVector<OtherElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] << b[i];
    }
    return r;
}

template <typename ElementType, size_t Lanes, typename OtherElementType>
CppVector<ElementType, Lanes> operator>>(const CppVector<ElementType, Lanes> &a, const CppVector<OtherElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] >> b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator&(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] & b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator|(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] | b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator^(const CppVector<ElementType, Lanes> &a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] ^ b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator+(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] + b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator-(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] - b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator*(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] * b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator/(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] / b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator%(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] % b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator>>(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] >> b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator<<(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] << b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator&(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] & b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator|(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] | b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator^(const CppVector<ElementType, Lanes> &a, const ElementType b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a[i] ^ b;
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator+(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a + b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator-(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a - b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator*(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a * b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator/(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a / b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator%(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a % b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator>>(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a >> b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator<<(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a << b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator&(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a & b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator|(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a | b[i];
    }
    return r;
}

template<typename ElementType, size_t Lanes>
CppVector<ElementType, Lanes> operator^(const ElementType a, const CppVector<ElementType, Lanes> &b) {
    CppVector<ElementType, Lanes> r;
    for (size_t i = 0; i < Lanes; i++) {
        r[i] = a ^ b[i];
    }
    return r;
}

}  // namespace


)INLINE_CODE";

        const char *native_vector_decl = R"INLINE_CODE(
namespace {

#if __has_attribute(ext_vector_type) || __has_attribute(vector_size)

#if __has_attribute(ext_vector_type)
    // Clang
    template<typename ElementType, size_t Lanes>
    using NativeVector __attribute__((ext_vector_type(Lanes), aligned(sizeof(ElementType)))) = ElementType;
#elif __has_attribute(vector_size) || defined(__GNUC__)
    // GCC
    template<typename ElementType, size_t Lanes>
    using NativeVector __attribute__((vector_size(Lanes * sizeof(ElementType)), aligned(sizeof(ElementType)))) = ElementType;
#else
    #error
#endif

template<typename T>
struct NativeVectorComparisonType {
    using type = void;
};

template<>
struct NativeVectorComparisonType<int8_t> { using type = char; };

template<>
struct NativeVectorComparisonType<int16_t> { using type = int16_t; };

template<>
struct NativeVectorComparisonType<int32_t> { using type = int32_t; };

template<>
struct NativeVectorComparisonType<int64_t> { using type = int64_t; };

template<>
struct NativeVectorComparisonType<uint8_t> { using type = char; };

template<>
struct NativeVectorComparisonType<uint16_t> { using type = int16_t; };

template<>
struct NativeVectorComparisonType<uint32_t> { using type = int32_t; };

template<>
struct NativeVectorComparisonType<uint64_t> { using type = int64_t; };

template<>
struct NativeVectorComparisonType<float> { using type = int32_t; };

template<>
struct NativeVectorComparisonType<double> { using type = int64_t; };

template <typename ElementType_, size_t Lanes_>
class NativeVectorOps {
public:
    using ElementType = ElementType_;
    static constexpr size_t Lanes = Lanes_;

    using Vec = NativeVector<ElementType, Lanes>;
    using Mask = NativeVector<uint8_t, Lanes>;

    NativeVectorOps() = delete;

    static Vec broadcast(const ElementType v) {
        const Vec zero = {}; // Zero-initialized native vector.
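        // With the GCC/Clang vector extensions, a scalar operand of a binary
        // operator is implicitly broadcast across all lanes, so `v - zero`
        // produces a vector whose every lane equals v.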
        return v - zero;
    }

    static Vec ramp(const ElementType base, const ElementType stride) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = base + stride * i;
        }
        return r;
    }

    static Vec load(const void *base, int32_t offset) {
        Vec r;
        // We only require Vec to be element-aligned, so we can't safely just read
        // directly from memory (might segfault). Use memcpy for safety.
        //
        // If Vec is a non-power-of-two (e.g. uint8x48), the actual implementation
        // might be larger (e.g. it might really be a uint8x64). Only copy the amount
        // that is in the logical type, to avoid possible overreads.
        memcpy(&r, ((const ElementType*)base + offset), sizeof(ElementType) * Lanes);
        return r;
    }

    static Vec load_gather(const void *base, const NativeVector<int32_t, Lanes> offset) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = ((const ElementType*)base)[offset[i]];
        }
        return r;
    }

    static Vec load_predicated(const void *base, const NativeVector<int32_t, Lanes> offset, const NativeVector<uint8_t, Lanes> predicate) {
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            if (predicate[i]) {
                r[i] = ((const ElementType*)base)[offset[i]];
            }
        }
        return r;
    }
    static void store(const Vec v, void *base, int32_t offset) {
        // We only require Vec to be element-aligned, so we can't safely just write
        // directly to memory (might segfault). Use memcpy for safety.
        //
        // If Vec is a non-power-of-two (e.g. uint8x48), the actual implementation
        // might be larger (e.g. it might really be a uint8x64). Only copy the amount
        // that is in the logical type, to avoid possible overwrites.
        memcpy(((ElementType*)base + offset), &v, sizeof(ElementType) * Lanes);
    }

    static void store_scatter(const Vec v, void *base, const NativeVector<int32_t, Lanes> offset) {
        for (size_t i = 0; i < Lanes; i++) {
            ((ElementType*)base)[offset[i]] = v[i];
        }
    }

    static void store_predicated(const Vec v, void *base, const NativeVector<int32_t, Lanes> offset, const NativeVector<uint8_t, Lanes> predicate) {
        for (size_t i = 0; i < Lanes; i++) {
            if (predicate[i]) {
                ((ElementType*)base)[offset[i]] = v[i];
            }
        }
    }

    template<int... Indices, typename InputVec>
    static Vec shuffle(const InputVec a) {
        static_assert(sizeof...(Indices) == Lanes, "shuffle() requires an exact match of lanes");
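        // For example, shuffle<3, 2, 1, 0>(a) reverses a 4-lane vector; the
        // lane count of InputVec need not match the lane count of the result.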
#if __has_builtin(__builtin_shufflevector)
        // Exists in Clang and GCC >= 12. GCC's __builtin_shuffle can't
        // be used, because it can't handle changing the number of vector
        // lanes between input and output.
        return __builtin_shufflevector(a, a, Indices...);
#else
        Vec r = { a[Indices]... };
        return r;
#endif
    }

    static Vec replace(Vec v, size_t i, const ElementType b) {
        v[i] = b;
        return v;
    }

    template <typename OtherVec>
    static Vec convert_from(const OtherVec src) {
#if __has_builtin(__builtin_convertvector)
        // Don't use __builtin_convertvector for float->int: it appears to have
        // different float->int rounding behavior in at least some situations;
        // for now we'll use the much-slower-but-correct explicit C++ code.
        // (https://github.com/halide/Halide/issues/2080)
        constexpr bool is_float_to_int =
            std::is_floating_point<typename std::remove_reference<decltype(src[0])>::type>::value &&
            std::is_integral<ElementType>::value;
        if (!is_float_to_int) {
            return __builtin_convertvector(src, Vec);
        }
#endif
        // Fallthrough for float->int, or for degenerate compilers that support
        // native vectors but not __builtin_convertvector (Intel?).
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = static_cast<ElementType>(src[i]);
        }
        return r;
    }

    static Vec max(const Vec a, const Vec b) {
#if defined(__GNUC__) && !defined(__clang__)
        // TODO: GCC doesn't seem to recognize this pattern, and scalarizes instead
        return a > b ? a : b;
#else
        // Clang doesn't do ternary operator for vectors, but recognizes this pattern
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] > b[i] ? a[i] : b[i];
        }
        return r;
#endif
    }

    static Vec min(const Vec a, const Vec b) {
#if defined(__GNUC__) && !defined(__clang__)
        // TODO: GCC doesn't seem to recognize this pattern, and scalarizes instead
        return a < b ? a : b;
#else
        // Clang doesn't do ternary operator for vectors, but recognizes this pattern
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = a[i] < b[i] ? a[i] : b[i];
        }
        return r;
#endif
    }

    static Vec select(const Mask cond, const Vec true_value, const Vec false_value) {
#if defined(__GNUC__) && !defined(__clang__)
        // This should do the correct lane-wise select.
        using T = typename NativeVectorComparisonType<ElementType>::type;
        auto b = NativeVectorOps<T, Lanes>::convert_from(cond);
        return b ? true_value : false_value;
#else
        // Clang doesn't do ternary operator for vectors, but recognizes this pattern
        Vec r;
        for (size_t i = 0; i < Lanes; i++) {
            r[i] = cond[i] ? true_value[i] : false_value[i];
        }
        return r;
#endif
    }

    // The relational operators on native vectors produce a signed-int vector of the
    // same width as the input; our codegen expects a uint8 mask, so convert.
    static Mask logical_or(const Vec a, const Vec b) {
        using T = typename NativeVectorComparisonType<ElementType>::type;
        const NativeVector<T, Lanes> r = a || b;
        return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
    }

    static Mask logical_and(const Vec a, const Vec b) {
        using T = typename NativeVectorComparisonType<ElementType>::type;
        const NativeVector<T, Lanes> r = a && b;
        return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
    }

    static Mask lt(const Vec a, const Vec b) {
        using T = typename NativeVectorComparisonType<ElementType>::type;
        const NativeVector<T, Lanes> r = a < b;
        return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
    }

    static Mask le(const Vec a, const Vec b) {
        using T = typename NativeVectorComparisonType<ElementType>::type;
        const NativeVector<T, Lanes> r = a <= b;
        return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
    }

    static Mask gt(const Vec a, const Vec b) {
        using T = typename NativeVectorComparisonType<ElementType>::type;
        const NativeVector<T, Lanes> r = a > b;
        return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
    }

    static Mask ge(const Vec a, const Vec b) {
        using T = typename NativeVectorComparisonType<ElementType>::type;
        const NativeVector<T, Lanes> r = a >= b;
        return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
    }

    static Mask eq(const Vec a, const Vec b) {
        using T = typename NativeVectorComparisonType<ElementType>::type;
        const NativeVector<T, Lanes> r = a == b;
        return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
    }

    static Mask ne(const Vec a, const Vec b) {
        using T = typename NativeVectorComparisonType<ElementType>::type;
        const NativeVector<T, Lanes> r = a != b;
        return NativeVectorOps<uint8_t, Lanes>::convert_from(r);
    }
};


#endif  // __has_attribute(ext_vector_type) || __has_attribute(vector_size)

}  // namespace

)INLINE_CODE";

        const char *vector_selection_decl = R"INLINE_CODE(
// Dec. 1, 2018: Apparently emscripten compilation reports these __has_attribute
// checks as true, but then fails to handle the vector intrinsics later.
#if !defined(__EMSCRIPTEN__) && (__has_attribute(ext_vector_type) || __has_attribute(vector_size))
    #if __GNUC__ && !__clang__
        // GCC only allows powers-of-two; fall back to CppVector for other widths
        #define halide_cpp_use_native_vector(type, lanes) ((lanes & (lanes - 1)) == 0)
    #else
        #define halide_cpp_use_native_vector(type, lanes) (true)
    #endif
#else
    // No NativeVector available
    #define halide_cpp_use_native_vector(type, lanes) (false)
#endif  // __has_attribute(ext_vector_type) || __has_attribute(vector_size)

// Failsafe to allow forcing non-native vectors in case of unruly compilers
#if HALIDE_CPP_ALWAYS_USE_CPP_VECTORS
    #undef halide_cpp_use_native_vector
    #define halide_cpp_use_native_vector(type, lanes) (false)
#endif

)INLINE_CODE";

        // Voodoo fix: on at least one config (our arm32 buildbot running gcc 5.4),
        // emitting this long text string was regularly garbled in a predictable pattern;
        // flushing the stream before or after heals it. Since C++ codegen is rarely
        // on a compilation critical path, we'll just band-aid it in this way.
        stream << std::flush;
        stream << cpp_vector_decl << native_vector_decl << vector_selection_decl;
        stream << std::flush;
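
        // Sketch of what the loop below emits for each vector type, using a
        // hypothetical 8-lane int32 vector as an example:
        //
        //   #if halide_cpp_use_native_vector(int32_t, 8)
        //   using int32x8_t = NativeVector<int32_t, 8>;
        //   using int32x8_t_ops = NativeVectorOps<int32_t, 8>;
        //   #else
        //   using int32x8_t = CppVector<int32_t, 8>;
        //   using int32x8_t_ops = CppVectorOps<int32_t, 8>;
        //   #endif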

        for (const auto &t : vector_types) {
            string name = print_type(t, DoNotAppendSpace);
            string scalar_name = print_type(t.element_of(), DoNotAppendSpace);
            stream << "#if halide_cpp_use_native_vector(" << scalar_name << ", " << t.lanes() << ")\n";
            stream << "using " << name << " = NativeVector<" << scalar_name << ", " << t.lanes() << ">;\n";
            stream << "using " << name << "_ops = NativeVectorOps<" << scalar_name << ", " << t.lanes() << ">;\n";
            // Useful for debugging which Vector implementation is being selected
            // stream << "#pragma message \"using NativeVector for " << t << "\"\n";
            stream << "#else\n";
            stream << "using " << name << " = CppVector<" << scalar_name << ", " << t.lanes() << ">;\n";
            stream << "using " << name << "_ops = CppVectorOps<" << scalar_name << ", " << t.lanes() << ">;\n";
            // Useful for debugging which Vector implementation is being selected
            // stream << "#pragma message \"using CppVector for " << t << "\"\n";
            stream << "#endif\n";
        }
    }

    using_vector_typedefs = true;
}

void CodeGen_C::set_name_mangling_mode(NameMangling mode) {
    if (extern_c_open && mode != NameMangling::C) {
        stream << R"INLINE_CODE(
#ifdef __cplusplus
}  // extern "C"
#endif

)INLINE_CODE";
        extern_c_open = false;
    } else if (!extern_c_open && mode == NameMangling::C) {
        stream << R"INLINE_CODE(
#ifdef __cplusplus
extern "C" {
#endif

)INLINE_CODE";
        extern_c_open = true;
    }
}

string CodeGen_C::print_type(Type type, AppendSpaceIfNeeded space_option) {
    return type_to_c_type(type, space_option == AppendSpace);
}
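
// For example, reinterpreting a float expression as uint32_t prints a call such
// as "reinterpret<uint32_t>(v)", while a handle source or destination prints a
// C-style cast such as "(void *)(v)" (v standing for the printed subexpression).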

string CodeGen_C::print_reinterpret(Type type, const Expr &e) {
    ostringstream oss;
    if (type.is_handle() || e.type().is_handle()) {
        // Use a c-style cast if either src or dest is a handle --
        // note that although Halide declares a "Handle" to always be 64 bits,
        // the source "handle" might actually be a 32-bit pointer (from
        // a function parameter), so calling reinterpret<> (which just memcpy's)
        // would be garbage-producing.
        oss << "(" << print_type(type) << ")";
    } else {
        oss << "reinterpret<" << print_type(type) << ">";
    }
    oss << "(" << print_expr(e) << ")";
    return oss.str();
}

string CodeGen_C::print_name(const string &name) {
    return c_print_name(name);
}

namespace {
class ExternCallPrototypes : public IRGraphVisitor {
    struct NamespaceOrCall {
        const Call *call;  // nullptr if this is a subnamespace
        std::map<string, NamespaceOrCall> names;
        NamespaceOrCall(const Call *call = nullptr)
            : call(call) {
        }
    };
    std::map<string, NamespaceOrCall> c_plus_plus_externs;
    std::map<string, const Call *> c_externs;
    std::set<std::string> processed;
    std::set<std::string> internal_linkage;
    std::set<std::string> destructors;

    using IRGraphVisitor::visit;

    void visit(const Call *op) override {
        IRGraphVisitor::visit(op);

        if (!processed.count(op->name)) {
            if (op->call_type == Call::Extern || op->call_type == Call::PureExtern) {
                c_externs.insert({op->name, op});
            } else if (op->call_type == Call::ExternCPlusPlus) {
                std::vector<std::string> namespaces;
                std::string name = extract_namespaces(op->name, namespaces);
                std::map<string, NamespaceOrCall> *namespace_map = &c_plus_plus_externs;
                for (const auto &ns : namespaces) {
                    auto insertion = namespace_map->insert({ns, NamespaceOrCall()});
                    namespace_map = &insertion.first->second.names;
                }
                namespace_map->insert({name, NamespaceOrCall(op)});
            }
            processed.insert(op->name);
        }

        if (op->is_intrinsic(Call::register_destructor)) {
            internal_assert(op->args.size() == 2);
            const StringImm *fn = op->args[0].as<StringImm>();
            internal_assert(fn);
            destructors.insert(fn->value);
        }
    }

    void visit(const Allocate *op) override {
        IRGraphVisitor::visit(op);
        if (!op->free_function.empty()) {
            destructors.insert(op->free_function);
        }
    }

    void emit_function_decl(ostream &stream, const Call *op, const std::string &name) const {
        // op->name (rather than the name arg) since we need the fully-qualified C++ name
        if (internal_linkage.count(op->name)) {
            stream << "static ";
        }
        stream << type_to_c_type(op->type, /* append_space */ true) << name << "(";
        if (function_takes_user_context(name)) {
            stream << "void *";
            if (!op->args.empty()) {
                stream << ", ";
            }
        }
        for (size_t i = 0; i < op->args.size(); i++) {
            if (i > 0) {
                stream << ", ";
            }
            if (op->args[i].as<StringImm>()) {
                stream << "const char *";
            } else {
                stream << type_to_c_type(op->args[i].type(), true);
            }
        }
        stream << ");\n";
    }

    void emit_namespace_or_call(ostream &stream, const NamespaceOrCall &ns_or_call, const std::string &name) const {
        if (ns_or_call.call == nullptr) {
            stream << "namespace " << name << " {\n";
            for (const auto &ns_or_call_inner : ns_or_call.names) {
                emit_namespace_or_call(stream, ns_or_call_inner.second, ns_or_call_inner.first);
            }
            stream << "} // namespace " << name << "\n";
        } else {
            emit_function_decl(stream, ns_or_call.call, name);
        }
    }

public:
    ExternCallPrototypes() {
        // Make sure we don't catch calls that are already in the global declarations
        const char *strs[] = {globals.c_str(),
                              (const char *)halide_internal_runtime_header_HalideRuntime_h,
                              (const char *)halide_internal_initmod_inlined_c};
        for (const char *str : strs) {
            size_t j = 0;
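            // j trails the scan position, marking the most recent character that
            // cannot be part of an identifier; when a '(' is found, str[j+1..i-1]
            // is a candidate function name that should not be re-declared.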
            for (size_t i = 0; str[i]; i++) {
                char c = str[i];
                if (c == '(' && i > j + 1) {
                    // Could be the end of a function_name.
                    string name(str + j + 1, i - j - 1);
                    processed.insert(name);
                }

                if (('A' <= c && c <= 'Z') ||
                    ('a' <= c && c <= 'z') ||
                    c == '_' ||
                    ('0' <= c && c <= '9')) {
                    // Could be part of a function name.
                } else {
                    j = i;
                }
            }
        }
    }

    void set_internal_linkage(const std::string &name) {
        internal_linkage.insert(name);
    }

    bool has_c_declarations() const {
        return !c_externs.empty();
    }

    bool has_c_plus_plus_declarations() const {
        return !c_plus_plus_externs.empty();
    }

    void emit_c_declarations(ostream &stream) const {
        for (const auto &call : c_externs) {
            emit_function_decl(stream, call.second, call.first);
        }
        for (const auto &d : destructors) {
            stream << "void " << d << "(void *, void *);\n";
        }
        stream << "\n";
    }

    void emit_c_plus_plus_declarations(ostream &stream) const {
        for (const auto &ns_or_call : c_plus_plus_externs) {
            emit_namespace_or_call(stream, ns_or_call.second, ns_or_call.first);
        }
        stream << "\n";
    }
};
}  // namespace

void CodeGen_C::forward_declare_type_if_needed(const Type &t) {
    if (!t.handle_type ||
        forward_declared.count(t.handle_type) ||
        t.handle_type->inner_name.cpp_type_type == halide_cplusplus_type_name::Simple) {
        return;
    }
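    // For a hypothetical class handle type Foo::Bar, the code below emits:
    //
    //   namespace Foo { class Bar; }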
    for (const auto &ns : t.handle_type->namespaces) {
        stream << "namespace " << ns << " { ";
    }
    switch (t.handle_type->inner_name.cpp_type_type) {
    case halide_cplusplus_type_name::Simple:
        // nothing
        break;
    case halide_cplusplus_type_name::Struct:
        stream << "struct " << t.handle_type->inner_name.name << ";";
        break;
    case halide_cplusplus_type_name::Class:
        stream << "class " << t.handle_type->inner_name.name << ";";
        break;
    case halide_cplusplus_type_name::Union:
        stream << "union " << t.handle_type->inner_name.name << ";";
        break;
    case halide_cplusplus_type_name::Enum:
        internal_error << "Passing pointers to enums is unsupported\n";
        break;
    }
    for (const auto &ns : t.handle_type->namespaces) {
        (void)ns;
        stream << " }";
    }
    stream << "\n";
    forward_declared.insert(t.handle_type);
}

void CodeGen_C::emit_argv_wrapper(const std::string &function_name,
                                  const std::vector<LoweredArgument> &args) {
    if (is_header_or_extern_decl()) {
        stream << "\nHALIDE_FUNCTION_ATTRS\nint " << function_name << "_argv(void **args);\n";
        return;
    }
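
    // Sketch of the wrapper emitted below, for a hypothetical pipeline
    // `int f(halide_buffer_t *in, int32_t k)`:
    //
    //   HALIDE_FUNCTION_ATTRS
    //   int f_argv(void **args) {
    //       return f(
    //           (halide_buffer_t *)args[0],
    //           *(int32_t const *)args[1]
    //       );
    //   }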

    stream << "\nHALIDE_FUNCTION_ATTRS\nint " << function_name << "_argv(void **args) {\n";
    indent += 1;

    stream << get_indent() << "return " << function_name << "(\n";
    indent += 1;

    for (size_t i = 0; i < args.size(); i++) {
        if (args[i].is_buffer()) {
            stream << get_indent() << "(halide_buffer_t *)args[" << i << "]";
        } else {
            stream << get_indent() << "*(" << type_to_c_type(args[i].type, false) << " const *)args[" << i << "]";
        }
        if (i + 1 < args.size()) {
            stream << ",";
        }
        stream << "\n";
    }

    indent -= 1;
    stream << ");\n";

    indent -= 1;
    stream << "}";
}

void CodeGen_C::emit_metadata_getter(const std::string &function_name,
                                     const std::vector<LoweredArgument> &args,
                                     const MetadataNameMap &metadata_name_map) {
    if (is_header_or_extern_decl()) {
        stream << "\nHALIDE_FUNCTION_ATTRS\nconst struct halide_filter_metadata_t *" << function_name << "_metadata();\n";
        return;
    }

    auto map_name = [&metadata_name_map](const std::string &from) -> std::string {
        auto it = metadata_name_map.find(from);
        return it == metadata_name_map.end() ? from : it->second;
    };

    stream << "\nHALIDE_FUNCTION_ATTRS\nconst struct halide_filter_metadata_t *" << function_name << "_metadata() {\n";

    indent += 1;

    static const char *const kind_names[] = {
        "halide_argument_kind_input_scalar",
        "halide_argument_kind_input_buffer",
        "halide_argument_kind_output_buffer",
    };

    static const char *const type_code_names[] = {
        "halide_type_int",
        "halide_type_uint",
        "halide_type_float",
        "halide_type_handle",
        "halide_type_bfloat",
    };

    std::set<int64_t> constant_int64_in_use;
    const auto emit_constant_int64 = [this, &constant_int64_in_use](Expr e) -> std::string {
        if (!e.defined()) {
            return "nullptr";
        }

        internal_assert(!e.type().is_handle()) << "Should never see Handle types here.";
        if (!is_const(e)) {
            e = simplify(e);
            internal_assert(is_const(e)) << "Should only see constant values here.";
        }

        const IntImm *int_imm = e.as<IntImm>();
        internal_assert(int_imm && int_imm->type == Int(64));

        const std::string id = "const_" + std::to_string(int_imm->value);
        if (!constant_int64_in_use.count(int_imm->value)) {
            stream << get_indent() << "static const int64_t " << id << " = " << int_imm->value << "LL;\n";
            constant_int64_in_use.insert(int_imm->value);
        }
        return "&" + id;
    };

    int next_scalar_value_id = 0;
    const auto emit_constant_scalar_value = [this, &next_scalar_value_id](Expr e) -> std::string {
        if (!e.defined()) {
            return "nullptr";
        }

        internal_assert(!e.type().is_handle()) << "Should never see Handle types here.";
        if (!is_const(e)) {
            e = simplify(e);
            internal_assert(is_const(e)) << "Should only see constant values here.";
        }

        const IntImm *int_imm = e.as<IntImm>();
        const UIntImm *uint_imm = e.as<UIntImm>();
        const FloatImm *float_imm = e.as<FloatImm>();
        internal_assert(int_imm || uint_imm || float_imm);
        std::string value;
        if (int_imm) {
            value = std::to_string(int_imm->value);
        } else if (uint_imm) {
            value = std::to_string(uint_imm->value);
        } else if (float_imm) {
            value = std::to_string(float_imm->value);
        }

        std::string c_type = type_to_c_type(e.type(), false);
        std::string id = "halide_scalar_value_" + std::to_string(next_scalar_value_id++);

        // It's important that we allocate a full scalar_value_t_type here,
        // even if the type of the value is smaller; downstream consumers should
        // be able to correctly load an entire scalar_value_t_type regardless of its
        // type, and if we emit just (say) a uint8 value here, the pointer may be
        // misaligned and/or the storage after may be unmapped. We'll fake it by
        // making a constant array of the elements we need, setting the first to the
        // constant we want, and setting the rest to all-zeros. (This happens to work because
        // sizeof(halide_scalar_value_t) is evenly divisible by sizeof(any-union-field).)

        const size_t value_size = e.type().bytes();
        internal_assert(value_size > 0 && value_size <= sizeof(halide_scalar_value_t));

        const size_t array_size = sizeof(halide_scalar_value_t) / value_size;
        internal_assert(array_size * value_size == sizeof(halide_scalar_value_t));

        stream << get_indent() << "alignas(alignof(halide_scalar_value_t)) static const " << c_type << " " << id << "[" << array_size << "] = {" << value;
        for (size_t i = 1; i < array_size; i++) {
            stream << ", 0";
        }
        stream << "};\n";

        return "(const halide_scalar_value_t *)&" + id;
    };

    for (const auto &arg : args) {
        const auto legalized_name = c_print_name(map_name(arg.name));

        auto argument_estimates = arg.argument_estimates;
        if (arg.type.is_handle()) {
            // Handle values are always emitted into metadata as "undefined", regardless of
            // what sort of Expr is provided.
            argument_estimates = ArgumentEstimates{};
        }

        const auto defined_count = [](const Region &r) -> size_t {
            size_t c = 0;
            for (const auto &be : r) {
                c += be.min.defined() ? 1 : 0;
                c += be.extent.defined() ? 1 : 0;
            }
            return c;
        };

        std::string buffer_estimates_array_ptr = "nullptr";
        if (arg.is_buffer() && defined_count(argument_estimates.buffer_estimates) > 0) {
            internal_assert((int)argument_estimates.buffer_estimates.size() == arg.dimensions);
            std::vector<std::string> constants;
            for (const auto &be : argument_estimates.buffer_estimates) {
                Expr min = be.min;
                if (min.defined()) {
                    min = cast<int64_t>(min);
                }
                Expr extent = be.extent;
                if (extent.defined()) {
                    extent = cast<int64_t>(extent);
                }
                constants.push_back(emit_constant_int64(min));
                constants.push_back(emit_constant_int64(extent));
            }

            stream << get_indent() << "static const int64_t * const buffer_estimates_" << legalized_name << "[" << (int)arg.dimensions * 2 << "] = {\n";
            indent += 1;
            for (const auto &c : constants) {
                stream << get_indent() << c << ",\n";
            }
            indent -= 1;
            stream << get_indent() << "};\n";
        } else {
            stream << get_indent() << "int64_t const *const *buffer_estimates_" << legalized_name << " = nullptr;\n";
        }

        auto scalar_def = emit_constant_scalar_value(argument_estimates.scalar_def);
        auto scalar_min = emit_constant_scalar_value(argument_estimates.scalar_min);
        auto scalar_max = emit_constant_scalar_value(argument_estimates.scalar_max);
        auto scalar_estimate = emit_constant_scalar_value(argument_estimates.scalar_estimate);

        stream << get_indent() << "const halide_scalar_value_t *scalar_def_" << legalized_name << " = " << scalar_def << ";\n";
        stream << get_indent() << "const halide_scalar_value_t *scalar_min_" << legalized_name << " = " << scalar_min << ";\n";
        stream << get_indent() << "const halide_scalar_value_t *scalar_max_" << legalized_name << " = " << scalar_max << ";\n";
        stream << get_indent() << "const halide_scalar_value_t *scalar_estimate_" << legalized_name << " = " << scalar_estimate << ";\n";
    }

    stream << get_indent() << "static const halide_filter_argument_t args[" << args.size() << "] = {\n";
    indent += 1;
    for (const auto &arg : args) {
        const auto name = map_name(arg.name);
        const auto legalized_name = c_print_name(name);

        stream << get_indent() << "{\n";
        indent += 1;
        stream << get_indent() << "\"" << name << "\",\n";
        internal_assert(arg.kind < sizeof(kind_names) / sizeof(kind_names[0]));
        stream << get_indent() << kind_names[arg.kind] << ",\n";
        stream << get_indent() << (int)arg.dimensions << ",\n";
        internal_assert(arg.type.code() < sizeof(type_code_names) / sizeof(type_code_names[0]));
        stream << get_indent() << "{" << type_code_names[arg.type.code()] << ", " << (int)arg.type.bits() << ", " << (int)arg.type.lanes() << "},\n";
        stream << get_indent() << "scalar_def_" << legalized_name << ",\n";
        stream << get_indent() << "scalar_min_" << legalized_name << ",\n";
        stream << get_indent() << "scalar_max_" << legalized_name << ",\n";
        stream << get_indent() << "scalar_estimate_" << legalized_name << ",\n";
        stream << get_indent() << "buffer_estimates_" << legalized_name << ",\n";
        stream << get_indent() << "},\n";
        indent -= 1;
    }
    stream << get_indent() << "};\n";
    indent -= 1;

    stream << get_indent() << "static const halide_filter_metadata_t md = {\n";

    indent += 1;

    stream << get_indent() << "halide_filter_metadata_t::VERSION,\n";
    stream << get_indent() << args.size() << ",\n";
    stream << get_indent() << "args,\n";
    stream << get_indent() << "\"" << target.to_string() << "\",\n";
    stream << get_indent() << "\"" << function_name << "\",\n";
    stream << get_indent() << "};\n";
    indent -= 1;

    stream << get_indent() << "return &md;\n";

    indent -= 1;

    stream << "}\n";
}
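
// A condensed sketch of what the getter above emits, for a hypothetical
// pipeline `f` with a single int32 scalar input `x` and no estimates
// (names and layout illustrative, local declarations elided):
//
//   HALIDE_FUNCTION_ATTRS
//   const struct halide_filter_metadata_t *f_metadata() {
//       ...
//       static const halide_filter_argument_t args[1] = {
//           {"x", halide_argument_kind_input_scalar, 0,
//            {halide_type_int, 32, 1},
//            scalar_def__x, scalar_min__x, scalar_max__x,
//            scalar_estimate__x, buffer_estimates__x},
//       };
//       static const halide_filter_metadata_t md = {
//           halide_filter_metadata_t::VERSION, 1, args, "host", "f",
//       };
//       return &md;
//   }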

void CodeGen_C::emit_constexpr_function_info(const std::string &function_name,
                                             const std::vector<LoweredArgument> &args,
                                             const MetadataNameMap &metadata_name_map) {
    internal_assert(!extern_c_open)
        << "emit_constexpr_function_info() must not be called from inside an extern \"C\" block";

    if (!is_header()) {
        return;
    }

    auto map_name = [&metadata_name_map](const std::string &from) -> std::string {
        auto it = metadata_name_map.find(from);
        return it == metadata_name_map.end() ? from : it->second;
    };

    static const std::array<const char *, 3> kind_names = {
        "::HalideFunctionInfo::InputScalar",
        "::HalideFunctionInfo::InputBuffer",
        "::HalideFunctionInfo::OutputBuffer",
    };

    static const std::array<const char *, 5> type_code_names = {
        "halide_type_int",
        "halide_type_uint",
        "halide_type_float",
        "halide_type_handle",
        "halide_type_bfloat",
    };

    stream << constexpr_argument_info_docs;

    stream << "inline constexpr std::array<::HalideFunctionInfo::ArgumentInfo, " << args.size() << "> "
           << function_name << "_argument_info() {\n";

    indent += 1;

    stream << get_indent() << "return {{\n";
    indent += 1;
    for (const auto &arg : args) {
        internal_assert(arg.kind < kind_names.size());
        internal_assert(arg.type.code() < type_code_names.size());

        const auto name = map_name(arg.name);

        stream << get_indent() << "{\"" << name << "\", " << kind_names[arg.kind] << ", " << (int)arg.dimensions
               << ", halide_type_t{" << type_code_names[arg.type.code()] << ", " << (int)arg.type.bits()
               << ", " << (int)arg.type.lanes() << "}},\n";
    }
    indent -= 1;
    stream << get_indent() << "}};\n";
    indent -= 1;
    internal_assert(indent == 0);

    stream << "}\n";
}
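
// Illustrative output for a hypothetical pipeline `f` with an int32 scalar
// input `x` and a two-dimensional uint8 output `out`:
//
//   inline constexpr std::array<::HalideFunctionInfo::ArgumentInfo, 2> f_argument_info() {
//       return {{
//           {"x", ::HalideFunctionInfo::InputScalar, 0, halide_type_t{halide_type_int, 32, 1}},
//           {"out", ::HalideFunctionInfo::OutputBuffer, 2, halide_type_t{halide_type_uint, 8, 1}},
//       }};
//   }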

void CodeGen_C::compile(const Module &input) {
    TypeInfoGatherer type_info;
    for (const auto &f : input.functions()) {
        if (f.body.defined()) {
            f.body.accept(&type_info);
        }
    }
    uses_gpu_for_loops = (type_info.for_types_used.count(ForType::GPUBlock) ||
                          type_info.for_types_used.count(ForType::GPUThread) ||
                          type_info.for_types_used.count(ForType::GPULane));

    // Forward-declare all the types we need; this needs to happen before
    // we emit function prototypes, since those may need the types.
    if (output_kind != CPlusPlusFunctionInfoHeader) {
        stream << "\n";
        for (const auto &f : input.functions()) {
            for (const auto &arg : f.args) {
                forward_declare_type_if_needed(arg.type);
            }
        }
        stream << "\n";
    }

    if (!is_header_or_extern_decl()) {
        add_vector_typedefs(type_info.vector_types_used);

        // Emit prototypes for all external and internal-only functions.
        // Gather them up and do them all up front, to reduce duplicates,
        // and to make it simpler to get internal-linkage functions correct.
        ExternCallPrototypes e;
        for (const auto &f : input.functions()) {
            f.body.accept(&e);
            if (f.linkage == LinkageType::Internal) {
                // We can't tell at the call site whether a LoweredFunc is intended to be
                // internal or not, so mark such functions explicitly.
                e.set_internal_linkage(f.name);
            }
        }

        if (e.has_c_plus_plus_declarations()) {
            set_name_mangling_mode(NameMangling::CPlusPlus);
            e.emit_c_plus_plus_declarations(stream);
        }

        if (e.has_c_declarations()) {
            set_name_mangling_mode(NameMangling::C);
            e.emit_c_declarations(stream);
        }
    }

    for (const auto &b : input.buffers()) {
        compile(b);
    }
    const auto metadata_name_map = input.get_metadata_name_map();
    for (const auto &f : input.functions()) {
        compile(f, metadata_name_map);
    }
}

void CodeGen_C::compile(const LoweredFunc &f, const MetadataNameMap &metadata_name_map) {
    // Don't put non-external function declarations in headers.
    if (is_header_or_extern_decl() && f.linkage == LinkageType::Internal) {
        return;
    }

    const std::vector<LoweredArgument> &args = f.args;

    have_user_context = false;
    for (const auto &arg : args) {
        // TODO: check that its type is void *?
        have_user_context |= (arg.name == "__user_context");
    }

    NameMangling name_mangling = f.name_mangling;
    if (name_mangling == NameMangling::Default) {
        name_mangling = (target.has_feature(Target::CPlusPlusMangling) || output_kind == CPlusPlusFunctionInfoHeader ? NameMangling::CPlusPlus : NameMangling::C);
    }

    set_name_mangling_mode(name_mangling);

    std::vector<std::string> namespaces;
    std::string simple_name = c_print_name(extract_namespaces(f.name, namespaces), false);
    if (!is_c_plus_plus_interface()) {
        user_assert(namespaces.empty()) << "Namespace qualifiers not allowed on function name if not compiling with Target::CPlusPlusMangling.\n";
    }

    if (!namespaces.empty()) {
        for (const auto &ns : namespaces) {
            stream << "namespace " << ns << " {\n";
        }
        stream << "\n";
    }

    if (output_kind != CPlusPlusFunctionInfoHeader) {
        // Emit the function prototype
        if (f.linkage == LinkageType::Internal) {
            // If the function isn't public, mark it static.
            stream << "static ";
        }
        stream << "HALIDE_FUNCTION_ATTRS\n";
        stream << "int " << simple_name << "(";
        for (size_t i = 0; i < args.size(); i++) {
            if (args[i].is_buffer()) {
                stream << "struct halide_buffer_t *"
                       << print_name(args[i].name)
                       << "_buffer";
            } else {
                stream << print_type(args[i].type, AppendSpace)
                       << print_name(args[i].name);
            }

            if (i < args.size() - 1) {
                stream << ", ";
            }
        }

        if (is_header_or_extern_decl()) {
            stream << ");\n";
        } else {
            stream << ") ";
            open_scope();

            if (uses_gpu_for_loops) {
                stream << get_indent() << "halide_error("
                       << (have_user_context ? "const_cast<void *>(__user_context)" : "nullptr")
                       << ", \"C++ Backend does not support gpu_blocks() or gpu_threads() yet, "
                       << "this function will always fail at runtime\");\n";
                stream << get_indent() << "return halide_error_code_device_malloc_failed;\n";
            } else {
                // Emit a local user_context we can pass in all cases, either
                // aliasing __user_context or nullptr.
                stream << get_indent() << "void * const _ucon = "
                       << (have_user_context ? "const_cast<void *>(__user_context)" : "nullptr")
                       << ";\n";

                // Always declare it unused, since this could be a generated closure that doesn't
                // use _ucon at all, regardless of NoAsserts.
                stream << get_indent() << "halide_maybe_unused(_ucon);\n";

                // Emit the body
                print(f.body);

                // Return success.
                stream << get_indent() << "return 0;\n";
                cache.clear();
            }

            // Ensure we use open/close_scope, so that the cache doesn't try to linger
            // across function boundaries for internal closures.
            close_scope("");
        }

        if (f.linkage == LinkageType::ExternalPlusArgv || f.linkage == LinkageType::ExternalPlusMetadata) {
            // Emit the argv version
            emit_argv_wrapper(simple_name, args);
        }

        if (f.linkage == LinkageType::ExternalPlusMetadata) {
            // Emit the metadata.
            emit_metadata_getter(simple_name, args, metadata_name_map);
        }
    } else {
        if (f.linkage != LinkageType::Internal) {
            emit_constexpr_function_info(simple_name, args, metadata_name_map);
        }
    }

    if (!namespaces.empty()) {
        stream << "\n";
        for (size_t i = namespaces.size(); i > 0; i--) {
            stream << "}  // namespace " << namespaces[i - 1] << "\n";
        }
        stream << "\n";
    }
}

void CodeGen_C::compile(const Buffer<> &buffer) {
    // Don't define buffers in headers or extern decls.
    if (is_header_or_extern_decl()) {
        return;
    }

    string name = print_name(buffer.name());
    halide_buffer_t b = *(buffer.raw_buffer());

    user_assert(b.host) << "Can't embed image: " << buffer.name() << " because it has a null host pointer\n";
    user_assert(!b.device_dirty()) << "Can't embed image: " << buffer.name() << " because it has a dirty device pointer\n";

    // Figure out the offset of the last pixel.
    size_t num_elems = 1;
    for (int d = 0; d < b.dimensions; d++) {
        num_elems += b.dim[d].stride * (size_t)(b.dim[d].extent - 1);
    }

    // For now, we assume buffers that aren't scalar are constant,
    // while scalars can be mutated. This accommodates all our existing
    // use cases, in which every buffer is constant except those used
    // to store stateful module information in offloading runtimes.
    bool is_constant = buffer.dimensions() != 0;

    // If it is a GPU source kernel, we would like to see the actual output, not the
    // uint8 representation. We use a string literal for this.
    if (ends_with(name, "gpu_source_kernels")) {
        stream << "static const char *" << name << "_string = R\"BUFCHARSOURCE(";
        stream.write((char *)b.host, num_elems);
        stream << ")BUFCHARSOURCE\";\n";

        stream << "static const HALIDE_ATTRIBUTE_ALIGN(32) uint8_t *" << name << "_data = (const uint8_t *) "
               << name << "_string;\n";
    } else {
        // Emit the data
        stream << "static " << (is_constant ? "const" : "") << " HALIDE_ATTRIBUTE_ALIGN(32) uint8_t " << name << "_data[] = {\n";
        stream << get_indent();
        for (size_t i = 0; i < num_elems * b.type.bytes(); i++) {
            if (i > 0) {
                stream << ",";
                if (i % 16 == 0) {
                    stream << "\n";
                    stream << get_indent();
                } else {
                    stream << " ";
                }
            }
            stream << (int)(b.host[i]);
        }
        stream << "\n};\n";
    }

    std::string buffer_shape = "nullptr";
    if (buffer.dimensions()) {
        // Emit the shape -- note that we can't use this for scalar buffers because
        // we'd emit a statement of the form "foo_buffer_shape[] = {}", and a zero-length
        // array will make some compilers unhappy.
        stream << "static const halide_dimension_t " << name << "_buffer_shape[] = {";
        for (int i = 0; i < buffer.dimensions(); i++) {
            stream << "halide_dimension_t(" << buffer.dim(i).min()
                   << ", " << buffer.dim(i).extent()
                   << ", " << buffer.dim(i).stride() << ")";
            if (i < buffer.dimensions() - 1) {
                stream << ", ";
            }
        }
        stream << "};\n";
        buffer_shape = "const_cast<halide_dimension_t*>(" + name + "_buffer_shape)";
    }

    Type t = buffer.type();

    // Emit the buffer struct. Note that although our shape and (usually) our host
    // data is const, the buffer itself isn't: embedded buffers in one pipeline
    // can be passed to another pipeline (e.g. for an extern stage), in which
    // case the buffer objects need to be non-const, because the constness
    // (from the POV of the extern stage) is a runtime property.
    stream << "static halide_buffer_t " << name << "_buffer_ = {"
           << "0, "                                              // device
           << "nullptr, "                                        // device_interface
           << "const_cast<uint8_t*>(&" << name << "_data[0]), "  // host
           << "0, "                                              // flags
           << "halide_type_t((halide_type_code_t)(" << (int)t.code() << "), " << t.bits() << ", " << t.lanes() << "), "
           << buffer.dimensions() << ", "
           << buffer_shape << "};\n";

    // Make a global pointer to it.
    stream << "static halide_buffer_t * const " << name << "_buffer = &" << name << "_buffer_;\n";
}
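
// The net effect, for a hypothetical 1-D, 4-element uint8 buffer named "lut",
// is four globals along these lines (data values illustrative):
//
//   static const HALIDE_ATTRIBUTE_ALIGN(32) uint8_t _lut_data[] = {0, 1, 2, 3};
//   static const halide_dimension_t _lut_buffer_shape[] = {halide_dimension_t(0, 4, 1)};
//   static halide_buffer_t _lut_buffer_ = {0, nullptr,
//       const_cast<uint8_t*>(&_lut_data[0]), 0,
//       halide_type_t((halide_type_code_t)(1), 8, 1), 1,
//       const_cast<halide_dimension_t*>(_lut_buffer_shape)};
//   static halide_buffer_t * const _lut_buffer = &_lut_buffer_;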

string CodeGen_C::print_expr(const Expr &e) {
    id = "$$ BAD ID $$";
    e.accept(this);
    return id;
}

string CodeGen_C::print_cast_expr(const Type &t, const Expr &e) {
    string value = print_expr(e);
    string type = print_type(t);
    if (t.is_vector() &&
        t.lanes() == e.type().lanes() &&
        t != e.type()) {
        return print_assignment(t, type + "_ops::convert_from<" + print_type(e.type()) + ">(" + value + ")");
    } else {
        return print_assignment(t, "(" + type + ")(" + value + ")");
    }
}

void CodeGen_C::print_stmt(const Stmt &s) {
    s.accept(this);
}

string CodeGen_C::print_assignment(Type t, const std::string &rhs) {
    auto cached = cache.find(rhs);
    if (cached == cache.end()) {
        id = unique_name('_');
        const char *const_flag = output_kind == CPlusPlusImplementation ? " const " : "";
        if (t.is_handle()) {
            // Don't print void *, which might lose useful type information. Just use auto.
            stream << get_indent() << "auto *";
        } else {
            stream << get_indent() << print_type(t, AppendSpace);
        }
        stream << const_flag << id << " = " << rhs << ";\n";
        cache[rhs] = id;
    } else {
        id = cached->second;
    }
    return id;
}
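
// Example of the caching above: printing the rhs "_a + _b" twice at Int(32)
// emits, for plain-C output (identifier names are whatever unique_name('_')
// yields; "_0" is illustrative),
//
//   int32_t _0 = _a + _b;
//
// on the first call, while the second call emits nothing and simply returns
// the cached id "_0". C++ implementations additionally qualify it as const.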

void CodeGen_C::open_scope() {
    cache.clear();
    stream << get_indent();
    indent++;
    stream << "{\n";
}

void CodeGen_C::close_scope(const std::string &comment) {
    cache.clear();
    indent--;
    stream << get_indent();
    if (!comment.empty()) {
        stream << "} // " << comment << "\n";
    } else {
        stream << "}\n";
    }
}

void CodeGen_C::visit(const Variable *op) {
    if (starts_with(op->name, "::")) {
        // This is the name of a global, so we can't modify it.
        id = op->name;
    } else {
        // This substitution ensures const correctness for all calls
        if (op->name == "__user_context") {
            id = "_ucon";
        } else {
            id = print_name(op->name);
        }
    }
}

void CodeGen_C::visit(const Cast *op) {
    id = print_cast_expr(op->type, op->value);
}

void CodeGen_C::visit(const Reinterpret *op) {
    id = print_assignment(op->type, print_reinterpret(op->type, op->value));
}

void CodeGen_C::visit_binop(Type t, const Expr &a, const Expr &b, const char *op) {
    string sa = print_expr(a);
    string sb = print_expr(b);
    print_assignment(t, sa + " " + op + " " + sb);
}

void CodeGen_C::visit(const Add *op) {
    visit_binop(op->type, op->a, op->b, "+");
}

void CodeGen_C::visit(const Sub *op) {
    visit_binop(op->type, op->a, op->b, "-");
}

void CodeGen_C::visit(const Mul *op) {
    visit_binop(op->type, op->a, op->b, "*");
}

void CodeGen_C::visit(const Div *op) {
    int bits;
    if (is_const_power_of_two_integer(op->b, &bits)) {
        visit_binop(op->type, op->a, make_const(op->a.type(), bits), ">>");
    } else if (op->type.is_int()) {
        print_expr(lower_euclidean_div(op->a, op->b));
    } else {
        visit_binop(op->type, op->a, op->b, "/");
    }
}

void CodeGen_C::visit(const Mod *op) {
    int bits;
    if (is_const_power_of_two_integer(op->b, &bits)) {
        visit_binop(op->type, op->a, make_const(op->a.type(), (1 << bits) - 1), "&");
    } else if (op->type.is_int()) {
        print_expr(lower_euclidean_mod(op->a, op->b));
    } else if (op->type.is_float()) {
        string arg0 = print_expr(op->a);
        string arg1 = print_expr(op->b);
        ostringstream rhs;
        rhs << "fmod(" << arg0 << ", " << arg1 << ")";
        print_assignment(op->type, rhs.str());
    } else {
        visit_binop(op->type, op->a, op->b, "%");
    }
}

void CodeGen_C::visit(const Max *op) {
    // clang doesn't support the ternary operator on OpenCL style vectors.
    // See: https://bugs.llvm.org/show_bug.cgi?id=33103
    if (op->type.is_scalar()) {
        print_expr(Call::make(op->type, "::halide_cpp_max", {op->a, op->b}, Call::Extern));
    } else {
        ostringstream rhs;
        rhs << print_type(op->type) << "_ops::max(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
        print_assignment(op->type, rhs.str());
    }
}

void CodeGen_C::visit(const Min *op) {
    // clang doesn't support the ternary operator on OpenCL style vectors.
    // See: https://bugs.llvm.org/show_bug.cgi?id=33103
    if (op->type.is_scalar()) {
        print_expr(Call::make(op->type, "::halide_cpp_min", {op->a, op->b}, Call::Extern));
    } else {
        ostringstream rhs;
        rhs << print_type(op->type) << "_ops::min(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
        print_assignment(op->type, rhs.str());
    }
}

void CodeGen_C::visit_relop(Type t, const Expr &a, const Expr &b, const char *scalar_op, const char *vector_op) {
    if (t.is_scalar() || !using_vector_typedefs) {
        visit_binop(t, a, b, scalar_op);
    } else {
        internal_assert(a.type() == b.type());
        string sa = print_expr(a);
        string sb = print_expr(b);
        print_assignment(t, print_type(a.type()) + "_ops::" + vector_op + "(" + sa + ", " + sb + ")");
    }
}

void CodeGen_C::visit(const EQ *op) {
    visit_relop(op->type, op->a, op->b, "==", "eq");
}

void CodeGen_C::visit(const NE *op) {
    visit_relop(op->type, op->a, op->b, "!=", "ne");
}

void CodeGen_C::visit(const LT *op) {
    visit_relop(op->type, op->a, op->b, "<", "lt");
}

void CodeGen_C::visit(const LE *op) {
    visit_relop(op->type, op->a, op->b, "<=", "le");
}

void CodeGen_C::visit(const GT *op) {
    visit_relop(op->type, op->a, op->b, ">", "gt");
}

void CodeGen_C::visit(const GE *op) {
    visit_relop(op->type, op->a, op->b, ">=", "ge");
}

void CodeGen_C::visit(const Or *op) {
    visit_relop(op->type, op->a, op->b, "||", "logical_or");
}

void CodeGen_C::visit(const And *op) {
    visit_relop(op->type, op->a, op->b, "&&", "logical_and");
}

void CodeGen_C::visit(const Not *op) {
    print_assignment(op->type, "!(" + print_expr(op->a) + ")");
}

void CodeGen_C::visit(const IntImm *op) {
    if (op->type == Int(32)) {
        id = std::to_string(op->value);
    } else {
        static const char *const suffixes[3] = {
            "ll",  // PlainC
            "l",   // OpenCL
            "",    // HLSL
        };
        print_assignment(op->type, "(" + print_type(op->type) + ")(" + std::to_string(op->value) + suffixes[(int)integer_suffix_style] + ")");
    }
}

void CodeGen_C::visit(const UIntImm *op) {
    static const char *const suffixes[3] = {
        "ull",  // PlainC
        "ul",   // OpenCL
        "",     // HLSL
    };
    print_assignment(op->type, "(" + print_type(op->type) + ")(" + std::to_string(op->value) + suffixes[(int)integer_suffix_style] + ")");
}
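
// For instance, a UInt(16) immediate 65535 would be printed (in plain-C
// suffix style, with an illustrative id) as:
//
//   uint16_t _0 = (uint16_t)(65535ull);
//
// The OpenCL style uses the "ul" suffix instead, and HLSL uses none.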

void CodeGen_C::visit(const StringImm *op) {
    ostringstream oss;
    oss << Expr(op);
    id = oss.str();
}

// NaN is the only float/double for which this is true... and
// surprisingly, there doesn't seem to be a portable isnan function
// (dsharlet).
template<typename T>
static bool isnan(T x) {
    return x != x;
}

template<typename T>
static bool isinf(T x) {
    return std::numeric_limits<T>::has_infinity && (x == std::numeric_limits<T>::infinity() ||
                                                    x == -std::numeric_limits<T>::infinity());
}

void CodeGen_C::visit(const FloatImm *op) {
    if (isnan(op->value)) {
        id = "nan_f32()";
    } else if (isinf(op->value)) {
        if (op->value > 0) {
            id = "inf_f32()";
        } else {
            id = "neg_inf_f32()";
        }
    } else {
        // Write the constant as reinterpreted uint to avoid any bits lost in conversion.
        union {
            uint32_t as_uint;
            float as_float;
        } u;
        u.as_float = op->value;

        ostringstream oss;
        if (op->type.bits() == 64) {
            oss << "(double) ";
        }
        oss << "float_from_bits(" << u.as_uint << " /* " << u.as_float << " */)";
        print_assignment(op->type, oss.str());
    }
}
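
// For example, the float constant 3.14159265f has the bit pattern 1078530011
// (0x40490fdb), so it is emitted along the lines of
//
//   float _0 = float_from_bits(1078530011 /* 3.14159 */);
//
// reproducing the exact bits rather than trusting decimal round-tripping.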

void CodeGen_C::visit(const Call *op) {

    internal_assert(op->is_extern() || op->is_intrinsic())
        << "Can only codegen extern calls and intrinsics\n";

    ostringstream rhs;

    // Handle intrinsics first
    if (op->is_intrinsic(Call::debug_to_file)) {
        internal_assert(op->args.size() == 3);
        const StringImm *string_imm = op->args[0].as<StringImm>();
        internal_assert(string_imm);
        string filename = string_imm->value;
        string typecode = print_expr(op->args[1]);
        string buffer = print_name(print_expr(op->args[2]));

        rhs << "halide_debug_to_file(_ucon, "
            << "\"" << filename << "\", "
            << typecode
            << ", (struct halide_buffer_t *)" << buffer << ")";
    } else if (op->is_intrinsic(Call::bitwise_and)) {
        internal_assert(op->args.size() == 2);
        string a0 = print_expr(op->args[0]);
        string a1 = print_expr(op->args[1]);
        rhs << a0 << " & " << a1;
    } else if (op->is_intrinsic(Call::bitwise_xor)) {
        internal_assert(op->args.size() == 2);
        string a0 = print_expr(op->args[0]);
        string a1 = print_expr(op->args[1]);
        rhs << a0 << " ^ " << a1;
    } else if (op->is_intrinsic(Call::bitwise_or)) {
        internal_assert(op->args.size() == 2);
        string a0 = print_expr(op->args[0]);
        string a1 = print_expr(op->args[1]);
        rhs << a0 << " | " << a1;
    } else if (op->is_intrinsic(Call::bitwise_not)) {
        internal_assert(op->args.size() == 1);
        rhs << "~" << print_expr(op->args[0]);
    } else if (op->is_intrinsic(Call::shift_left)) {
        internal_assert(op->args.size() == 2);
        if (op->args[1].type().is_uint()) {
            string a0 = print_expr(op->args[0]);
            string a1 = print_expr(op->args[1]);
            rhs << a0 << " << " << a1;
        } else {
            rhs << print_expr(lower_signed_shift_left(op->args[0], op->args[1]));
        }
    } else if (op->is_intrinsic(Call::shift_right)) {
        internal_assert(op->args.size() == 2);
        if (op->args[1].type().is_uint()) {
            string a0 = print_expr(op->args[0]);
            string a1 = print_expr(op->args[1]);
            rhs << a0 << " >> " << a1;
        } else {
            rhs << print_expr(lower_signed_shift_right(op->args[0], op->args[1]));
        }
    } else if (op->is_intrinsic(Call::count_leading_zeros) ||
               op->is_intrinsic(Call::count_trailing_zeros) ||
               op->is_intrinsic(Call::popcount)) {
        internal_assert(op->args.size() == 1);
        if (op->args[0].type().is_vector()) {
            rhs << print_scalarized_expr(op);
        } else {
            string a0 = print_expr(op->args[0]);
            rhs << "halide_" << op->name << "(" << a0 << ")";
        }
    } else if (op->is_intrinsic(Call::lerp)) {
        internal_assert(op->args.size() == 3);
        Expr e = lower_lerp(op->type, op->args[0], op->args[1], op->args[2], target);
        rhs << print_expr(e);
    } else if (op->is_intrinsic(Call::absd)) {
        internal_assert(op->args.size() == 2);
        Expr a = op->args[0];
        Expr b = op->args[1];
        Expr e = cast(op->type, select(a < b, b - a, a - b));
        rhs << print_expr(e);
    } else if (op->is_intrinsic(Call::return_second)) {
        internal_assert(op->args.size() == 2);
        string arg0 = print_expr(op->args[0]);
        string arg1 = print_expr(op->args[1]);
        rhs << "return_second(" << arg0 << ", " << arg1 << ")";
    } else if (op->is_intrinsic(Call::if_then_else)) {
        internal_assert(op->args.size() == 2 || op->args.size() == 3);

        string result_id = unique_name('_');

        stream << get_indent() << print_type(op->args[1].type(), AppendSpace)
               << result_id << ";\n";

        string cond_id = print_expr(op->args[0]);

        stream << get_indent() << "if (" << cond_id << ")\n";
        open_scope();
        string true_case = print_expr(op->args[1]);
        stream << get_indent() << result_id << " = " << true_case << ";\n";
        close_scope("if " + cond_id);
        if (op->args.size() == 3) {
            stream << get_indent() << "else\n";
            open_scope();
            string false_case = print_expr(op->args[2]);
            stream << get_indent() << result_id << " = " << false_case << ";\n";
            close_scope("if " + cond_id + " else");
        }
        rhs << result_id;
    } else if (op->is_intrinsic(Call::require)) {
        internal_assert(op->args.size() == 3);
        if (op->args[0].type().is_vector()) {
            rhs << print_scalarized_expr(op);
        } else {
            create_assertion(op->args[0], op->args[2]);
            rhs << print_expr(op->args[1]);
        }
    } else if (op->is_intrinsic(Call::round)) {
        // There's no way to get rounding with ties to nearest even that works
        // in all contexts where someone might be compiling generated C++ code,
        // so we just lower it into primitive operations.
        rhs << print_expr(lower_round_to_nearest_ties_to_even(op->args[0]));
    } else if (op->is_intrinsic(Call::abs)) {
        internal_assert(op->args.size() == 1);
        Expr a0 = op->args[0];
        rhs << print_expr(cast(op->type, select(a0 > 0, a0, -a0)));
    } else if (op->is_intrinsic(Call::memoize_expr)) {
        internal_assert(!op->args.empty());
        string arg = print_expr(op->args[0]);
        rhs << "(" << arg << ")";
    } else if (op->is_intrinsic(Call::alloca)) {
        internal_assert(op->args.size() == 1);
        internal_assert(op->type.is_handle());
        const int64_t *sz = as_const_int(op->args[0]);
        if (op->type == type_of<struct halide_buffer_t *>() &&
            Call::as_intrinsic(op->args[0], {Call::size_of_halide_buffer_t})) {
            stream << get_indent();
            string buf_name = unique_name('b');
            stream << "halide_buffer_t " << buf_name << ";\n";
            rhs << "&" << buf_name;
        } else if (op->type == type_of<struct halide_semaphore_t *>() &&
                   sz && *sz == 16) {
            stream << get_indent();
            string semaphore_name = unique_name("sema");
            stream << "halide_semaphore_t " << semaphore_name << ";\n";
            rhs << "&" << semaphore_name;
        } else {
            // Make a stack array of uint64_ts
            string size = print_expr(simplify((op->args[0] + 7) / 8));
            stream << get_indent();
            string array_name = unique_name('a');
            stream << "uint64_t " << array_name << "[" << size << "];\n";
            rhs << "(" << print_type(op->type) << ")(&" << array_name << ")";
        }
    } else if (op->is_intrinsic(Call::make_struct)) {
        if (op->args.empty()) {
            internal_assert(op->type.handle_type);
            // Add an explicit cast so that different structs can't cache to the same value
            rhs << "(" << print_type(op->type) << ")(NULL)";
        } else if (op->type == type_of<halide_dimension_t *>()) {
            // Emit a shape

            // Get the args
            vector<string> values;
            for (const auto &arg : op->args) {
                values.push_back(print_expr(arg));
            }

            static_assert(sizeof(halide_dimension_t) == 4 * sizeof(int32_t),
                          "CodeGen_C assumes a halide_dimension_t is four densely-packed int32_ts");

            internal_assert(values.size() % 4 == 0);
            int dimension = values.size() / 4;

            string shape_name = unique_name('s');
            stream
                << get_indent() << "struct halide_dimension_t " << shape_name
                << "[" << dimension << "] = {\n";
            indent++;
            for (int i = 0; i < dimension; i++) {
                stream
                    << get_indent() << "{"
                    << values[i * 4 + 0] << ", "
                    << values[i * 4 + 1] << ", "
                    << values[i * 4 + 2] << ", "
                    << values[i * 4 + 3] << "},\n";
            }
            indent--;
            stream << get_indent() << "};\n";

            rhs << shape_name;
        } else {
            // Emit a declaration like:
            // struct {int f_0; int f_1; char f_2;} foo = {3, 4, 'c'};

            // Get the args
            vector<string> values;
            for (const auto &arg : op->args) {
                values.push_back(print_expr(arg));
            }
            stream << get_indent() << "struct {\n";
            // List the types.
            indent++;
            for (size_t i = 0; i < op->args.size(); i++) {
                stream << get_indent() << print_type(op->args[i].type()) << " f_" << i << ";\n";
            }
            indent--;
            string struct_name = unique_name('s');
            stream << get_indent() << "} " << struct_name << " = {\n";
            // List the values.
            indent++;
            for (size_t i = 0; i < op->args.size(); i++) {
                stream << get_indent() << values[i];
                if (i < op->args.size() - 1) {
                    stream << ",";
                }
                stream << "\n";
            }
            indent--;
            stream << get_indent() << "};\n";

            // Return a pointer to it of the appropriate type

            // TODO: This is dubious type-punning. We really need to
            // find a better way to do this. We dodge the problem for
            // the specific case of buffer shapes in the case above.
            if (op->type.handle_type) {
                rhs << "(" << print_type(op->type) << ")";
            }
            rhs << "(&" << struct_name << ")";
        }
    } else if (op->is_intrinsic(Call::load_typed_struct_member)) {
        // Given a void * instance of a typed struct, an in-scope prototype
        // struct of the same type, and the index of a slot, load the value of
        // that slot.
        //
        // It is assumed that the slot index is valid for the given typed struct.
        //
        // TODO: this comment is replicated in CodeGen_LLVM and should be updated there too.
        // TODO: https://github.com/halide/Halide/issues/6468

        internal_assert(op->args.size() == 3);
        std::string struct_instance = print_expr(op->args[0]);
        std::string struct_prototype = print_expr(op->args[1]);
        const int64_t *index = as_const_int(op->args[2]);
        internal_assert(index != nullptr);
        rhs << "((decltype(" << struct_prototype << "))"
            << struct_instance << ")->f_" << *index;
    } else if (op->is_intrinsic(Call::get_user_context)) {
        internal_assert(op->args.empty());
        rhs << "_ucon";
    } else if (op->is_intrinsic(Call::stringify)) {
        // Rewrite to an snprintf
        vector<string> printf_args;
        string format_string = "";
        for (size_t i = 0; i < op->args.size(); i++) {
            Type t = op->args[i].type();
            if (t == type_of<halide_buffer_t *>()) {
                string buf_name = unique_name('b');
                printf_args.push_back(buf_name);
                // As in CodeGen_LLVM, we use 512 as a guesstimate for halide_buffer_t space:
                // not a strict upper bound (there isn't one), but it ought to be enough for most buffers.
                constexpr int buf_size = 512;
                stream << get_indent() << "char " << buf_name << "[" << buf_size << "];\n";
                stream << get_indent() << "halide_buffer_to_string(" << buf_name << ", " << buf_name << " + " << buf_size << ", " << print_expr(op->args[i]) << ");\n";
                format_string += "%s";
            } else {
                printf_args.push_back(print_expr(op->args[i]));
                if (t.is_int()) {
                    format_string += "%lld";
                    printf_args[i] = "(long long)(" + printf_args[i] + ")";
                } else if (t.is_uint()) {
                    format_string += "%llu";
                    printf_args[i] = "(long long unsigned)(" + printf_args[i] + ")";
                } else if (t.is_float()) {
                    if (t.bits() == 32) {
                        format_string += "%f";
                    } else {
                        format_string += "%e";
                    }
                } else if (op->args[i].as<StringImm>()) {
                    format_string += "%s";
                } else {
                    internal_assert(t.is_handle());
                    format_string += "%p";
                }
            }
        }
        string buf_name = unique_name('b');
        stream << get_indent() << "char " << buf_name << "[1024];\n";
        stream << get_indent() << "snprintf(" << buf_name << ", 1024, \"" << format_string << "\", " << with_commas(printf_args) << ");\n";
        rhs << buf_name;
    } else if (op->is_intrinsic(Call::register_destructor)) {
        internal_assert(op->args.size() == 2);
        const StringImm *fn = op->args[0].as<StringImm>();
        internal_assert(fn);
        string arg = print_expr(op->args[1]);

        stream << get_indent();
        // Make a struct on the stack that calls the given function as a destructor
        string struct_name = unique_name('s');
        string instance_name = unique_name('d');
        stream << "struct " << struct_name << " { "
               << "void * const ucon; "
               << "void * const arg; "
               << "" << struct_name << "(void *ucon, void *a) : ucon(ucon), arg(a) {} "
               << "~" << struct_name << "() { " << fn->value + "(ucon, arg); } "
               << "} " << instance_name << "(_ucon, " << arg << ");\n";
        rhs << "(void *)nullptr";
    } else if (op->is_intrinsic(Call::div_round_to_zero)) {
        rhs << print_expr(op->args[0]) << " / " << print_expr(op->args[1]);
    } else if (op->is_intrinsic(Call::mod_round_to_zero)) {
        rhs << print_expr(op->args[0]) << " % " << print_expr(op->args[1]);
    } else if (op->is_intrinsic(Call::mux)) {
        rhs << print_expr(lower_mux(op));
    } else if (op->is_intrinsic(Call::signed_integer_overflow)) {
        user_error << "Signed integer overflow occurred during constant-folding. Signed"
                      " integer overflow for int32 and int64 is undefined behavior in"
                      " Halide.\n";
    } else if (op->is_intrinsic(Call::undef)) {
        user_error << "undef not eliminated before code generation. Please report this as a Halide bug.\n";
    } else if (op->is_intrinsic(Call::prefetch)) {
        user_assert((op->args.size() == 4) && is_const_one(op->args[2]))
            << "Only prefetch of 1 cache line is supported in C backend.\n";

        const Expr &base_address = op->args[0];
        const Expr &base_offset = op->args[1];
        // const Expr &extent0 = op->args[2];  // unused
        // const Expr &stride0 = op->args[3];  // unused

        const Variable *base = base_address.as<Variable>();
        internal_assert(base && base->type.is_handle());
        // TODO: provide some way to customize the rw and locality?
        rhs << "__builtin_prefetch("
            << "((" << print_type(op->type) << " *)" << print_name(base->name)
            << " + " << print_expr(base_offset) << "), /*rw*/0, /*locality*/0)";
    } else if (op->is_intrinsic(Call::size_of_halide_buffer_t)) {
        rhs << "(sizeof(halide_buffer_t))";
    } else if (op->is_intrinsic(Call::strict_float)) {
        internal_assert(op->args.size() == 1);
        string arg0 = print_expr(op->args[0]);
        rhs << "(" << arg0 << ")";
    } else if (op->is_intrinsic()) {
        Expr lowered = lower_intrinsic(op);
        if (lowered.defined()) {
            rhs << print_expr(lowered);
        } else {
            // TODO: other intrinsics
            internal_error << "Unhandled intrinsic in C backend: " << op->name << "\n";
        }
    } else {
        // Generic extern calls
        rhs << print_extern_call(op);
    }

    // Special-case halide_print, which has IR that returns int, but really returns void.
    // The clean thing to do would be to change the definition of halide_print() to return
    // an ignored int, but as halide_print() has many overrides downstream (and in third-party
    // consumers), this is arguably a simpler fix for allowing halide_print() to work in the C++ backend.
    if (op->name == "halide_print") {
        stream << get_indent() << rhs.str() << ";\n";
        // Make an innocuous assignment value for our caller (probably an Evaluate node) to ignore.
        print_assignment(op->type, "0");
    } else {
        print_assignment(op->type, rhs.str());
    }
}
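
// As one concrete illustration of the if_then_else lowering above, a call
// if_then_else(c, t, f) at Int(32) expands to statements shaped like
// (ids illustrative, inner computations elided):
//
//   int32_t _2;
//   if (_0)
//   {
//    _2 = _1;
//   } // if _0
//   else
//   {
//    _2 = _3;
//   } // if _0 else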

string CodeGen_C::print_scalarized_expr(const Expr &e) {
    Type t = e.type();
    internal_assert(t.is_vector());
    string v = unique_name('_');
    // All of the lanes of this vector will get replaced, so in theory
    // we don't need to initialize it to anything, but if we don't,
    // we'll get "possible uninitialized var" warnings. Since this code
    // is already hopelessly inefficient at this point, let's just init
    // it with a broadcast(0) to avoid any possible weirdness.
    stream << get_indent() << print_type(t, AppendSpace) << v << " = " << print_type(t) + "_ops::broadcast(0);\n";
    for (int lane = 0; lane < t.lanes(); lane++) {
        Expr e2 = extract_lane(e, lane);
        string elem = print_expr(e2);
        ostringstream rhs;
        rhs << print_type(t) + "_ops::replace(" << v << ", " << lane << ", " << elem << ")";
        v = print_assignment(t, rhs.str());
    }
    return v;
}

string CodeGen_C::print_extern_call(const Call *op) {
    if (op->type.is_vector()) {
        // Need to split into multiple scalar calls.
        return print_scalarized_expr(op);
    }
    ostringstream rhs;
    vector<string> args(op->args.size());
    for (size_t i = 0; i < op->args.size(); i++) {
        args[i] = print_expr(op->args[i]);
        // This substitution ensures const correctness for all calls
        if (args[i] == "__user_context") {
            args[i] = "_ucon";
        }
    }
    if (function_takes_user_context(op->name)) {
        args.insert(args.begin(), "_ucon");
    }
    rhs << op->name << "(" << with_commas(args) << ")";
    return rhs.str();
}
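
// For instance, a scalar call to a context-taking runtime function prints as
//
//   halide_do_par_for(_ucon, _0, _1, _2)
//
// (argument ids illustrative), while a plain extern like sqrt_f32 prints as
// sqrt_f32(_3) with no user-context argument prepended.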

void CodeGen_C::visit(const Load *op) {
    // TODO: We could replicate the logic in the llvm codegen which decides whether
    // the vector access can be aligned. Doing so would also require introducing
    // aligned type equivalents for all the vector types.
    ostringstream rhs;

    Type t = op->type;
    string name = print_name(op->name);

    // If we're loading a contiguous ramp into a vector, just load the vector
    Expr dense_ramp_base = strided_ramp_base(op->index, 1);
    if (dense_ramp_base.defined() && is_const_one(op->predicate)) {
        internal_assert(t.is_vector());
        string id_ramp_base = print_expr(dense_ramp_base);
        rhs << print_type(t) + "_ops::load(" << name << ", " << id_ramp_base << ")";
    } else if (op->index.type().is_vector()) {
        // If index is a vector, gather vector elements.
        internal_assert(t.is_vector());
        string id_index = print_expr(op->index);
        if (is_const_one(op->predicate)) {
            rhs << print_type(t) + "_ops::load_gather(" << name << ", " << id_index << ")";
        } else {
            string id_predicate = print_expr(op->predicate);
            rhs << print_type(t) + "_ops::load_predicated(" << name << ", " << id_index << ", " << id_predicate << ")";
        }
    } else {
        user_assert(is_const_one(op->predicate)) << "Predicated scalar load is not supported by C backend.\n";

        string id_index = print_expr(op->index);
        bool type_cast_needed = !(allocations.contains(op->name) &&
                                  allocations.get(op->name).type.element_of() == t.element_of());
        if (type_cast_needed) {
            const char *const_flag = output_kind == CPlusPlusImplementation ? " const" : "";
            rhs << "((" << print_type(t.element_of()) << const_flag << " *)" << name << ")";
        } else {
            rhs << name;
        }
        rhs << "[" << id_index << "]";
    }
    print_assignment(t, rhs.str());
}
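
// Sketch of the three load forms above, assuming the lanes-suffixed vector
// typedef naming and illustrative ids: a dense 8-wide uint8 load prints as
//
//   uint8x8_t _1 = uint8x8_t_ops::load(_in, _0);
//
// a gather as uint8x8_t_ops::load_gather(_in, _2), and a scalar load as a
// plain (possibly type-cast) array subscript, _in[_3].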

void CodeGen_C::visit(const Store *op) {
    Type t = op->value.type();

    if (inside_atomic_mutex_node) {
        user_assert(t.is_scalar())
            << "The vectorized atomic operation for the store" << op->name
            << " is lowered into a mutex lock, which does not support vectorization.\n";
    }

    // Issue atomic store if we are in the designated producer.
    if (emit_atomic_stores) {
        stream << "#if defined(_OPENMP)\n";
        stream << "#pragma omp atomic\n";
        stream << "#else\n";
        stream << "#error \"Atomic stores in the C backend are only supported in compilers that support OpenMP.\"\n";
        stream << "#endif\n";
    }

    string id_value = print_expr(op->value);
    string name = print_name(op->name);

    // TODO: We could replicate the logic in the llvm codegen which decides whether
    // the vector access can be aligned. Doing so would also require introducing
    // aligned type equivalents for all the vector types.

    // If we're writing a contiguous ramp, just store the vector.
    Expr dense_ramp_base = strided_ramp_base(op->index, 1);
    if (dense_ramp_base.defined() && is_const_one(op->predicate)) {
        internal_assert(op->value.type().is_vector());
        string id_ramp_base = print_expr(dense_ramp_base);
        stream << get_indent() << print_type(t) + "_ops::store(" << id_value << ", " << name << ", " << id_ramp_base << ");\n";
    } else if (op->index.type().is_vector()) {
        // If index is a vector, scatter vector elements.
        internal_assert(t.is_vector());
        string id_index = print_expr(op->index);
        if (is_const_one(op->predicate)) {
            stream << get_indent() << print_type(t) + "_ops::store_scatter(" << id_value << ", " << name << ", " << id_index << ");\n";
        } else {
            string id_predicate = print_expr(op->predicate);
            stream << get_indent() << print_type(t) + "_ops::store_predicated(" << id_value << ", " << name << ", " << id_index << ", " << id_predicate << ");\n";
        }
    } else {
        user_assert(is_const_one(op->predicate)) << "Predicated scalar store is not supported by C backend.\n";

        bool type_cast_needed =
            t.is_handle() ||
            !allocations.contains(op->name) ||
            allocations.get(op->name).type != t;

        string id_index = print_expr(op->index);
        stream << get_indent();
        if (type_cast_needed) {
            stream << "((" << print_type(t) << " *)" << name << ")";
        } else {
            stream << name;
        }
        stream << "[" << id_index << "] = " << id_value << ";\n";
    }
    cache.clear();
}

void CodeGen_C::visit(const Let *op) {
    string id_value = print_expr(op->value);
    Expr body = op->body;
    if (op->value.type().is_handle() && op->name != "__user_context") {
        // The body might contain a Load that references this directly
        // by name, so we can't rewrite the name.
        std::string name = print_name(op->name);
        stream << get_indent() << "auto "
               << name << " = " << id_value << ";\n";
        stream << get_indent() << "halide_maybe_unused(" << name << ");\n";
    } else {
        Expr new_var = Variable::make(op->value.type(), id_value);
        body = substitute(op->name, new_var, body);
    }
    print_expr(body);
}

void CodeGen_C::visit(const Select *op) {
    ostringstream rhs;
    string type = print_type(op->type);
    string true_val = print_expr(op->true_value);
    string false_val = print_expr(op->false_value);
    string cond = print_expr(op->condition);

    // clang doesn't support the ternary operator on OpenCL style vectors.
    // See: https://bugs.llvm.org/show_bug.cgi?id=33103
    if (op->condition.type().is_scalar()) {
        rhs << "(" << type << ")"
            << "(" << cond
            << " ? " << true_val
            << " : " << false_val
            << ")";
    } else {
        rhs << type << "_ops::select(" << cond << ", " << true_val << ", " << false_val << ")";
    }
    print_assignment(op->type, rhs.str());
}

Expr CodeGen_C::scalarize_vector_reduce(const VectorReduce *op) {
    Expr (*binop)(Expr, Expr) = nullptr;
    switch (op->op) {
    case VectorReduce::Add:
        binop = Add::make;
        break;
    case VectorReduce::Mul:
        binop = Mul::make;
        break;
    case VectorReduce::Min:
        binop = Min::make;
        break;
    case VectorReduce::Max:
        binop = Max::make;
        break;
    case VectorReduce::And:
        binop = And::make;
        break;
    case VectorReduce::Or:
        binop = Or::make;
        break;
    case VectorReduce::SaturatingAdd:
        binop = saturating_add;
        break;
    }

    std::vector<Expr> lanes;
    int outer_lanes = op->type.lanes();
    int inner_lanes = op->value.type().lanes() / outer_lanes;
    for (int outer = 0; outer < outer_lanes; outer++) {
        Expr reduction = extract_lane(op->value, outer * inner_lanes);
        for (int inner = 1; inner < inner_lanes; inner++) {
            reduction = binop(reduction, extract_lane(op->value, outer * inner_lanes + inner));
        }
        lanes.push_back(reduction);
    }

    // No need to concat if there is only a single value.
    if (lanes.size() == 1) {
        return lanes[0];
    }

    return Shuffle::make_concat(lanes);
}
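
// For example, reducing an 8-lane vector value v down to 2 lanes with
// VectorReduce::Add yields, per output lane, a chain of scalar adds over the
// four corresponding input lanes:
//
//   lane 0: v[0] + v[1] + v[2] + v[3]
//   lane 1: v[4] + v[5] + v[6] + v[7]
//
// and the per-lane results are concatenated back into a 2-lane vector.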

void CodeGen_C::visit(const VectorReduce *op) {
    stream << get_indent() << "// Vector reduce: " << op->op << "\n";

    Expr scalarized = scalarize_vector_reduce(op);
    if (scalarized.type().is_scalar()) {
        print_assignment(op->type, print_expr(scalarized));
    } else {
        print_assignment(op->type, print_scalarized_expr(scalarized));
    }
}

void CodeGen_C::visit(const LetStmt *op) {
    string id_value = print_expr(op->value);
    Stmt body = op->body;

    if (op->value.type().is_handle() && op->name != "__user_context") {
        // The body might contain a Load or Store that references this
        // directly by name, so we can't rewrite the name.
        std::string name = print_name(op->name);
        stream << get_indent() << "auto "
               << name << " = " << id_value << ";\n";
        stream << get_indent() << "halide_maybe_unused(" << name << ");\n";
    } else {
        Expr new_var = Variable::make(op->value.type(), id_value);
        body = substitute(op->name, new_var, body);
    }
    body.accept(this);
}

// Halide asserts have different semantics to C asserts.  They're
// supposed to clean up and make the containing function return
// -1, so we can't use the C version of assert. Instead we convert
// to an if statement.
void CodeGen_C::create_assertion(const string &id_cond, const Expr &message) {
    internal_assert(!message.defined() || message.type() == Int(32))
        << "Assertion result is not an int: " << message;

    if (target.has_feature(Target::NoAsserts)) {
        stream << get_indent() << "halide_maybe_unused(" << id_cond << ");\n";
        return;
    }

    stream << get_indent() << "if (!" << id_cond << ")\n";
    open_scope();
    string id_msg = print_expr(message);
    stream << get_indent() << "return " << id_msg << ";\n";
    close_scope("");
}
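
// The emitted assertion for a condition id "_0" therefore looks like
// (ids illustrative, message expression elided):
//
//   if (!_0)
//   {
//    int32_t _1 = <error call returning the failure code>;
//    return _1;
//   }
//
// Under Target::NoAsserts it collapses to halide_maybe_unused(_0);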

void CodeGen_C::create_assertion(const Expr &cond, const Expr &message) {
    create_assertion(print_expr(cond), message);
}

void CodeGen_C::visit(const AssertStmt *op) {
    create_assertion(op->condition, op->message);
}

void CodeGen_C::visit(const ProducerConsumer *op) {
    stream << get_indent();
    if (op->is_producer) {
        stream << "// produce " << op->name << "\n";
    } else {
        stream << "// consume " << op->name << "\n";
    }
    print_stmt(op->body);
}

void CodeGen_C::visit(const Fork *op) {
    // TODO: This doesn't actually work with nested tasks
    stream << get_indent() << "#pragma omp parallel\n";
    open_scope();
    stream << get_indent() << "#pragma omp single\n";
    open_scope();
    stream << get_indent() << "#pragma omp task\n";
    open_scope();
    print_stmt(op->first);
    close_scope("");
    stream << get_indent() << "#pragma omp task\n";
    open_scope();
    print_stmt(op->rest);
    close_scope("");
    stream << get_indent() << "#pragma omp taskwait\n";
    close_scope("");
    close_scope("");
}

void CodeGen_C::visit(const Acquire *op) {
    string id_sem = print_expr(op->semaphore);
    string id_count = print_expr(op->count);
    open_scope();
    stream << get_indent() << "while (!halide_semaphore_try_acquire(" << id_sem << ", " << id_count << "))\n";
    open_scope();
    stream << get_indent() << "#pragma omp taskyield\n";
    close_scope("");
    op->body.accept(this);
    close_scope("");
}
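
// Schematically, an Acquire becomes a task-yielding spin loop around
// the body:
//
//     {
//      while (!halide_semaphore_try_acquire(_sem, _count))
//      {
//       #pragma omp taskyield
//      }
//      /* op->body */
//     }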

void CodeGen_C::visit(const Atomic *op) {
    if (!op->mutex_name.empty()) {
        internal_assert(!inside_atomic_mutex_node)
            << "Nested atomic mutex locks detected. This might causes a deadlock.\n";
        ScopedValue<bool> old_inside_atomic_mutex_node(inside_atomic_mutex_node, true);
        op->body.accept(this);
    } else {
        // Issue atomic stores.
        ScopedValue<bool> old_emit_atomic_stores(emit_atomic_stores, true);
        op->body.accept(this);
    }
}

void CodeGen_C::visit(const For *op) {
    string id_min = print_expr(op->min);
    string id_extent = print_expr(op->extent);

    if (op->for_type == ForType::Parallel) {
        stream << get_indent() << "#pragma omp parallel for\n";
    } else {
        internal_assert(op->for_type == ForType::Serial)
            << "Can only emit serial or parallel for loops to C\n";
    }

    stream << get_indent() << "for (int "
           << print_name(op->name)
           << " = " << id_min
           << "; "
           << print_name(op->name)
           << " < " << id_min
           << " + " << id_extent
           << "; "
           << print_name(op->name)
           << "++)\n";

    open_scope();
    op->body.accept(this);
    close_scope("for " + print_name(op->name));
}
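
// A parallel loop over [min, min + extent) emits (ids illustrative):
//
//     #pragma omp parallel for
//     for (int _x = _min; _x < _min + _extent; _x++)
//     {
//      /* op->body */
//     } // for _x
//
// Serial loops are identical, minus the pragma.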

void CodeGen_C::visit(const Ramp *op) {
    Type vector_type = op->type.with_lanes(op->lanes);
    string id_base = print_expr(op->base);
    string id_stride = print_expr(op->stride);
    print_assignment(vector_type, print_type(vector_type) + "_ops::ramp(" + id_base + ", " + id_stride + ")");
}

void CodeGen_C::visit(const Broadcast *op) {
    Type vector_type = op->type.with_lanes(op->lanes);
    string id_value = print_expr(op->value);
    string rhs;
    if (op->lanes > 1) {
        rhs = print_type(vector_type) + "_ops::broadcast(" + id_value + ")";
    } else {
        rhs = id_value;
    }

    print_assignment(vector_type, rhs);
}
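
// Both Ramp and Broadcast defer to helpers on the emitted vector type.
// Assuming print_type names a 4-lane int32 vector "int32x4_t" (the exact
// spelling depends on the vector typedefs in use), the output is roughly:
//
//     int32x4_t _r = int32x4_t_ops::ramp(_base, _stride);
//     int32x4_t _b = int32x4_t_ops::broadcast(_value);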

void CodeGen_C::visit(const Provide *op) {
    internal_error << "Cannot emit Provide statements as C\n";
}

void CodeGen_C::visit(const Allocate *op) {
    open_scope();

    string op_name = print_name(op->name);
    string op_type = print_type(op->type, AppendSpace);

    // Allocate on the stack when the size is a small compile-time
    // constant (see can_allocation_fit_on_stack), or when the memory
    // type explicitly requests it; otherwise allocate on the heap.
    bool on_stack = false;
    int32_t constant_size;
    string size_id;
    Type size_id_type;

    if (op->new_expr.defined()) {
        Allocation alloc;
        alloc.type = op->type;
        allocations.push(op->name, alloc);
        heap_allocations.push(op->name);
        string new_e = print_expr(op->new_expr);
        stream << get_indent() << op_type << " *" << op_name << " = (" << op_type << "*)" << new_e << ";\n";
    } else {
        constant_size = op->constant_allocation_size();
        if (constant_size > 0) {
            int64_t stack_bytes = (int64_t)constant_size * op->type.bytes();

            if (stack_bytes > ((int64_t(1) << 31) - 1)) {
                user_error << "Total size for allocation "
                           << op->name << " is constant but exceeds 2^31 - 1.\n";
            } else {
                size_id_type = Int(32);
                size_id = print_expr(make_const(size_id_type, constant_size));

                if (op->memory_type == MemoryType::Stack ||
                    op->memory_type == MemoryType::Register ||
                    (op->memory_type == MemoryType::Auto &&
                     can_allocation_fit_on_stack(stack_bytes))) {
                    on_stack = true;
                }
            }
        } else {
            // Check that the allocation is not scalar (if it were scalar
            // it would have constant size).
            internal_assert(!op->extents.empty());

            size_id = print_assignment(Int(64), print_expr(op->extents[0]));
            size_id_type = Int(64);

            for (size_t i = 1; i < op->extents.size(); i++) {
                // Make the code a little less cluttered for the two-dimensional case
                string new_size_id_rhs;
                string next_extent = print_expr(op->extents[i]);
                if (i > 1) {
                    new_size_id_rhs = "(" + size_id + " > ((int64_t(1) << 31) - 1)) ? " + size_id + " : (" + size_id + " * " + next_extent + ")";
                } else {
                    new_size_id_rhs = size_id + " * " + next_extent;
                }
                size_id = print_assignment(Int(64), new_size_id_rhs);
            }
            stream << get_indent() << "if (("
                   << size_id << " > ((int64_t(1) << 31) - 1)) || (("
                   << size_id << " * sizeof("
                   << op_type << ")) > ((int64_t(1) << 31) - 1)))\n";
            open_scope();
            stream << get_indent();
            // TODO: call halide_error_buffer_allocation_too_large() here instead
            // TODO: call create_assertion() so that NoAssertions works
            stream << "halide_error(_ucon, "
                   << "\"32-bit signed overflow computing size of allocation " << op->name << "\\n\");\n";
            stream << get_indent() << "return -1;\n";
            close_scope("overflow test " + op->name);
        }

        // Check the condition to see if this allocation should actually be created.
        // If the allocation is on the stack, the only condition we can respect is
        // unconditional false (otherwise a non-constant-sized array declaration
        // will be generated).
        if (!on_stack || is_const_zero(op->condition)) {
            Expr conditional_size = Select::make(op->condition,
                                                 Variable::make(size_id_type, size_id),
                                                 make_const(size_id_type, 0));
            conditional_size = simplify(conditional_size);
            size_id = print_assignment(Int(64), print_expr(conditional_size));
        }

        Allocation alloc;
        alloc.type = op->type;
        allocations.push(op->name, alloc);

        stream << get_indent() << op_type;

        if (on_stack) {
            stream << op_name
                   << "[" << size_id << "];\n";
        } else {
            // Shouldn't ever currently be possible to have !on_stack && size_id.empty(),
            // but reality-check in case things change in the future.
            internal_assert(!size_id.empty());
            stream << "*"
                   << op_name
                   << " = ("
                   << op_type
                   << " *)halide_malloc(_ucon, sizeof("
                   << op_type
                   << ")*" << size_id << ");\n";
            heap_allocations.push(op->name);
        }
    }

    if (!on_stack) {
        ostringstream check;
        if (is_const_zero(op->condition)) {
            // Assertion always succeeds here, since allocation is never used
            check << print_expr(const_true());
        } else {
            // Assert that the allocation worked....
            // Note that size_id can be empty if the "allocation" is via a custom_new that
            // wraps _halide_buffer_get_host(), so don't emit malformed code in that case.
            check << "(" << op_name << " != nullptr)";
            if (!size_id.empty()) {
                check << " || (" << size_id << " == 0)";
            }
            if (!is_const_one(op->condition)) {
                // ...but if the condition is false, it's OK for the new_expr to be null.
                string op_condition = print_assignment(Bool(), print_expr(op->condition));
                check << " || (!" << op_condition << ")";
            }
        }
        create_assertion("(" + check.str() + ")", Call::make(Int(32), "halide_error_out_of_memory", {}, Call::Extern));

        stream << get_indent();
        string free_function = op->free_function.empty() ? "halide_free" : op->free_function;
        stream << "HalideFreeHelper " << op_name << "_free(_ucon, "
               << op_name << ", " << free_function << ");\n";
    }

    op->body.accept(this);

    // Free the memory if it was allocated on the heap and there is no matching
    // Free node.
    print_heap_free(op->name);
    if (allocations.contains(op->name)) {
        allocations.pop(op->name);
    }

    close_scope("alloc " + print_name(op->name));
}
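
// Putting the pieces together, a heap allocation typically emits code
// along these lines (a sketch; ids, the overflow guard, and the
// condition handling vary with the node):
//
//     int64_t _sz = _extent0 * _extent1;   // with 32-bit overflow guard
//     int32_t *_tmp = (int32_t *)halide_malloc(_ucon, sizeof(int32_t)*_sz);
//     if (!((_tmp != nullptr) || (_sz == 0)))
//     {
//      int32_t _e = halide_error_out_of_memory(_ucon);
//      return _e;
//     }
//     HalideFreeHelper _tmp_free(_ucon, _tmp, halide_free);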

void CodeGen_C::print_heap_free(const std::string &alloc_name) {
    if (heap_allocations.contains(alloc_name)) {
        stream << get_indent() << print_name(alloc_name) << "_free.free();\n";
        heap_allocations.pop(alloc_name);
    }
}

void CodeGen_C::visit(const Free *op) {
    print_heap_free(op->name);
    allocations.pop(op->name);
}

void CodeGen_C::visit(const Realize *op) {
    internal_error << "Cannot emit realize statements to C\n";
}

void CodeGen_C::visit(const Prefetch *op) {
    internal_error << "Cannot emit prefetch statements to C\n";
}

void CodeGen_C::visit(const IfThenElse *op) {
    string cond_id = print_expr(op->condition);

    stream << get_indent() << "if (" << cond_id << ")\n";
    open_scope();
    op->then_case.accept(this);
    close_scope("if " + cond_id);

    if (op->else_case.defined()) {
        stream << get_indent() << "else\n";
        open_scope();
        op->else_case.accept(this);
        close_scope("if " + cond_id + " else");
    }
}

void CodeGen_C::visit(const Evaluate *op) {
    if (is_const(op->value)) {
        return;
    }
    string id = print_expr(op->value);
    stream << get_indent() << "halide_maybe_unused(" << id << ");\n";
}

void CodeGen_C::visit(const Shuffle *op) {
    internal_assert(!op->vectors.empty());
    for (size_t i = 1; i < op->vectors.size(); i++) {
        internal_assert(op->vectors[0].type() == op->vectors[i].type());
    }
    internal_assert(op->type.lanes() == (int)op->indices.size());
    const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
    for (int i : op->indices) {
        internal_assert(i >= -1 && i < max_index);
    }

    std::vector<string> vecs;
    for (const Expr &v : op->vectors) {
        vecs.push_back(print_expr(v));
    }
    ostringstream rhs;
    if (op->type.is_scalar()) {
        // Deduce which vector we need. The vectors aren't required to
        // have identical lane counts, so we walk them in order.
        // Since an idx of -1 means "don't care", we'll treat it as 0 to simplify.
        int idx = std::max(0, op->indices[0]);
        for (size_t vec_idx = 0; vec_idx < op->vectors.size(); vec_idx++) {
            const int vec_lanes = op->vectors[vec_idx].type().lanes();
            if (idx < vec_lanes) {
                rhs << vecs[vec_idx];
                if (op->vectors[vec_idx].type().is_vector()) {
                    rhs << "[" << idx << "]";
                }
                break;
            }
            idx -= vec_lanes;
        }
        internal_assert(!rhs.str().empty());
    } else {
        internal_assert(op->vectors[0].type().is_vector());
        string src = vecs[0];
        if (op->vectors.size() > 1) {
            // This code has always assumed/required that all the vectors
            // have identical types, so let's verify
            const Type t0 = op->vectors[0].type();
            for (const auto &v : op->vectors) {
                internal_assert(t0 == v.type());
            }
            string storage_name = unique_name('_');
            // Combine them into one vector. Clang emits excellent code via this
            // union approach (typically without going through memory) for both x64 and arm64.
            stream << get_indent() << "union { "
                   << print_type(t0) << " src[" << vecs.size() << "]; "
                   << print_type(op->type) << " dst; } "
                   << storage_name << " = {{ " << with_commas(vecs) << " }};\n";
            src = storage_name + ".dst";
        }
        rhs << print_type(op->type) << "_ops::shuffle<" << with_commas(op->indices) << ">(" << src << ")";
    }
    print_assignment(op->type, rhs.str());
}
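
// For the multi-vector case, the union trick emits something like this
// (assuming two 4-lane int32 inputs and illustrative names; exact type
// spellings depend on print_type):
//
//     union { int32x4_t src[2]; int32x8_t dst; } _u = {{ _a, _b }};
//     int32x8_t _r = int32x8_t_ops::shuffle<0, 4, 1, 5, 2, 6, 3, 7>(_u.dst);
//
// i.e. the inputs are laid out contiguously and then picked apart by
// the compile-time index list.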

void CodeGen_C::test() {
    LoweredArgument buffer_arg("buf", Argument::OutputBuffer, Int(32), 3, ArgumentEstimates{});
    LoweredArgument float_arg("alpha", Argument::InputScalar, Float(32), 0, ArgumentEstimates{});
    LoweredArgument int_arg("beta", Argument::InputScalar, Int(32), 0, ArgumentEstimates{});
    LoweredArgument user_context_arg("__user_context", Argument::InputScalar, type_of<const void *>(), 0, ArgumentEstimates{});
    vector<LoweredArgument> args = {buffer_arg, float_arg, int_arg, user_context_arg};
    Var x("x");
    Param<float> alpha("alpha");
    Param<int> beta("beta");
    Expr e = Select::make(alpha > 4.0f, print_when(x < 1, 3), 2);
    Stmt s = Store::make("buf", e, x, Parameter(), const_true(), ModulusRemainder());
    s = LetStmt::make("x", beta + 1, s);
    s = Block::make(s, Free::make("tmp.stack"));
    s = Allocate::make("tmp.stack", Int(32), MemoryType::Stack, {127}, const_true(), s);
    s = Allocate::make("tmp.heap", Int(32), MemoryType::Heap, {43, beta}, const_true(), s);
    Expr buf = Variable::make(Handle(), "buf.buffer");
    s = LetStmt::make("buf", Call::make(Handle(), Call::buffer_get_host, {buf}, Call::Extern), s);

    Module m("", get_host_target());
    m.append(LoweredFunc("test1", args, s, LinkageType::External));

    ostringstream source;
    {
        CodeGen_C cg(source, Target("host"), CodeGen_C::CImplementation);
        cg.compile(m);
    }

    string correct_source =
        headers +
        globals +
        string((const char *)halide_internal_runtime_header_HalideRuntime_h) + '\n' +
        string((const char *)halide_internal_initmod_inlined_c) + '\n' +
        '\n' + kDefineMustUseResult + R"GOLDEN_CODE(
#ifndef HALIDE_FUNCTION_ATTRS
#define HALIDE_FUNCTION_ATTRS
#endif



#ifdef __cplusplus
extern "C" {
#endif

HALIDE_FUNCTION_ATTRS
int test1(struct halide_buffer_t *_buf_buffer, float _alpha, int32_t _beta, void const *__user_context) {
 void * const _ucon = const_cast<void *>(__user_context);
 halide_maybe_unused(_ucon);
 auto *_0 = _halide_buffer_get_host(_buf_buffer);
 auto _buf = _0;
 halide_maybe_unused(_buf);
 {
  int64_t _1 = 43;
  int64_t _2 = _1 * _beta;
  if ((_2 > ((int64_t(1) << 31) - 1)) || ((_2 * sizeof(int32_t )) > ((int64_t(1) << 31) - 1)))
  {
   halide_error(_ucon, "32-bit signed overflow computing size of allocation tmp.heap\n");
   return -1;
  } // overflow test tmp.heap
  int64_t _3 = _2;
  int32_t *_tmp_heap = (int32_t  *)halide_malloc(_ucon, sizeof(int32_t )*_3);
  if (!((_tmp_heap != nullptr) || (_3 == 0)))
  {
   int32_t _4 = halide_error_out_of_memory(_ucon);
   return _4;
  }
  HalideFreeHelper _tmp_heap_free(_ucon, _tmp_heap, halide_free);
  {
   int32_t _tmp_stack[127];
   int32_t _5 = _beta + 1;
   int32_t _6;
   bool _7 = _5 < 1;
   if (_7)
   {
    char b0[1024];
    snprintf(b0, 1024, "%lld%s", (long long)(3), "\n");
    auto *_8 = b0;
    halide_print(_ucon, _8);
    int32_t _9 = 0;
    int32_t _10 = return_second(_9, 3);
    _6 = _10;
   } // if _7
   else
   {
    _6 = 3;
   } // if _7 else
   int32_t _11 = _6;
   float _12 = float_from_bits(1082130432 /* 4 */);
   bool _13 = _alpha > _12;
   int32_t _14 = (int32_t)(_13 ? _11 : 2);
   ((int32_t *)_buf)[_5] = _14;
  } // alloc _tmp_stack
  _tmp_heap_free.free();
 } // alloc _tmp_heap
 return 0;
}

#ifdef __cplusplus
}  // extern "C"
#endif

)GOLDEN_CODE";

    const auto compare_srcs = [](const string &actual, const string &expected) {
        if (actual != expected) {
            int diff = 0;
            while (actual[diff] == expected[diff]) {
                diff++;
            }
            int diff_end = diff + 1;
            while (diff > 0 && actual[diff] != '\n') {
                diff--;
            }
            while (diff_end < (int)actual.size() && actual[diff_end] != '\n') {
                diff_end++;
            }

            internal_error
                << "Correct source code:\n"
                << expected
                << "Actual source code:\n"
                << actual
                << "Difference starts at:\n"
                << "Correct: " << expected.substr(diff, diff_end - diff) << "\n"
                << "Actual: " << actual.substr(diff, diff_end - diff) << "\n";
        }
    };

    compare_srcs(source.str(), correct_source);

    ostringstream function_info;
    {
        CodeGen_C cg(function_info, Target("host-no_runtime"), CodeGen_C::CPlusPlusFunctionInfoHeader, "Function/Info/Test");
        cg.compile(m);
    }

    string correct_function_info = R"GOLDEN_CODE(#ifndef HALIDE_FUNCTION_INFO__Function___Info___Test
#define HALIDE_FUNCTION_INFO__Function___Info___Test

/* MACHINE GENERATED By Halide. */

#if !(__cplusplus >= 201703L || _MSVC_LANG >= 201703L)
#error "This file requires C++17 or later; please upgrade your compiler."
#endif

#include "HalideRuntime.h"


/**
 * This function returns a constexpr array of information about a Halide-generated
 * function's argument signature (e.g., number of arguments, type of each, etc).
 * While this is a subset of the information provided by the existing _metadata
 * function, it has the distinct advantage of allowing one to use the information
 * it at compile time (rather than runtime). This can be quite useful for producing
 * e.g. automatic call wrappers, etc.
 *
 * For instance, to compute the number of Buffers in a Function, one could do something
 * like:
 *
 *      using namespace HalideFunctionInfo;
 *
 *      template<size_t arg_count>
 *      constexpr size_t count_buffers(const std::array<ArgumentInfo, arg_count> args) {
 *          size_t buffer_count = 0;
 *          for (const auto a : args) {
 *              if (a.kind == InputBuffer || a.kind == OutputBuffer) {
 *                  buffer_count++;
 *              }
 *          }
 *          return buffer_count;
 *      }
 *
 *      constexpr size_t count = count_buffers(metadata_tester_argument_info());
 *
 * The value of `count` will be computed entirely at compile-time, with no runtime
 * impact aside from the numerical value of the constant.
 */

inline constexpr std::array<::HalideFunctionInfo::ArgumentInfo, 4> test1_argument_info() {
 return {{
  {"buf", ::HalideFunctionInfo::OutputBuffer, 3, halide_type_t{halide_type_int, 32, 1}},
  {"alpha", ::HalideFunctionInfo::InputScalar, 0, halide_type_t{halide_type_float, 32, 1}},
  {"beta", ::HalideFunctionInfo::InputScalar, 0, halide_type_t{halide_type_int, 32, 1}},
  {"__user_context", ::HalideFunctionInfo::InputScalar, 0, halide_type_t{halide_type_handle, 64, 1}},
 }};
}
#endif
)GOLDEN_CODE";

    compare_srcs(function_info.str(), correct_function_info);

    std::cout << "CodeGen_C test passed\n";
}

}  // namespace Internal
}  // namespace Halide