Revision 1212efb878b5c42caedd4f37de4c0f6b3d19a12c authored by Steven Johnson on 06 January 2022, 00:46:45 UTC, committed by Steven Johnson on 06 January 2022, 00:46:45 UTC
1 parent b244a83
CodeGen_Xtensa.cpp
#include "CodeGen_Xtensa.h"
#include <string>
#include "CodeGen_Internal.h"
#include "IROperator.h"
#include "IRVisitor.h"
#include "Lerp.h"
#include "Simplify.h"
#include "Substitute.h"
#include "XtensaOptimize.h"
namespace Halide {
namespace Internal {
using std::ostream;
using std::ostringstream;
using std::string;
using std::vector;
class UsesDmaCopy : public IRGraphVisitor {
private:
using IRGraphVisitor::visit;
protected:
void visit(const Call *op) override {
if (op->name == "halide_xtensa_copy_1d") {
uses_dma = true;
}
IRGraphVisitor::visit(op);
}
public:
bool uses_dma = false;
};
void CodeGen_Xtensa::compile(const Module &module) {
CodeGen_C::compile(module);
}
void CodeGen_Xtensa::compile(const Buffer<> &buffer) {
CodeGen_C::compile(buffer);
}
void CodeGen_Xtensa::compile(const LoweredFunc &f, const std::map<std::string, std::string> &metadata_name_map) {
// Don't put non-external function declarations in headers.
if (is_header_or_extern_decl() && f.linkage == LinkageType::Internal) {
return;
}
const std::vector<LoweredArgument> &args = f.args;
have_user_context = false;
for (size_t i = 0; i < args.size(); i++) {
// TODO: check that its type is void *?
have_user_context |= (args[i].name == "__user_context");
}
NameMangling name_mangling = f.name_mangling;
if (name_mangling == NameMangling::Default) {
name_mangling = (target.has_feature(Target::CPlusPlusMangling) ? NameMangling::CPlusPlus : NameMangling::C);
}
set_name_mangling_mode(name_mangling);
std::vector<std::string> namespaces;
std::string simple_name = extract_namespaces(f.name, namespaces);
if (!is_c_plus_plus_interface()) {
user_assert(namespaces.empty()) << "Namespace qualifiers not allowed on function name if not compiling with Target::CPlusPlusNameMangling.\n";
}
if (!namespaces.empty()) {
for (const auto &ns : namespaces) {
stream << "namespace " << ns << " {\n";
}
stream << "\n";
}
Stmt body = match_xtensa_patterns(f.body, target);
// Emit the function prototype
if (f.linkage == LinkageType::Internal) {
// If the function isn't public, mark it static.
stream << "static ";
}
stream << "HALIDE_FUNCTION_ATTRS\n";
stream << "int " << simple_name << "(";
for (size_t i = 0; i < args.size(); i++) {
if (args[i].is_buffer()) {
external_buffers.insert(args[i].name);
stream << "struct halide_buffer_t *"
<< print_name(args[i].name)
<< "_buffer";
} else {
stream << print_type(args[i].type, AppendSpace)
<< print_name(args[i].name);
}
if (i < args.size() - 1) stream << ", ";
}
if (is_header_or_extern_decl()) {
stream << ");\n";
} else {
stream << ") {\n";
indent += 1;
if (uses_gpu_for_loops) {
stream << get_indent() << "halide_error("
<< (have_user_context ? "__user_context_" : "nullptr")
<< ", \"C++ Backend does not support gpu_blocks() or gpu_threads() yet, "
<< "this function will always fail at runtime\");\n";
stream << get_indent() << "return halide_error_code_device_malloc_failed;\n";
} else {
// Emit a local user_context we can pass in all cases, either
// aliasing __user_context or nullptr.
stream << get_indent() << "void * const _ucon = "
<< (have_user_context ? "const_cast<void *>(__user_context)" : "nullptr")
<< ";\n";
if (target.has_feature(Target::NoAsserts)) {
stream << get_indent() << "halide_unused(_ucon);";
}
UsesDmaCopy uses_dma;
body.accept(&uses_dma);
if (uses_dma.uses_dma) {
stream << "ScopedDmaInitializer dma_initializer;\n";
}
// stream << "printf(\"" << simple_name << "\\n\");";
// Emit the body
print(body);
// stream << "printf(\"[end]" << simple_name << "\\n\");";
// Return success.
stream << get_indent() << "return 0;\n";
}
indent -= 1;
stream << "}\n";
}
if (f.linkage == LinkageType::ExternalPlusMetadata) {
// Emit the argv version
emit_argv_wrapper(simple_name, args);
// And also the metadata.
emit_metadata_getter(simple_name, args, metadata_name_map);
}
if (!namespaces.empty()) {
stream << "\n";
for (size_t i = namespaces.size(); i > 0; i--) {
stream << "} // namespace " << namespaces[i - 1] << "\n";
}
stream << "\n";
}
}
void CodeGen_Xtensa::add_vector_typedefs(const std::set<Type> &vector_types) {
if (!vector_types.empty()) {
const char *native_typedef_decl = R"INLINE_CODE(
#if defined(__XTENSA__)
#include <xtensa/sim.h>
#include <xtensa/tie/xt_ivpn.h>
#include <xtensa/tie/xt_timer.h>
// This inline function is needed by the application to get the cycle count from the ISS.
inline int GetCycleCount() {
return XT_RSR_CCOUNT();
}
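// Illustrative sketch (not part of the emitted pipeline code): the counter can be
// used to profile a section of generated code when running on the ISS, e.g.
//   int before = GetCycleCount();
//   // ... run the section of interest ...
//   int cycles = GetCycleCount() - before;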
#endif
#include <xtensa/tie/xt_ivpn.h>
#define HALIDE_MAYBE_UNUSED __attribute__ ((unused))
// NOTE(vksnk): we can use clang native vectors in place of the Xtensa
// data types, and while they should be much more convenient, there is
// a slight performance degradation, which needs to be investigated.
// typedef int8_t int8x64_t __attribute__((ext_vector_type(64)));
// typedef uint8_t uint8x64_t __attribute__((ext_vector_type(64)));
// typedef int16_t int16x32_t __attribute__((ext_vector_type(32)));
// typedef uint16_t uint16x32_t __attribute__((ext_vector_type(32)));
// typedef int32_t int32x16_t __attribute__((ext_vector_type(16)));
// typedef uint32_t uint32x16_t __attribute__((ext_vector_type(16)));
typedef int32_t common_int32x16_t __attribute__((ext_vector_type(16)));
typedef uint32_t common_uint32x16_t __attribute__((ext_vector_type(16)));
using int8x64_t = xb_vec2Nx8;
using uint8x64_t = xb_vec2Nx8U;
using int16x32_t = xb_vecNx16;
using uint16x32_t = xb_vecNx16U;
using int24_t = xb_int24;
using int24x64_t = xb_vec2Nx24;
using uint24x64_t = xb_vec2Nx24;
using int32x16_t = xb_vecN_2x32v;
using uint32x16_t = xb_vecN_2x32Uv;
using int48_t = xb_int48;
using int48x32_t = xb_vecNx48;
using uint48x32_t = xb_vecNx48;
using int64x16_t = xb_vecN_2x64w;
using uint1x16_t = vboolN_2;
using uint1x32_t = vboolN;
using uint1x64_t = vbool2N;
using float32x16_t = xb_vecN_2xf32;
using int8x4_t = xb_int32pr;
using uint8x4_t = xb_int32pr;
using int8x8_t = xb_int64pr;
using uint8x8_t = xb_int64pr;
template <typename NativeVector, int N>
struct MultipleOfNativeVector {
NativeVector __attribute__((aligned(64))) native_vector[N];
MultipleOfNativeVector() {}
// TODO(vksnk): figure out a better/safer way to construct it.
enum FromCppVector { from_native_vector };
inline MultipleOfNativeVector(FromCppVector, const NativeVector &src1, const NativeVector &src2) {
static_assert(N == 2, "Wrong kind of constructor");
native_vector[0] = src1;
native_vector[1] = src2;
}
inline MultipleOfNativeVector(FromCppVector, const NativeVector &src1, const NativeVector &src2, const NativeVector &src3, const NativeVector &src4) {
static_assert(N == 4, "Wrong kind of constructor");
native_vector[0] = src1;
native_vector[1] = src2;
native_vector[2] = src3;
native_vector[3] = src4;
}
inline MultipleOfNativeVector(FromCppVector, const NativeVector &src1, const NativeVector &src2, const NativeVector &src3, const NativeVector &src4,
const NativeVector &src5, const NativeVector &src6, const NativeVector &src7, const NativeVector &src8) {
static_assert(N == 8, "Wrong kind of constructor");
native_vector[0] = src1;
native_vector[1] = src2;
native_vector[2] = src3;
native_vector[3] = src4;
native_vector[4] = src5;
native_vector[5] = src6;
native_vector[6] = src7;
native_vector[7] = src8;
}
inline MultipleOfNativeVector(FromCppVector, const NativeVector &src1, const NativeVector &src2, const NativeVector &src3, const NativeVector &src4,
const NativeVector &src5, const NativeVector &src6, const NativeVector &src7, const NativeVector &src8,
const NativeVector &src9, const NativeVector &src10, const NativeVector &src11, const NativeVector &src12) {
static_assert(N == 12, "Wrong kind of constructor");
native_vector[0] = src1;
native_vector[1] = src2;
native_vector[2] = src3;
native_vector[3] = src4;
native_vector[4] = src5;
native_vector[5] = src6;
native_vector[6] = src7;
native_vector[7] = src8;
native_vector[8] = src9;
native_vector[9] = src10;
native_vector[10] = src11;
native_vector[11] = src12;
}
inline MultipleOfNativeVector(FromCppVector, const NativeVector &src1, const NativeVector &src2, const NativeVector &src3, const NativeVector &src4,
const NativeVector &src5, const NativeVector &src6, const NativeVector &src7, const NativeVector &src8,
const NativeVector &src9, const NativeVector &src10, const NativeVector &src11, const NativeVector &src12,
const NativeVector &src13, const NativeVector &src14, const NativeVector &src15, const NativeVector &src16) {
static_assert(N == 16, "Wrong kind of constructor");
native_vector[0] = src1;
native_vector[1] = src2;
native_vector[2] = src3;
native_vector[3] = src4;
native_vector[4] = src5;
native_vector[5] = src6;
native_vector[6] = src7;
native_vector[7] = src8;
native_vector[8] = src9;
native_vector[9] = src10;
native_vector[10] = src11;
native_vector[11] = src12;
native_vector[12] = src13;
native_vector[13] = src14;
native_vector[14] = src15;
native_vector[15] = src16;
}
};
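// Illustrative usage of MultipleOfNativeVector (a sketch, for reference only):
// an int32x32_t defined below is simply two xb_vecN_2x32v registers, e.g.
//   int32x16_t lo = ..., hi = ...;
//   int32x32_t v(int32x32_t::from_native_vector, lo, hi);
//   int32x16_t first_half = v.native_vector[0];   // lanes 0..15
//   int32x16_t second_half = v.native_vector[1];  // lanes 16..31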
using uint1x256_t = MultipleOfNativeVector<uint1x64_t, 4>;
using int8x128_t = MultipleOfNativeVector<int8x64_t, 2>;
using int8x256_t = MultipleOfNativeVector<int8x64_t, 4>;
using uint8x128_t = MultipleOfNativeVector<uint8x64_t, 2>;
using uint8x192_t = MultipleOfNativeVector<uint8x64_t, 3>;
using uint8x256_t = MultipleOfNativeVector<uint8x64_t, 4>;
using int16x64_t = MultipleOfNativeVector<int16x32_t, 2>;
using uint16x64_t = MultipleOfNativeVector<uint16x32_t, 2>;
using int16x128_t = MultipleOfNativeVector<int16x32_t, 4>;
using uint16x128_t = MultipleOfNativeVector<uint16x32_t, 4>;
using int24x128_t = MultipleOfNativeVector<int24x64_t, 2>;
using int32x32_t = MultipleOfNativeVector<int32x16_t, 2>;
using uint32x32_t = MultipleOfNativeVector<uint32x16_t, 2>;
using int32x64_t = MultipleOfNativeVector<int32x16_t, 4>;
using uint32x64_t = MultipleOfNativeVector<uint32x16_t, 4>;
// TODO(vksnk): this one should be generated automatically, but isn't.
using int32x192_t = MultipleOfNativeVector<int32x16_t, 12>;
using int32x256_t = MultipleOfNativeVector<int32x16_t, 16>;
using int48x64_t = MultipleOfNativeVector<int48x32_t, 2>;
using float32x32_t = MultipleOfNativeVector<float32x16_t, 2>;
using float32x64_t = MultipleOfNativeVector<float32x16_t, 4>;
template <typename ResultType>
HALIDE_ALWAYS_INLINE ResultType ramp(int32_t base, int32_t stride) = delete;
template <typename ResultType>
HALIDE_ALWAYS_INLINE ResultType dense_ramp(int32_t base) = delete;
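// The specializations below produce Halide's ramp lane pattern, e.g.
// (illustrative): ramp<int32x32_t>(base, stride) yields lanes
//   base, base + stride, ..., base + 31 * stride
// and dense_ramp<int32x32_t>(base) is the stride == 1 special case.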
template<>
HALIDE_ALWAYS_INLINE int32x32_t ramp<int32x32_t>(int32_t base, int32_t stride) {
int32x16_t one_to_n = IVP_SEQN_2X32();
int32x16_t base_w = base;
int32x16_t stride_w = stride;
int32x16_t lanes_2 = 16;
return int32x32_t(int32x32_t::from_native_vector, IVP_ADDN_2X32(base_w, IVP_PACKLN_2X64W(IVP_MULN_2X32(one_to_n, stride_w))),
IVP_ADDN_2X32(base_w, IVP_PACKLN_2X64W(IVP_MULN_2X32(lanes_2 + one_to_n, stride_w))));
}
template<>
HALIDE_ALWAYS_INLINE int32x32_t dense_ramp<int32x32_t>(int32_t base) {
const int32x16_t base_w = int32x16_t(base) + IVP_SEQN_2X32();
const int32x16_t lanes_2 = 16;
return int32x32_t(int32x32_t::from_native_vector, base_w, base_w + lanes_2);
}
template<>
HALIDE_ALWAYS_INLINE int32x64_t ramp<int32x64_t>(int32_t base, int32_t stride) {
int32x16_t one_to_n = IVP_SEQN_2X32();
int32x16_t base_w = base;
int32x16_t stride_w = stride;
int32x16_t lanes_2 = 16;
int32x16_t lanes_3 = 32;
int32x16_t lanes_4 = 48;
return int32x64_t(int32x64_t::from_native_vector,
IVP_ADDN_2X32(base_w, IVP_PACKLN_2X64W(IVP_MULN_2X32(one_to_n, stride_w))),
IVP_ADDN_2X32(base_w, IVP_PACKLN_2X64W(IVP_MULN_2X32(lanes_2 + one_to_n, stride_w))),
IVP_ADDN_2X32(base_w, IVP_PACKLN_2X64W(IVP_MULN_2X32(lanes_3 + one_to_n, stride_w))),
IVP_ADDN_2X32(base_w, IVP_PACKLN_2X64W(IVP_MULN_2X32(lanes_4 + one_to_n, stride_w))));
}
template<>
HALIDE_ALWAYS_INLINE int32x64_t dense_ramp<int32x64_t>(int32_t base) {
int32x16_t base_w = IVP_ADDN_2X32(int32x16_t(base), IVP_SEQN_2X32());
int32x16_t lanes_2 = 16;
int32x16_t lanes_3 = 32;
int32x16_t lanes_4 = 48;
return int32x64_t(int32x64_t::from_native_vector,
base_w,
IVP_ADDN_2X32(base_w, lanes_2),
IVP_ADDN_2X32(base_w, lanes_3),
IVP_ADDN_2X32(base_w, lanes_4));
}
template <typename ResultType, typename BaseType>
HALIDE_ALWAYS_INLINE ResultType broadcast(BaseType value) = delete;
template <>
HALIDE_ALWAYS_INLINE uint8x4_t broadcast<uint8x4_t, uint8_t>(uint8_t value) {
uint8x64_t v = value;
return IVP_EXTRPRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(v)), 0);
}
template <>
HALIDE_ALWAYS_INLINE uint8x8_t broadcast<uint8x8_t, uint8_t>(uint8_t value) {
uint8x64_t v = value;
return IVP_EXTRPR64N_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(v)), 0);
}
template <typename VectorType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE VectorType aligned_load(const void *base, int32_t offset) {
return *((const VectorType *)((const BaseType*)base + offset));
}
template <>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int32x32_t aligned_load<int32x32_t, int32_t, 32>(const void *base, int32_t offset) {
const int32x16_t * __restrict ptr = ((const int32x16_t *)((const int32_t*)base + offset));
int32x32_t r;
r.native_vector[0] = *ptr++;
r.native_vector[1] = *ptr++;
return r;
}
template <>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int8x256_t aligned_load<int8x256_t, int8_t, 256>(const void *base, int32_t offset) {
const int8x64_t * __restrict ptr = ((const int8x64_t *)((const int8_t*)base + offset));
int8x256_t r;
r.native_vector[0] = *ptr++;
r.native_vector[1] = *ptr++;
r.native_vector[2] = *ptr++;
r.native_vector[3] = *ptr++;
return r;
}
template <typename VectorType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE VectorType load(const void *base, int32_t offset) {
VectorType r;
memcpy(&r, ((const BaseType*)base + offset), sizeof(BaseType) * Lanes);
return r;
}
template <typename VectorType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE void aligned_store(const VectorType& a, void *base, int32_t offset) {
*((VectorType *)((BaseType*)base + offset)) = a;
}
template <typename VectorType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE void store(const VectorType& a, void *base, int32_t offset) {
memcpy(((BaseType*)base + offset), &a, sizeof(BaseType) * Lanes);
}
template <typename VectorType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE VectorType load_variable(const void *base, int32_t offset, int32_t count) {
VectorType r;
memcpy(&r, ((const BaseType*)base + offset), sizeof(BaseType) * count);
return r;
}
template <typename VectorType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE void store_variable(const VectorType& a, void *base, int32_t offset, int32_t count) {
memcpy(((BaseType*)base + offset), &a, sizeof(BaseType) * count);
}
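// Illustrative semantics of the generic load/store helpers (a sketch): offsets
// are in elements of BaseType, not bytes, so e.g.
//   int16_t buf[64];
//   int16x32_t v = load<int16x32_t, int16_t, 32>(buf, 32);  // reads buf[32..63]
//   store<int16x32_t, int16_t, 32>(v, buf, 0);              // writes buf[0..31]
// load_variable/store_variable do the same for a runtime `count` of lanes.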
template <>
HALIDE_ALWAYS_INLINE void store_variable<uint8x64_t, uint8_t, 64>(const uint8x64_t& a, void *base, int32_t offset, int32_t count) {
valign align;
xb_vec2Nx8U* __restrict ptr = (xb_vec2Nx8U*)((uint8_t*)base + offset);
IVP_SAV2NX8U_XP(a, align, ptr, count);
IVP_SAPOS2NX8U_FP(align, ptr);
}
template <typename VectorType, typename OffsetType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE VectorType gather_load(const void *base, const OffsetType& offset) {
BaseType __attribute__((aligned(64))) tmp[Lanes];
int offsets[Lanes];
store<OffsetType, int32_t, Lanes>(offset, &offsets[0], 0);
for (int i = 0; i < Lanes; i++) {
tmp[i] = ((const BaseType*)base)[offsets[i]];
}
return *((VectorType *)tmp);
}
template <typename VectorType, typename OffsetType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE void store_scatter(const VectorType& a, void *base, const OffsetType& offset) {
BaseType __attribute__((aligned(64))) tmp[Lanes];
aligned_store<VectorType, BaseType, Lanes>(a, &tmp[0], 0);
int __attribute__((aligned(64))) offsets[Lanes];
aligned_store<OffsetType, int32_t, Lanes>(offset, &offsets[0], 0);
for (int i = 0; i < Lanes; i++) {
((BaseType*)base)[offsets[i]] = tmp[i];
}
}
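// Illustrative gather/scatter semantics (a sketch): the offset vector holds
// per-lane element indices into `base`, so for a uint8 gather
//   r[i] = ((const uint8_t *)base)[offsets[i]]   for each lane i,
// and store_scatter performs the inverse per-lane writes; with duplicate
// offsets the highest-numbered lane wins, since the loop runs in lane order.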
template <typename VectorType, typename OffsetType, typename PredicateType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE VectorType load_predicated(const void *base, const OffsetType& offset, const PredicateType& predicate) = delete;
template <>
HALIDE_ALWAYS_INLINE uint8x64_t load_predicated<uint8x64_t, int32x64_t, uint1x64_t, uint8_t, 64>(const void *base, const int32x64_t& offset, const uint1x64_t& predicate) {
int __attribute__((aligned(64))) offsets[64];
aligned_store<int32x64_t, int32_t, 64>(offset, &offsets[0], 0);
uint8x64_t vmask = IVP_MOV2NX8T(uint8x64_t(1), uint8x64_t(1), predicate);
uint8_t __attribute__((aligned(64))) mask[64];
aligned_store<uint8x64_t, uint8_t, 64>(vmask, &mask[0], 0);
uint8_t __attribute__((aligned(64))) output[64];
for (int i = 0; i < 64; i++) {
if (mask[i] == 1) {
output[i] = ((const uint8_t*)base)[offsets[i]];
} else {
output[i] = 0;
}
}
return *((uint8x64_t *)output);
}
template <>
HALIDE_ALWAYS_INLINE int16x32_t load_predicated<int16x32_t, int32x32_t, uint1x32_t, int16_t, 32>(const void *base, const int32x32_t& offset, const uint1x32_t& predicate) {
int __attribute__((aligned(64))) offsets[32];
aligned_store<int32x32_t, int32_t, 32>(offset, &offsets[0], 0);
int16x32_t vmask = IVP_MOVNX16T(int16x32_t(1), int16x32_t(1), predicate);
uint8_t __attribute__((aligned(64))) mask[32];
aligned_store<int16x32_t, uint8_t, 32>(vmask, &mask[0], 0);
uint8_t __attribute__((aligned(64))) output[32];
for (int i = 0; i < 32; i++) {
if (mask[i] == 1) {
output[i] = ((const uint8_t*)base)[offsets[i]];
} else {
output[i] = 0;
}
}
return *((int16x32_t *)output);
}
template <>
HALIDE_ALWAYS_INLINE int32x64_t load_predicated<int32x64_t, int32x64_t, uint1x64_t, int32_t, 64>(const void *base, const int32x64_t& offset, const uint1x64_t& predicate) {
int __attribute__((aligned(64))) offsets[64];
aligned_store<int32x64_t, int32_t, 64>(offset, &offsets[0], 0);
uint8x64_t vmask = IVP_MOV2NX8T(uint8x64_t(1), uint8x64_t(1), predicate);
uint8_t __attribute__((aligned(64))) mask[64];
aligned_store<uint8x64_t, uint8_t, 64>(vmask, &mask[0], 0);
int32_t __attribute__((aligned(64))) output[64];
for (int i = 0; i < 64; i++) {
if (mask[i] == 1) {
output[i] = ((const int32_t*)base)[offsets[i]];
} else {
output[i] = 0;
}
}
return *((int32x64_t *)output);
}
template <typename VectorType, typename OffsetType, typename PredicateType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE void store_predicated(const VectorType& a, void *base, const OffsetType& offset, const PredicateType& predicate) = delete;
template <>
HALIDE_ALWAYS_INLINE void store_predicated<uint8x64_t, int32x64_t, uint1x64_t, uint8_t, 64>(const uint8x64_t& a, void *base, const int32x64_t& offset, const uint1x64_t& predicate) {
uint8_t __attribute__((aligned(64))) tmp[64];
aligned_store<uint8x64_t, uint8_t, 64>(a, &tmp[0], 0);
int __attribute__((aligned(64))) offsets[64];
aligned_store<int32x64_t, int32_t, 64>(offset, &offsets[0], 0);
uint8x64_t vmask = IVP_MOV2NX8T(uint8x64_t(1), uint8x64_t(1), predicate);
uint8_t __attribute__((aligned(64))) mask[64];
aligned_store<uint8x64_t, uint8_t, 64>(vmask, &mask[0], 0);
for (int i = 0; i < 64; i++) {
if (mask[i]) {
((uint8_t*)base)[offsets[i]] = tmp[i];
}
}
}
template <>
HALIDE_ALWAYS_INLINE void store_predicated<uint8x256_t, int32x256_t, uint1x256_t, uint8_t, 256>(const uint8x256_t& a, void *base, const int32x256_t& offset, const uint1x256_t& predicate) {
uint8_t __attribute__((aligned(64))) tmp[256];
aligned_store<uint8x256_t, uint8_t, 256>(a, &tmp[0], 0);
int __attribute__((aligned(64))) offsets[256];
aligned_store<int32x256_t, int32_t, 256>(offset, &offsets[0], 0);
uint8x64_t vmask0 = IVP_MOV2NX8T(uint8x64_t(1), uint8x64_t(1), predicate.native_vector[0]);
uint8x64_t vmask1 = IVP_MOV2NX8T(uint8x64_t(1), uint8x64_t(1), predicate.native_vector[1]);
uint8x64_t vmask2 = IVP_MOV2NX8T(uint8x64_t(1), uint8x64_t(1), predicate.native_vector[2]);
uint8x64_t vmask3 = IVP_MOV2NX8T(uint8x64_t(1), uint8x64_t(1), predicate.native_vector[3]);
uint8_t __attribute__((aligned(64))) mask[256];
aligned_store<uint8x256_t, uint8_t, 256>(
uint8x256_t(uint8x256_t::from_native_vector, vmask0, vmask1, vmask2, vmask3), &mask[0], 0);
for (int i = 0; i < 256; i++) {
if (mask[i]) {
((uint8_t*)base)[offsets[i]] = tmp[i];
}
}
}
template <typename VectorTypeFrom, typename VectorTypeTo, typename BaseType, int LanesFrom, int LanesTo>
HALIDE_ALWAYS_INLINE VectorTypeTo shuffle(const VectorTypeFrom& a, const int32_t indices[LanesTo]) {
BaseType __attribute__((aligned(64))) tmp1[LanesFrom];
BaseType __attribute__((aligned(64))) tmp2[LanesTo];
store<VectorTypeFrom, BaseType, LanesFrom>(a, &tmp1[0], 0);
for (int i = 0; i < LanesTo; i++) {
tmp2[i] = tmp1[indices[i]];
}
return *((VectorTypeTo *)tmp2);
}
template <typename ResultType, typename ArgType, typename BaseType, int LanesResult, int LanesArg>
HALIDE_ALWAYS_INLINE ResultType concat(const ArgType& a, const ArgType& b) {
BaseType __attribute__((aligned(64))) tmp[LanesResult];
store<ArgType, BaseType, LanesArg>(a, &tmp[0], 0);
store<ArgType, BaseType, LanesArg>(b, &tmp[0], LanesArg);
return *((ResultType *)tmp);
}
template <typename ResultType, typename ArgType, typename BaseType, int LanesResult, int LanesArg>
HALIDE_ALWAYS_INLINE ResultType concat(const ArgType& a, const ArgType& b, const ArgType& c) {
BaseType __attribute__((aligned(64))) tmp[LanesResult];
store<ArgType, BaseType, LanesArg>(a, &tmp[0], 0);
store<ArgType, BaseType, LanesArg>(b, &tmp[0], LanesArg);
store<ArgType, BaseType, LanesArg>(c, &tmp[0], 2 * LanesArg);
return *((ResultType *)tmp);
}
template <typename ResultType, typename ArgType, typename BaseType, int LanesResult, int LanesArg>
HALIDE_ALWAYS_INLINE ResultType concat(const ArgType& a, const ArgType& b, const ArgType& c, const ArgType& d) {
BaseType __attribute__((aligned(64))) tmp[LanesResult];
store<ArgType, BaseType, LanesArg>(a, &tmp[0], 0);
store<ArgType, BaseType, LanesArg>(b, &tmp[0], LanesArg);
store<ArgType, BaseType, LanesArg>(c, &tmp[0], 2 * LanesArg);
store<ArgType, BaseType, LanesArg>(d, &tmp[0], 3 * LanesArg);
return *((ResultType *)tmp);
}
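// Illustrative lane layout of concat (a sketch): the arguments are placed back
// to back, so for the two-argument form
//   concat<int32x32_t, int32x16_t, int32_t, 32, 16>(a, b)
// lanes 0..15 of the result come from a and lanes 16..31 come from b.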
template <>
HALIDE_ALWAYS_INLINE int32x32_t concat<int32x32_t, int32x16_t, int32_t, 32, 16>(const int32x16_t& a, const int32x16_t& b) {
return int32x32_t(int32x32_t::from_native_vector, a, b);
}
template <>
HALIDE_ALWAYS_INLINE int32x64_t concat<int32x64_t, int32x16_t, int32_t, 64, 16>(const int32x16_t& a, const int32x16_t& b, const int32x16_t& c, const int32x16_t& d) {
return int32x64_t(int32x64_t::from_native_vector, a, b, c, d);
}
template <>
HALIDE_ALWAYS_INLINE int16x64_t concat<int16x64_t, int16x32_t, int16_t, 64, 32>(const int16x32_t& a, const int16x32_t& b) {
return int16x64_t(int16x64_t::from_native_vector, a, b);
}
template <>
HALIDE_ALWAYS_INLINE uint16x64_t concat<uint16x64_t, uint16x32_t, uint16_t, 64, 32>(const uint16x32_t& a, const uint16x32_t& b) {
return uint16x64_t(uint16x64_t::from_native_vector, a, b);
}
template <>
HALIDE_ALWAYS_INLINE uint8x128_t concat<uint8x128_t, uint8x64_t, uint8_t, 128, 64>(const uint8x64_t& a, const uint8x64_t& b) {
return uint8x128_t(uint8x128_t::from_native_vector, a, b);
}
template <>
HALIDE_ALWAYS_INLINE float32x32_t concat<float32x32_t, float32x16_t, float, 32, 16>(const float32x16_t& a, const float32x16_t& b) {
return float32x32_t(float32x32_t::from_native_vector, a, b);
}
template <>
HALIDE_ALWAYS_INLINE int24x128_t concat<int24x128_t, int24x64_t, int24_t, 128, 64>(const int24x64_t& a, const int24x64_t& b) {
return int24x128_t(int24x128_t::from_native_vector, a, b);
}
template <typename VectorTypeFrom, typename VectorTypeTo, typename BaseType, int LanesFrom, int LanesTo>
HALIDE_ALWAYS_INLINE VectorTypeTo halide_xtensa_pad_to_native(const VectorTypeFrom& a, int lanes) {
BaseType __attribute__((aligned(64))) tmp[LanesTo];
store<VectorTypeFrom, BaseType, LanesFrom>(a, tmp, 0);
return load<VectorTypeTo, BaseType, LanesTo>(tmp, 0);
}
template <typename VectorTypeFrom, typename VectorTypeTo, typename BaseType, int LanesFrom, int LanesTo>
HALIDE_ALWAYS_INLINE VectorTypeTo halide_xtensa_slice_from_padded(const VectorTypeFrom& a, int lanes) {
BaseType __attribute__((aligned(64))) tmp[LanesFrom];
store<VectorTypeFrom, BaseType, LanesFrom>(a, tmp, 0);
return load<VectorTypeTo, BaseType, LanesTo>(tmp, 0);
}
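// Illustrative round trip (a sketch): pad_to_native widens a short vector to a
// full native register (the padding lanes carry no meaningful data) and
// slice_from_padded drops them again, e.g. for a 16-lane bool mask m:
//   uint1x32_t padded = halide_xtensa_pad_to_native<uint1x16_t, uint1x32_t, bool, 16, 32>(m, 16);
// The generic versions above go through a small aligned stack buffer, while the
// bool specializations below use the IVP_JOINB* register forms directly.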
template <>
HALIDE_ALWAYS_INLINE uint1x32_t halide_xtensa_pad_to_native<uint1x16_t, uint1x32_t, bool, 16, 32>(const uint1x16_t& a, int lanes) {
return IVP_JOINBN_2(a, a);
}
template <>
HALIDE_ALWAYS_INLINE uint1x64_t halide_xtensa_pad_to_native<uint1x32_t, uint1x64_t, bool, 32, 64>(const uint1x32_t& a, int lanes) {
return IVP_JOINBN(a, a);
}
template <>
HALIDE_ALWAYS_INLINE uint1x64_t halide_xtensa_pad_to_native<uint1x16_t, uint1x64_t, bool, 16, 64>(const uint1x16_t& a, int lanes) {
return IVP_JOINBN(IVP_JOINBN_2(a, a), IVP_JOINBN_2(a, a));
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int8x4_t load<int8x4_t, int8_t, 4>(const void *base, int32_t offset) {
return *((const int8x4_t*)((const int8_t*)base + offset));
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED uint8x4_t load<uint8x4_t, uint8_t, 4>(const void *base, int32_t offset) {
return *((const uint8x4_t*)((const uint8_t*)base + offset));
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED uint8x64_t load<uint8x64_t, uint8_t, 64>(const void *base, int32_t offset) {
uint8x64_t r;
const xb_vec2Nx8U* __restrict ptr = (const xb_vec2Nx8U*)((const uint8_t*)base + offset);
IVP_L2U2NX8U_XP(r, ptr, 0);
return r;
}
template<>
HALIDE_ALWAYS_INLINE void store<int8x64_t, int8_t, 64>(const int8x64_t& a, void *base, int32_t offset) {
valign align;
xb_vec2Nx8* __restrict ptr = (xb_vec2Nx8*)((int8_t*)base + offset);
IVP_SA2NX8_IP(a, align, ptr);
IVP_SAPOS2NX8_FP(align, ptr);
}
template<>
HALIDE_ALWAYS_INLINE void store<uint8x64_t, uint8_t, 64>(const uint8x64_t& a, void *base, int32_t offset) {
valign align;
xb_vec2Nx8U* __restrict ptr = (xb_vec2Nx8U*)((uint8_t*)base + offset);
IVP_SA2NX8U_IP(a, align, ptr);
IVP_SAPOS2NX8U_FP(align, ptr);
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int16x32_t load<int16x32_t, int16_t, 32>(const void *base, int32_t offset) {
xb_vecNx16 r;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const int16_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LANX16_IP(r, align, (const xb_vecNx16*)ptr8);
return r;
}
template<>
HALIDE_ALWAYS_INLINE void store<int16x32_t, int16_t, 32>(const int16x32_t& a, void *base, int32_t offset) {
valign align;
xb_vecNx16* ptr = (xb_vecNx16*)((int16_t*)base + offset);
IVP_SANX16_IP(a, align, ptr);
// Flush alignment register.
IVP_SAPOSNX16_FP(align, ptr);
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED uint16x32_t load<uint16x32_t, uint16_t, 32>(const void *base, int32_t offset) {
xb_vecNx16U r;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const uint16_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LANX16U_IP(r, align, (const xb_vecNx16U*)ptr8);
return r;
}
template<>
HALIDE_ALWAYS_INLINE void store<uint16x32_t, uint16_t, 32>(const uint16x32_t& a, void *base, int32_t offset) {
valign align;
xb_vecNx16U* ptr = (xb_vecNx16U*)((uint16_t*)base + offset);
IVP_SANX16U_IP(a, align, ptr);
IVP_SAPOSNX16U_FP(align, ptr);
}
// This specialization seems to be buggy, so it is disabled for now.
/*
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int16x64_t load<int16x64_t, int16_t, 64>(const void *base, int32_t offset) {
xb_vecNx16 r1, r2;
const xb_vecNx16* ptr = (const xb_vecNx16*)((const int16_t*)base + offset);
IVP_L2UNX16_XP(r1, ptr, 0);
ptr++;
IVP_L2UNX16_XP(r2, ptr, 0);
return int16x64_t(int16x64_t::from_native_vector, r1, r2);
}
*/
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int32x32_t load<int32x32_t, int32_t, 32>(const void *base, int32_t offset) {
xb_vecN_2x32v nv8_0, nv8_1;
const xb_vecN_2x32v* __restrict ptr = (const xb_vecN_2x32v*)((const int32_t*)base + offset);
valign align = IVP_LA_PP((const xb_vec2Nx8 *)ptr);
IVP_LAN_2X32_IP(nv8_0, align, ptr);
IVP_LAN_2X32_IP(nv8_1, align, ptr);
return int32x32_t(int32x32_t::from_native_vector, nv8_0, nv8_1);
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int32x64_t load<int32x64_t, int32_t, 64>(const void *base, int32_t offset) {
xb_vecN_2x32v nv8_0, nv8_1, nv8_2, nv8_3;
const xb_vecN_2x32v* __restrict ptr = (const xb_vecN_2x32v*)((const int32_t*)base + offset);
valign align = IVP_LA_PP((const xb_vec2Nx8 *)ptr);
IVP_LAN_2X32_IP(nv8_0, align, ptr);
IVP_LAN_2X32_IP(nv8_1, align, ptr);
IVP_LAN_2X32_IP(nv8_2, align, ptr);
IVP_LAN_2X32_IP(nv8_3, align, ptr);
return int32x64_t(int32x64_t::from_native_vector, nv8_0, nv8_1, nv8_2, nv8_3);
}
template <typename ResultType, typename LoadType>
HALIDE_ALWAYS_INLINE ResultType widening_load(const void *base, int32_t offset) = delete;
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int16x32_t widening_load<int16x32_t, uint8_t>(const void *base, int32_t offset) {
xb_vecNx16 r;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const uint8_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LANX8U_IP(r, align, (const xb_vecNx8U*)ptr8);
return r;
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int16x64_t widening_load<int16x64_t, uint8_t>(const void *base, int32_t offset) {
xb_vecNx16 r1, r2;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const uint8_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LANX8U_IP(r1, align, (const xb_vecNx8U*)ptr8);
// Pointer is automatically incremented by the previous call.
IVP_LANX8U_IP(r2, align, (const xb_vecNx8U*)ptr8);
return int16x64_t(int16x64_t::from_native_vector, r1, r2);
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED uint16x64_t widening_load<uint16x64_t, uint8_t>(const void *base, int32_t offset) {
xb_vecNx16 r1, r2;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const uint8_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LANX8U_IP(r1, align, (const xb_vecNx8U*)ptr8);
// Pointer is automatically incremented by the previous call.
IVP_LANX8U_IP(r2, align, (const xb_vecNx8U*)ptr8);
return uint16x64_t(uint16x64_t::from_native_vector, r1, r2);
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int32x16_t widening_load<int32x16_t, int16_t>(const void *base, int32_t offset) {
int32x16_t r1;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const int16_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LAN_2X16S_IP(r1, align, (const xb_vecN_2x16*)ptr8);
return r1;
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int32x32_t widening_load<int32x32_t, int16_t>(const void *base, int32_t offset) {
int32x16_t r1, r2;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const int16_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LAN_2X16S_IP(r1, align, (const xb_vecN_2x16*)ptr8);
// Pointer is automatically incremented by the previous call.
IVP_LAN_2X16S_IP(r2, align, (const xb_vecN_2x16*)ptr8);
return int32x32_t(int32x32_t::from_native_vector, r1, r2);
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int32x32_t widening_load<int32x32_t, uint16_t>(const void *base, int32_t offset) {
int32x16_t r1, r2;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const uint16_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LAN_2X16U_IP(r1, align, (const xb_vecN_2x16U*)ptr8);
// Pointer is automatically incremented by the previous call.
IVP_LAN_2X16U_IP(r2, align, (const xb_vecN_2x16U*)ptr8);
return int32x32_t(int32x32_t::from_native_vector, r1, r2);
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED uint32x32_t widening_load<uint32x32_t, uint16_t>(const void *base, int32_t offset) {
uint32x16_t r1, r2;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const uint16_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LAN_2X16U_IP(r1, align, (const xb_vecN_2x16U*)ptr8);
// Pointer is automatically incremented by the previous call.
IVP_LAN_2X16U_IP(r2, align, (const xb_vecN_2x16U*)ptr8);
return uint32x32_t(uint32x32_t::from_native_vector, r1, r2);
}
template<>
HALIDE_ALWAYS_INLINE HALIDE_MAYBE_UNUSED int32x64_t widening_load<int32x64_t, uint16_t>(const void *base, int32_t offset) {
int32x16_t r1, r2, r3, r4;
const xb_vec2Nx8* __restrict ptr8 = (const xb_vec2Nx8*)((const uint16_t*)base + offset);
valign align = IVP_LA_PP(ptr8);
IVP_LAN_2X16U_IP(r1, align, (const xb_vecN_2x16U*)ptr8);
// Pointer is automatically incremented by the previous call.
IVP_LAN_2X16U_IP(r2, align, (const xb_vecN_2x16U*)ptr8);
IVP_LAN_2X16U_IP(r3, align, (const xb_vecN_2x16U*)ptr8);
IVP_LAN_2X16U_IP(r4, align, (const xb_vecN_2x16U*)ptr8);
return int32x64_t(int32x64_t::from_native_vector, r1, r2, r3, r4);
}
template <typename VectorType, typename BaseType, int Lanes>
HALIDE_ALWAYS_INLINE void store_narrowing(const VectorType& a, void *base, int32_t offset) = delete;
template<>
HALIDE_ALWAYS_INLINE void store_narrowing<int16x32_t, uint8_t, 32>(const int16x32_t& a, void *base, int32_t offset) {
valign align;
xb_vecNx8U* __restrict ptr = (xb_vecNx8U*)((uint8_t*)base + offset);
IVP_SANX8U_IP(a, align, ptr);
IVP_SAPOSNX8U_FP(align, ptr);
}
HALIDE_ALWAYS_INLINE int16x64_t halide_xtensa_interleave_i16(const int16x32_t& a, const int16x32_t& b) {
return int16x64_t(int16x64_t::from_native_vector,
IVP_SELNX16I(b, a, IVP_SELI_16B_INTERLEAVE_1_LO),
IVP_SELNX16I(b, a, IVP_SELI_16B_INTERLEAVE_1_HI)
);
}
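// Illustrative lane pattern (a sketch): the two-argument interleaves produce
//   a[0], b[0], a[1], b[1], ..., a[31], b[31]
// spread across the two native halves of the result; the four-argument
// variants below extend this to a[0], b[0], c[0], d[0], a[1], ... .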
HALIDE_ALWAYS_INLINE uint16x64_t halide_xtensa_interleave_u16(const uint16x32_t& a, const uint16x32_t& b) {
return uint16x64_t(uint16x64_t::from_native_vector,
IVP_SELNX16UI(b, a, IVP_SELI_16B_INTERLEAVE_1_LO),
IVP_SELNX16UI(b, a, IVP_SELI_16B_INTERLEAVE_1_HI)
);
}
HALIDE_ALWAYS_INLINE uint16x128_t halide_xtensa_interleave_u16(const uint16x32_t& a, const uint16x32_t& b, const uint16x32_t& c, const uint16x32_t& d) {
const uint16x32_t ab0 = IVP_SELNX16UI(b, a, IVP_SELI_16B_INTERLEAVE_1_LO);
const uint16x32_t ab1 = IVP_SELNX16UI(b, a, IVP_SELI_16B_INTERLEAVE_1_HI);
const uint16x32_t cd0 = IVP_SELNX16UI(d, c, IVP_SELI_16B_INTERLEAVE_1_LO);
const uint16x32_t cd1 = IVP_SELNX16UI(d, c, IVP_SELI_16B_INTERLEAVE_1_HI);
return uint16x128_t(uint16x128_t::from_native_vector,
IVP_SELNX16UI(cd0, ab0, IVP_SELI_16B_INTERLEAVE_2_LO),
IVP_SELNX16UI(cd0, ab0, IVP_SELI_16B_INTERLEAVE_2_HI),
IVP_SELNX16UI(cd1, ab1, IVP_SELI_16B_INTERLEAVE_2_LO),
IVP_SELNX16UI(cd1, ab1, IVP_SELI_16B_INTERLEAVE_2_HI));
}
HALIDE_ALWAYS_INLINE uint8x128_t halide_xtensa_interleave_u8(const uint8x64_t& a, const uint8x64_t& b) {
return uint8x128_t(uint8x128_t::from_native_vector,
IVP_SEL2NX8UI(b, a, IVP_SELI_8B_INTERLEAVE_1_LO),
IVP_SEL2NX8UI(b, a, IVP_SELI_8B_INTERLEAVE_1_HI)
);
}
HALIDE_ALWAYS_INLINE uint8x256_t halide_xtensa_interleave_u8(const uint8x64_t& a, const uint8x64_t& b, const uint8x64_t& c, const uint8x64_t& d) {
const uint8x64_t ab0 = IVP_SEL2NX8UI(b, a, IVP_SELI_8B_INTERLEAVE_1_LO);
const uint8x64_t ab1 = IVP_SEL2NX8UI(b, a, IVP_SELI_8B_INTERLEAVE_1_HI);
const uint8x64_t cd0 = IVP_SEL2NX8UI(d, c, IVP_SELI_8B_INTERLEAVE_1_LO);
const uint8x64_t cd1 = IVP_SEL2NX8UI(d, c, IVP_SELI_8B_INTERLEAVE_1_HI);
return uint8x256_t(uint8x256_t::from_native_vector,
IVP_SEL2NX8UI(cd0, ab0, IVP_SELI_8B_INTERLEAVE_2_LO),
IVP_SEL2NX8UI(cd0, ab0, IVP_SELI_8B_INTERLEAVE_2_HI),
IVP_SEL2NX8UI(cd1, ab1, IVP_SELI_8B_INTERLEAVE_2_LO),
IVP_SEL2NX8UI(cd1, ab1, IVP_SELI_8B_INTERLEAVE_2_HI));
}
HALIDE_ALWAYS_INLINE uint1x256_t halide_xtensa_interleave_u1(const uint1x64_t& a, const uint1x64_t& b, const uint1x64_t& c, const uint1x64_t& d) {
uint8x64_t a8 = 0, b8 = 0, c8 = 0, d8 = 0;
IVP_INJBI2NX8(a8, a, 0);
IVP_INJBI2NX8(b8, b, 0);
IVP_INJBI2NX8(c8, c, 0);
IVP_INJBI2NX8(d8, d, 0);
uint8x256_t interleaved8 = halide_xtensa_interleave_u8(a8, b8, c8, d8);
uint1x64_t ra = IVP_EXTBI2NX8(interleaved8.native_vector[0], 0);
uint1x64_t rb = IVP_EXTBI2NX8(interleaved8.native_vector[1], 0);
uint1x64_t rc = IVP_EXTBI2NX8(interleaved8.native_vector[2], 0);
uint1x64_t rd = IVP_EXTBI2NX8(interleaved8.native_vector[3], 0);
return uint1x256_t(uint1x256_t::from_native_vector, ra, rb, rc, rd);
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_extract_0_off_3_u8(const uint8x64_t& a0, const uint8x64_t& a1, const uint8x64_t& a2) {
// TODO(vksnk): there is likely a better way to do it.
uint8x64_t vR, vG, vB, vRG0, vRG1;
IVP_DSEL2NX8UI(vB, vRG0, a1, a0, IVP_DSELI_8B_DEINTERLEAVE_C3_STEP_0);
IVP_DSEL2NX8UI_H(vB, vRG1, a2, a1, IVP_DSELI_8B_DEINTERLEAVE_C3_STEP_1);
IVP_DSEL2NX8UI(vG, vR, vRG1, vRG0, IVP_DSELI_8B_DEINTERLEAVE_1);
return vR;
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_extract_0_off_3_u8(const uint8x192_t& a) {
return halide_xtensa_extract_0_off_3_u8(a.native_vector[0], a.native_vector[1], a.native_vector[2]);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_deinterleave_even_i16(const int16x64_t& a) {
return IVP_SELNX16I(a.native_vector[1], a.native_vector[0], IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_deinterleave_odd_i16(const int16x64_t& a) {
return IVP_SELNX16I(a.native_vector[1], a.native_vector[0], IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1);
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_deinterleave_even_u16(const uint16x64_t& a) {
return IVP_SELNX16UI(a.native_vector[1], a.native_vector[0], IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_deinterleave_odd_u16(const uint16x64_t& a) {
return IVP_SELNX16UI(a.native_vector[1], a.native_vector[0], IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_slice_i16(const int16x64_t& a, int start) {
return IVP_SELNX16(a.native_vector[1], a.native_vector[0], IVP_SEQNX16() + int16x32_t(start));
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_slice_u16(const uint16x64_t& a, int start) {
return IVP_SELNX16U(a.native_vector[1], a.native_vector[0], IVP_SEQNX16() + int16x32_t(start));
}
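// Illustrative semantics (a sketch): deinterleave_even/odd return lanes
// 0, 2, 4, ... and 1, 3, 5, ... of the double-width input respectively, and
// slice_i16/u16(a, start) returns the 32 consecutive lanes a[start..start + 31].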
/*
HALIDE_ALWAYS_INLINE int8x64_t halide_xtensa_deinterleave_even_i8(const int8x128_t& a) {
return IVP_SEL2NX8I(a.native_vector[1], a.native_vector[0], IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE int8x64_t halide_xtensa_deinterleave_odd_i8(const int8x128_t& a) {
return IVP_SEL2NX8I(a.native_vector[1], a.native_vector[0], IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
}
*/
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_deinterleave_even_u8(const uint8x128_t& a) {
return IVP_SEL2NX8UI(a.native_vector[1], a.native_vector[0], IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_deinterleave_odd_u8(const uint8x128_t& a) {
return IVP_SEL2NX8UI(a.native_vector[1], a.native_vector[0], IVP_SELI_8B_EXTRACT_1_OF_2_OFF_1);
}
HALIDE_ALWAYS_INLINE float32x16_t halide_xtensa_slice_f32(const float32x32_t& a, int start) {
return IVP_SELN_2XF32(a.native_vector[1], a.native_vector[0], IVP_ADDN_2X32(IVP_SEQN_2X32(), int32x16_t(start)));
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_dynamic_shuffle(const uint8x64_t& a, const int8x64_t& b) {
return IVP_SHFL2NX8U(a, b);
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_dynamic_shuffle(const uint8x128_t& a, const int8x64_t& b) {
return IVP_SEL2NX8(a.native_vector[1], a.native_vector[0], b);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_dynamic_shuffle(const int16x32_t& a, const int16x32_t& b) {
return IVP_SHFLNX16(a, b);
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_dynamic_shuffle(const uint16x32_t& a, const int16x32_t& b) {
return IVP_SHFLNX16U(a, b);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_dynamic_shuffle(const int16x64_t& a, const int16x32_t& b) {
return IVP_SELNX16(a.native_vector[1], a.native_vector[0], b);
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_dynamic_shuffle(const uint16x64_t& a, const int16x32_t& b) {
return IVP_SELNX16U(a.native_vector[1], a.native_vector[0], b);
}
HALIDE_ALWAYS_INLINE int16x64_t halide_xtensa_dynamic_shuffle(const int16x64_t& a, const int16x64_t& b) {
return int16x64_t(int16x64_t::from_native_vector,
IVP_SELNX16(a.native_vector[1], a.native_vector[0], b.native_vector[0]),
IVP_SELNX16(a.native_vector[1], a.native_vector[0], b.native_vector[1])
);
}
HALIDE_ALWAYS_INLINE uint16x64_t halide_xtensa_dynamic_shuffle(const uint16x64_t& a, const int16x64_t& b) {
return uint16x64_t(uint16x64_t::from_native_vector,
IVP_SELNX16U(a.native_vector[1], a.native_vector[0], b.native_vector[0]),
IVP_SELNX16U(a.native_vector[1], a.native_vector[0], b.native_vector[1])
);
}
HALIDE_ALWAYS_INLINE float32x16_t halide_xtensa_dynamic_shuffle(const float32x16_t& a, const int32x16_t& b) {
return IVP_SHFLN_2XF32(a, b);
}
HALIDE_ALWAYS_INLINE float32x16_t halide_xtensa_dynamic_shuffle(const float32x32_t& a, const int32x16_t& b) {
return IVP_SELN_2XF32(a.native_vector[1], a.native_vector[0], b);
}
HALIDE_ALWAYS_INLINE int32x16_t halide_xtensa_sat_add_i32(const int32x16_t& a,
const int32x16_t& b) {
// I am not 100% sure about this.
xb_vecN_2x32v one = 1;
xb_vecN_2x64w l0 = IVP_MULN_2X32(a, one);
IVP_MULAN_2X32(l0, b, one);
return IVP_PACKVRN_2X64W(l0, 0);
}
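// Illustrative reasoning (a sketch): the sum is formed in the 64-bit wide
// accumulator as a * 1 + b * 1 and then packed back to 32 bits with a
// saturating pack, so, e.g., 0x7fffffff + 1 should clamp to 0x7fffffff
// rather than wrapping to 0x80000000.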
HALIDE_ALWAYS_INLINE int32x32_t halide_xtensa_sat_add_i32(const int32x32_t& a,
const int32x32_t& b) {
// I am not 100% sure about this.
xb_vecN_2x32v zero = 0;
xb_vecN_2x32v one = 1;
xb_vecN_2x64w l0 = a.native_vector[0] * one;
IVP_MULAN_2X32(l0, b.native_vector[0], one);
xb_vecN_2x64w l1 = a.native_vector[1] * one;
IVP_MULAN_2X32(l1, b.native_vector[1], one);
return int32x32_t(int32x32_t::from_native_vector, IVP_PACKVN_2X64W(l0, zero), IVP_PACKVN_2X64W(l1, zero));
//return a + b;
/*
// determine the lower or upper bound of the result
//int64_t ret = (x < 0) ? INT64_MIN : INT64_MAX;
int32x32_t ret = int32x32_t::select(a < int32x32_t::broadcast(0),
int32x32_t::broadcast(INT32_MIN),
int32x32_t::broadcast(INT32_MAX));
// this is always well defined:
// if x < 0 this adds a positive value to INT64_MIN
// if x > 0 this subtracts a positive value from INT64_MAX
int32x32_t comp = ret - a;
// the condition is equivalent to
// ((x < 0) && (y > comp)) || ((x >=0) && (y <= comp))
//if ((x < 0) == (y > comp)) ret = x + y;
ret = int32x32_t::select(IVP_NOTBN(IVP_XORBN(a < int32x32_t::broadcast(0), comp <= b)), a + b, ret);
return ret;
*/
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_pred_add_i16(const int16x32_t& a, const uint1x32_t& p, const int16x32_t& b, const int16x32_t& c) {
int16x32_t r = a;
IVP_ADDNX16T(r, b, c, p);
return r;
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_pred_sub_i16(const int16x32_t& a, const uint1x32_t& p, const int16x32_t& b, const int16x32_t& c) {
int16x32_t r = a;
IVP_SUBNX16T(r, b, c, p);
return r;
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_pred_max_i16(const int16x32_t& a, const uint1x32_t& p, const int16x32_t& b, const int16x32_t& c) {
int16x32_t r = a;
IVP_MAXNX16T(r, b, c, p);
return r;
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_pred_min_i16(const int16x32_t& a, const uint1x32_t& p, const int16x32_t& b, const int16x32_t& c) {
int16x32_t r = a;
IVP_MINNX16T(r, b, c, p);
return r;
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_pred_sat_add_i16(const uint1x32_t& p, const int16x32_t& b, const int16x32_t& c, const int16x32_t& a) {
int16x32_t r = a;
IVP_ADDSNX16T(r, b, c, p);
return r;
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_pred_sat_sub_i16(const int16x32_t& a, const uint1x32_t& p, const int16x32_t& b, const int16x32_t& c) {
int16x32_t r = a;
IVP_SUBSNX16T(r, b, c, p);
return r;
}
HALIDE_ALWAYS_INLINE int64x16_t halide_xtensa_widen_mul_i64(const int32x16_t& a, const int32x16_t& b) {
return IVP_MULN_2X32(a, b);
}
HALIDE_ALWAYS_INLINE int64x16_t halide_xtensa_widen_mul_add_i64(const int32x16_t& a, const int32x16_t& b, const int32x16_t& c) {
xb_vecN_2x64w r = IVP_MULN_2X32(c, int32x16_t(1));
IVP_MULAN_2X32(r, a, b);
return r;
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_mul_add_i48(const int48x32_t& a, const int16x32_t& b, const int16x32_t& c) {
int48x32_t r = a;
IVP_MULANX16(r, b, c);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_mul_add_u24(const int24x64_t& a, const uint8x64_t& b, const uint8x64_t& c) {
int24x64_t r = a;
IVP_MULUUA2NX8(r, b, c);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_mul_add_i24(const int24x64_t& a, const int8x64_t& b, const int8x64_t& c) {
int24x64_t r = a;
IVP_MULA2NX8(r, b, c);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_quad_mul_add_i24(
const int24x64_t& acc,
const int8x64_t& a0,
const int8_t& s0,
const int8x64_t& a1,
const int8_t& s1,
const int8x64_t& a2,
const int8_t& s2,
const int8x64_t& a3,
const int8_t& s3
) {
int24x64_t r = acc;
const int8_t scalar_coef[] = {s3, s2, s1, s0};
const xb_int32pr * __restrict coef = (const xb_int32pr*)scalar_coef;
IVP_MULQA2N8XR8(r, a0, a1, a2, a3, coef[0]);
return r;
}
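// Illustrative note (a sketch): the quad multiply-accumulate adds, per lane,
// the products of the four 8-bit input vectors with the four 8-bit scalars
// packed into a single xb_int32pr coefficient word, accumulating into the
// 24-bit accumulator; that packing is why the overload above builds
// {s3, s2, s1, s0} before calling IVP_MULQA2N8XR8.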
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_quad_mul_add_i24(
const int24x64_t& acc,
const int8x64_t& a0,
const int8x64_t& a1,
const int8x64_t& a2,
const int8x64_t& a3,
const int8x4_t& s
) {
int24x64_t r = acc;
IVP_MULQA2N8XR8(r, a3, a2, a1, a0, s);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_quad_mul_add_i24(
const int24x64_t& acc,
const int8x256_t& a,
const int8x4_t& s
) {
int24x64_t r = acc;
IVP_MULQA2N8XR8(r, a.native_vector[3], a.native_vector[2], a.native_vector[1], a.native_vector[0], s);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_quad_mul_add_u24(
const int24x64_t& acc,
const uint8x64_t& a0,
const uint8x64_t& a1,
const uint8x64_t& a2,
const uint8x64_t& a3,
const uint8x4_t& s
) {
int24x64_t r = acc;
IVP_MULUUQA2N8XR8(r, a3, a2, a1, a0, s);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_quad_mul_add_u24(
const int24x64_t& acc,
const uint8x256_t& a,
const uint8x4_t& s
) {
int24x64_t r = acc;
IVP_MULUUQA2N8XR8(r, a.native_vector[3], a.native_vector[2], a.native_vector[1], a.native_vector[0], s);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_quad_mul_add_by_scalar_u24(
const int24x64_t& acc,
const uint8x256_t& a,
const uint8_t& s
) {
const xb_int32pr coef = s | (s << 8) | (s << 16) | (s << 24);
int24x64_t r = acc;
IVP_MULUUQA2N8XR8(r, a.native_vector[3], a.native_vector[2], a.native_vector[1], a.native_vector[0], coef);
return r;
}
HALIDE_ALWAYS_INLINE int24x128_t halide_xtensa_dual_widen_quad_mul_add_i24(
const int24x128_t& acc,
const int8x256_t& a,
const int8x8_t& s) {
int24x128_t r(acc);
IVP_DMULQA2N8XR8(r.native_vector[1], r.native_vector[0], a.native_vector[3], a.native_vector[2], a.native_vector[1], a.native_vector[0], s);
return r;
}
HALIDE_ALWAYS_INLINE int24x128_t halide_xtensa_dual_widen_quad_mul_add_u24(
const int24x128_t& acc,
const uint8x256_t& a,
const uint8x8_t& s) {
int24x128_t r(acc);
IVP_DMULUUQA2N8XR8(r.native_vector[1], r.native_vector[0], a.native_vector[3], a.native_vector[2], a.native_vector[1], a.native_vector[0], s);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_pair_mul_i24(const int8x64_t& a, const int8x64_t& b,
const int8x64_t& c, const int8x64_t& d) {
return IVP_MULP2NX8(a, b, c, d);
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_pair_mul_add_i24(const int24x64_t& a, const int8x64_t& b,
const int8x64_t& c, const int8x64_t& d, const int8x64_t& e) {
int24x64_t r = a;
IVP_MULPA2NX8(r, b, c, d, e);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_pair_mul_add_u24(const int24x64_t& a, const uint8x64_t& b,
const uint8x64_t& c, const uint8x64_t& d, const uint8x64_t& e) {
int24x64_t r = a;
IVP_MULUUPA2NX8(r, b, c, d, e);
return r;
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_pair_mul_u24(const uint8x64_t& a, const uint8x64_t& b,
const uint8x64_t& c, const uint8x64_t& d) {
return IVP_MULUUP2NX8(a, b, c, d);
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_pair_mul_i48(const int16x32_t& a, const int16x32_t& b,
const int16x32_t& c, const int16x32_t& d) {
return IVP_MULPNX16(a, b, c, d);
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_pair_mul_add_i48(const int48x32_t& a, const int16x32_t& b,
const int16x32_t& c, const int16x32_t& d, const int16x32_t& e) {
int48x32_t r = a;
IVP_MULPANX16(r, b, c, d, e);
return r;
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_pair_mul_u48(const uint16x32_t& a, const uint16x32_t& b,
const uint16x32_t& c, const uint16x32_t& d) {
return IVP_MULUUPNX16(a, b, c, d);
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_mul_add_by_diff_u24(const int24x64_t& a, const uint8x64_t& d1,
const uint8x64_t& d2, const uint8x64_t& c) {
int24x64_t r = a;
IVP_MULUUPDA2NX8(r, d1, c, d2, c);
return r;
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_add_i48(const int16x32_t& a, const int16x32_t& b) {
return IVP_ADDWNX16(a, b);
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_add_i48(const int48x32_t& a, const int16x32_t& b) {
int48x32_t r = a;
IVP_ADDWANX16(r, b, int16x32_t(0));
return r;
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_pair_add_i48(const int48x32_t& a, const int16x32_t& b, const int16x32_t& c) {
int48x32_t r = a;
IVP_ADDWANX16(r, b, c);
return r;
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_add_u48(const uint16x32_t& a, const uint16x32_t& b) {
return IVP_ADDWUNX16U(a, b);
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_add_u48(const int48x32_t& a, const uint16x32_t& b) {
int48x32_t r = a;
IVP_ADDWUANX16U(r, b, uint16x32_t(0));
return r;
}
HALIDE_ALWAYS_INLINE int48x32_t halide_xtensa_widen_pair_add_u48(const int48x32_t& a, const uint16x32_t& b, const uint16x32_t& c) {
int48x32_t r = a;
IVP_ADDWUANX16U(r, b, c);
return r;
}
/*
Disabled for now.
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_mul_vu8_si16_i24(const uint8x64_t& a, const int16_t& b) {
return IVP_MULUS2N8XR16(a, b);
}
// TODO(vksnk): The one below is incorrect:
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_pair_mul_vu8_si16_i24(
const uint8x64_t& a, const int16_t& b,
const uint8x64_t& c, const int16_t& d) {
return IVP_MULUSP2N8XR16(a, c, (b << 16) | d);
}
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_mul_add_vu8_si16_i24(const int24x64_t& a, const uint8x64_t& b, const int16_t& c) {
int24x64_t r = a;
IVP_MULUSA2N8XR16(r, b, c);
return r;
}
*/
HALIDE_ALWAYS_INLINE int24x64_t halide_xtensa_widen_add_i24(const int24x64_t& a, const int8x64_t& b) {
int24x64_t r = a;
IVP_ADDWA2NX8(r, b, int8x64_t(0));
return r;
}
HALIDE_ALWAYS_INLINE int8x64_t halide_xtensa_sat_narrow_i24x_with_shift_i8(const int24x64_t& a, int shift) {
return IVP_PACKVRNR2NX24(a, shift);
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_sat_narrow_i24x_with_shift_u8(const int24x64_t& a, int shift) {
return xb_vec2Nx8_rtor_xb_vec2Nx8U(IVP_PACKVRNR2NX24(a, shift));
}
HALIDE_ALWAYS_INLINE int16x64_t halide_xtensa_narrow_i24_with_shift_i16(const int24x64_t& a, int shift) {
int16x32_t even = xb_vecNx16U_rtor_xb_vecNx16(IVP_PACKVRNR2NX24_0(a, shift));
int16x32_t odd = xb_vecNx16U_rtor_xb_vecNx16(IVP_PACKVRNR2NX24_1(a, shift));
int16x64_t r;
IVP_DSELNX16I(r.native_vector[1], r.native_vector[0], odd, even, IVP_DSELI_INTERLEAVE_1);
return r;
}
HALIDE_ALWAYS_INLINE int8x64_t halide_xtensa_narrow_i24_with_shift_i8(const int24x64_t& a, int shift) {
return IVP_PACKVR2NX24(a, shift);
}
HALIDE_ALWAYS_INLINE int32x32_t halide_xtensa_narrow_i48_with_shift_i32(const int48x32_t& a, int shift) {
int32x16_t even = IVP_PACKVRNRNX48_0(a, shift);
int32x16_t odd = IVP_PACKVRNRNX48_1(a, shift);
int32x32_t r;
IVP_DSELN_2X32I(r.native_vector[1], r.native_vector[0], odd, even, IVP_DSELI_INTERLEAVE_2);
return r;
}
HALIDE_ALWAYS_INLINE uint32x32_t halide_xtensa_narrow_i48_with_shift_u32(const int48x32_t& a, int shift) {
uint32x16_t even = IVP_PACKVRNRNX48_0(a, shift);
uint32x16_t odd = IVP_PACKVRNRNX48_1(a, shift);
uint32x32_t r;
IVP_DSELN_2X32UI(r.native_vector[1], r.native_vector[0], odd, even, IVP_DSELI_INTERLEAVE_2);
return r;
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_narrow_i48_with_shift_u16(const int48x32_t& a, int shift) {
return xb_vecNx16_rtor_xb_vecNx16U(IVP_PACKVRNRNX48(a, shift));
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_narrow_with_shift_i16(const int32x32_t& a, int shift) {
xb_vecNx48 wide = IVP_CVT48SNX32(a.native_vector[1], a.native_vector[0]);
return IVP_PACKVRNRNX48(wide, shift);
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_narrow_with_shift_u16(const int32x32_t& a, int shift) {
xb_vecNx48 wide = IVP_CVT48SNX32(a.native_vector[1], a.native_vector[0]);
return xb_vecNx16_rtor_xb_vecNx16U(IVP_PACKVRNRNX48(wide, shift));
}
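// Illustrative semantics (a sketch): the narrow_with_shift helpers take a wide
// accumulator and produce narrower lanes, roughly
//   out[i] = narrow(in[i] >> shift)
// where the various IVP_PACKVR*/IVP_PACKVRNR* forms differ in how they round
// and saturate before narrowing.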
HALIDE_ALWAYS_INLINE int32x16_t halide_xtensa_narrow_high_i32(const int64x16_t& a) {
return IVP_PACKHN_2X64W(a);
}
HALIDE_ALWAYS_INLINE int32x16_t halide_xtensa_sat_narrow_shift_i32(const int64x16_t& a, int shift) {
return IVP_PACKVN_2X64W(a, shift);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_narrow_clz_i16(const int32x32_t& a) {
xb_vec2Nx24 wide = IVP_CVT24UNX32L(IVP_NSAUN_2X32(a.native_vector[1]), IVP_NSAUN_2X32(a.native_vector[0]));
return IVP_CVT16U2NX24L(wide);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_narrow_clz_i16(const uint32x32_t& a) {
xb_vec2Nx24 wide = IVP_CVT24UNX32L(IVP_NSAUN_2X32(a.native_vector[1]), IVP_NSAUN_2X32(a.native_vector[0]));
return IVP_CVT16U2NX24L(wide);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_i48x_clz_i16(const int48x32_t& a) {
xb_vecNx16 clz_lo = IVP_NSAUNX16(IVP_PACKLNX48(a));
xb_vecNx16 clz_hi = IVP_NSAUNX16(IVP_PACKVRNRNX48(a, 16));
IVP_ADDNX16T(clz_hi, clz_hi, clz_lo, clz_hi == xb_vecNx16(16));
return clz_hi;
}
HALIDE_ALWAYS_INLINE uint1x32_t halide_xtensa_i48x_gt_zero(const int48x32_t& b) {
return int16x32_t(0) < IVP_PACKVRNX48(b, 0);
}
HALIDE_ALWAYS_INLINE uint1x32_t halide_xtensa_i16_neq_zero(const int16x32_t& a) {
return IVP_NEQNX16(a, int16x32_t(0));
}
HALIDE_ALWAYS_INLINE int32_t halide_xtensa_full_reduce_add_u8_to_i32(const uint8x64_t& a) {
return xb_int16U_rtor_uint16(IVP_RADDU2NX8(a));
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_lerp_i16(const int16x32_t& a, const int16x32_t& b, uint16_t w) {
// TODO(vksnk): Halide's lerp actually uses the full range, but it's not clear from the
// documentation whether we can pass an unsigned type to IVP_MULPN16XR16, so just to be
// extra careful, reduce it to 14 bits for now.
uint32_t w32 = ((uint32_t(w)) >> 0);
uint32_t alphaMalpha = ((65536 - w32) << 16) | w32;
xb_vecNx48 output = IVP_MULSUPN16XR16(a, b, alphaMalpha);
IVP_DECNEGWNX48(output);
return IVP_PACKVRNX48(output, 16);
}
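// Illustrative intent (a sketch, not an exact description of the intrinsics):
// the lerp above blends a and b with a 16-bit weight, roughly
//   result ~= (a * (65536 - w) + b * w) >> 16   (with rounding),
// accumulating in the 48-bit register before packing back down to 16 bits.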
HALIDE_ALWAYS_INLINE int16x64_t convert_to_int16x64_t_from_int8x64_t(const int8x64_t& src) {
xb_vec2Nx24 wide = src * int8x64_t(1);
return int16x64_t(int16x64_t::from_native_vector,
IVP_CVT16S2NX24L(wide), IVP_CVT16S2NX24H(wide));
}
HALIDE_ALWAYS_INLINE uint16x64_t convert_to_uint16x64_t_from_uint8x64_t(const uint8x64_t& src) {
xb_vec2Nx24 wide = src * uint8x64_t(1);
return uint16x64_t(uint16x64_t::from_native_vector,
IVP_CVT16U2NX24L(wide), IVP_CVT16U2NX24H(wide));
}
HALIDE_ALWAYS_INLINE int16x64_t convert_to_int16x64_t_from_uint8x64_t(const uint8x64_t& src) {
xb_vec2Nx24 wide = src * uint8x64_t(1);
return int16x64_t(int16x64_t::from_native_vector,
IVP_CVT16S2NX24L(wide), IVP_CVT16S2NX24H(wide));
}
HALIDE_ALWAYS_INLINE int16x64_t convert_to_int16x64_t_from_int24x64_t(const int24x64_t& wide) {
return int16x64_t(int16x64_t::from_native_vector,
IVP_CVT16S2NX24L(wide), IVP_CVT16S2NX24H(wide));
}
HALIDE_ALWAYS_INLINE int8x64_t convert_to_int8x64_t_from_int16x64_t(const int16x64_t& src) {
xb_vec2Nx24 wide = IVP_CVT24S2NX16(src.native_vector[1], src.native_vector[0]);
return IVP_PACKL2NX24(wide);
}
HALIDE_ALWAYS_INLINE uint8x64_t convert_to_uint8x64_t_from_int16x64_t(const int16x64_t& src) {
xb_vec2Nx24 wide = IVP_CVT24S2NX16(src.native_vector[1], src.native_vector[0]);
return xb_vec2Nx8_rtor_xb_vec2Nx8U(IVP_PACKL2NX24(wide));
}
HALIDE_ALWAYS_INLINE int8x64_t convert_to_int8x64_t_from_int32x64_t(const int32x64_t& src) {
xb_vec2Nx24 wide = IVP_CVT24UNX32L(src.native_vector[1], src.native_vector[0]);
IVP_CVT24UNX32H(wide, src.native_vector[3], src.native_vector[2]);
return IVP_PACKL2NX24(wide);
}
HALIDE_ALWAYS_INLINE int8x64_t convert_to_int8x64_t_from_uint1x64_t(const uint1x64_t& src) {
return IVP_MOV2NX8T(int8x64_t(1), int8x64_t(0), src);
}
HALIDE_ALWAYS_INLINE uint8x64_t convert_to_uint8x64_t_from_uint1x64_t(const uint1x64_t& src) {
return IVP_MOV2NX8UT(uint8x64_t(1), uint8x64_t(0), src);
}
HALIDE_ALWAYS_INLINE uint8x64_t convert_to_uint8x64_t_from_int32x64_t(const int32x64_t& src) {
xb_vec2Nx24 wide = IVP_CVT24UNX32L(src.native_vector[1], src.native_vector[0]);
IVP_CVT24UNX32H(wide, src.native_vector[3], src.native_vector[2]);
return xb_vec2Nx8_rtor_xb_vec2Nx8U(IVP_PACKL2NX24(wide));
}
HALIDE_ALWAYS_INLINE uint8x64_t convert_to_uint8x64_t_from_uint16x64_t(const uint16x64_t& src) {
xb_vec2Nx24 wide = IVP_CVT24U2NX16(src.native_vector[1], src.native_vector[0]);
return xb_vec2Nx8_rtor_xb_vec2Nx8U(IVP_PACKL2NX24(wide));
}
HALIDE_ALWAYS_INLINE int16x32_t convert_to_int16x32_t_from_uint1x32_t(const uint1x32_t& src) {
return IVP_MOVNX16T(int16x32_t(1), int16x32_t(0), src);
}
HALIDE_ALWAYS_INLINE int16x32_t convert_to_int16x32_t_from_int32x32_t(const int32x32_t& src) {
return IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32(src.native_vector[1]),
IVP_MOVNX16_FROMN_2X32(src.native_vector[0]),
IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE int48x32_t convert_to_int48x32_t_from_int32x32_t(const int32x32_t& src) {
return IVP_CVT48SNX32(src.native_vector[1], src.native_vector[0]);
}
HALIDE_ALWAYS_INLINE int48x32_t convert_to_int48x32_t_from_uint32x32_t(const uint32x32_t& src) {
return IVP_CVT48UNX32(src.native_vector[1], src.native_vector[0]);
}
HALIDE_ALWAYS_INLINE int16x32_t convert_to_int16x32_t_from_uint32x32_t(const uint32x32_t& src) {
return IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(src.native_vector[1]),
IVP_MOVNX16_FROMN_2X32U(src.native_vector[0]),
IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE int16x64_t convert_to_int16x64_t_from_int32x64_t(const int32x64_t& src) {
xb_vecNx48 wide0 = IVP_CVT48SNX32(src.native_vector[1], src.native_vector[0]);
xb_vecNx48 wide1 = IVP_CVT48SNX32(src.native_vector[3], src.native_vector[2]);
return int16x64_t(int16x64_t::from_native_vector, IVP_PACKLNX48(wide0), IVP_PACKLNX48(wide1));
}
HALIDE_ALWAYS_INLINE uint16x32_t convert_to_uint16x32_t_from_int32x32_t(const int32x32_t& src) {
return IVP_SELNX16UI(IVP_MOVNX16_FROMN_2X32(src.native_vector[1]),
IVP_MOVNX16_FROMN_2X32(src.native_vector[0]),
IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE uint16x32_t convert_to_uint16x32_t_from_uint1x32_t(const uint1x32_t& src) {
return IVP_MOVNX16UT(uint16x32_t(1), uint16x32_t(0), src);
}
HALIDE_ALWAYS_INLINE uint16x32_t convert_to_uint16x32_t_from_uint32x32_t(const uint32x32_t& src) {
return IVP_SELNX16UI(IVP_MOVNX16_FROMN_2X32U(src.native_vector[1]),
IVP_MOVNX16_FROMN_2X32U(src.native_vector[0]),
IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
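// Boolean (predicate) to integer conversion: inject the bit mask into bit 0 of a
// zeroed 32-bit vector, producing 0/1 lanes.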
HALIDE_ALWAYS_INLINE int32x16_t convert_to_int32x16_t_from_uint1x16_t(const uint1x16_t& src) {
xb_vecN_2x32v r = 0;
IVP_INJBIN_2X32(r, src, 0);
return r;
}
HALIDE_ALWAYS_INLINE int32x64_t convert_to_int32x64_t_from_uint8x64_t(const uint8x64_t& src) {
xb_vec2Nx24 wide = src * uint8x64_t(1);
return int32x64_t(int32x64_t::from_native_vector, IVP_CVT32S2NX24LL(wide), IVP_CVT32S2NX24LH(wide),
IVP_CVT32S2NX24HL(wide), IVP_CVT32S2NX24HH(wide));
}
HALIDE_ALWAYS_INLINE uint32x64_t convert_to_uint32x64_t_from_uint8x64_t(const uint8x64_t& src) {
xb_vec2Nx24 wide = src * uint8x64_t(1);
return uint32x64_t(uint32x64_t::from_native_vector, IVP_CVT32S2NX24LL(wide), IVP_CVT32S2NX24LH(wide),
IVP_CVT32S2NX24HL(wide), IVP_CVT32S2NX24HH(wide));
}
HALIDE_ALWAYS_INLINE int32x64_t convert_to_int32x64_t_from_int24x64_t(const int24x64_t& src) {
return int32x64_t(int32x64_t::from_native_vector, IVP_CVT32S2NX24LL(src), IVP_CVT32S2NX24LH(src),
IVP_CVT32S2NX24HL(src), IVP_CVT32S2NX24HH(src));
}
HALIDE_ALWAYS_INLINE int32x32_t convert_to_int32x32_t_from_int16x32_t(const int16x32_t& src) {
xb_vec2Nx24 wide = IVP_CVT24S2NX16(0, src);
return int32x32_t(int32x32_t::from_native_vector,
IVP_CVT32S2NX24LL(wide), IVP_CVT32S2NX24LH(wide));
}
HALIDE_ALWAYS_INLINE int32x64_t convert_to_int32x64_t_from_int16x64_t(const int16x64_t& src) {
auto r0 = convert_to_int32x32_t_from_int16x32_t(src.native_vector[0]);
auto r1 = convert_to_int32x32_t_from_int16x32_t(src.native_vector[1]);
return int32x64_t(int32x64_t::from_native_vector, r0.native_vector[0], r0.native_vector[1],
r1.native_vector[0], r1.native_vector[1]);
}
HALIDE_ALWAYS_INLINE int32x32_t convert_to_int32x32_t_from_uint16x32_t(const uint16x32_t& src) {
return int32x32_t(int32x32_t::from_native_vector,
IVP_MOVN_2X32_FROMNX16(IVP_SELNX16UI(uint16x32_t(0), src, IVP_SELI_16B_INTERLEAVE_1_LO)),
IVP_MOVN_2X32_FROMNX16(IVP_SELNX16UI(uint16x32_t(0), src, IVP_SELI_16B_INTERLEAVE_1_HI)));
}
HALIDE_ALWAYS_INLINE int32x32_t convert_to_int32x32_t_from_uint32x32_t(const uint32x32_t& src) {
return int32x32_t(int32x32_t::from_native_vector,
src.native_vector[0], src.native_vector[1]);
}
HALIDE_ALWAYS_INLINE uint32x32_t convert_to_uint32x32_t_from_int32x32_t(const int32x32_t& src) {
return uint32x32_t(uint32x32_t::from_native_vector,
src.native_vector[0], src.native_vector[1]);
}
HALIDE_ALWAYS_INLINE uint16x64_t convert_to_uint16x64_t_from_int16x64_t(const int16x64_t& src) {
return uint16x64_t(uint16x64_t::from_native_vector,
src.native_vector[0], src.native_vector[1]);
}
HALIDE_ALWAYS_INLINE int32x32_t convert_to_int32x32_t_from_int48x32_t(const int48x32_t& src) {
return int32x32_t(int32x32_t::from_native_vector,
IVP_CVT32SNX48L(src),
IVP_CVT32SNX48H(src));
}
HALIDE_ALWAYS_INLINE uint32x32_t convert_to_uint32x32_t_from_uint16x32_t(const uint16x32_t& src) {
xb_vec2Nx24 wide = IVP_CVT24U2NX16(0, xb_vecNx16U_rtor_xb_vecNx16(src));
return uint32x32_t(uint32x32_t::from_native_vector,
xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32S2NX24LL(wide)),
xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32S2NX24LH(wide)));
}
HALIDE_ALWAYS_INLINE uint32x32_t convert_to_uint32x32_t_from_int48x32_t(const int48x32_t& src) {
return uint32x32_t(uint32x32_t::from_native_vector,
xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32UNX48L(src)),
xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32UNX48H(src)));
}
HALIDE_ALWAYS_INLINE int16x64_t convert_to_int16x64_t_from_uint16x64_t(const uint16x64_t& src) {
return int16x64_t(int16x64_t::from_native_vector, src.native_vector[0], src.native_vector[1]);
}
HALIDE_ALWAYS_INLINE float32x16_t convert_to_float32x16_t_from_int32x16_t(const int32x16_t& src) {
return IVP_FLOATN_2X32(src, 0);
}
HALIDE_ALWAYS_INLINE float32x32_t convert_to_float32x32_t_from_int32x32_t(const int32x32_t& src) {
return float32x32_t(float32x32_t::from_native_vector,
convert_to_float32x16_t_from_int32x16_t(src.native_vector[0]),
convert_to_float32x16_t_from_int32x16_t(src.native_vector[1]));
}
HALIDE_ALWAYS_INLINE float32x32_t convert_to_float32x32_t_from_int16x32_t(const int16x32_t& src) {
int32x32_t tmp = convert_to_int32x32_t_from_int16x32_t(src);
return convert_to_float32x32_t_from_int32x32_t(tmp);
}
HALIDE_ALWAYS_INLINE int32x16_t convert_to_int32x16_t_from_float32x16_t(const float32x16_t& src) {
return IVP_TRUNCN_2XF32(src, 0);
}
HALIDE_ALWAYS_INLINE int32x32_t convert_to_int32x32_t_from_float32x32_t(const float32x32_t& src) {
return int32x32_t(int32x32_t::from_native_vector,
convert_to_int32x16_t_from_float32x16_t(src.native_vector[0]),
convert_to_int32x16_t_from_float32x16_t(src.native_vector[1]));
}
HALIDE_ALWAYS_INLINE int16x32_t convert_to_int16x32_t_from_float32x32_t(const float32x32_t& src) {
int32x32_t tmp = convert_to_int32x32_t_from_float32x32_t(src);
return convert_to_int16x32_t_from_int32x32_t(tmp);
}
HALIDE_ALWAYS_INLINE uint1x16_t halide_xtensa_slice_to_native(const uint1x32_t& src, int index, int native_lanes, int total_lanes) {
  return (index == 0) ? IVP_EXTRACTBLN(src) : IVP_EXTRACTBHN(src);
}
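// Sign-extend the low/high half of a 16-bit vector to 32 bits: interleave with zeros
// to get the zero-extended value, then apply the (x ^ m) - m trick with m = 0x8000 to
// restore the sign.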
HALIDE_ALWAYS_INLINE int32x16_t halide_xtensa_convert_i16_low_i32(const int16x32_t& src) {
const int32x16_t m = int32x16_t(1U << (16 - 1));
int32x16_t x = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(int16x32_t(0), src, IVP_SELI_16B_INTERLEAVE_1_LO));
int32x16_t r = (x ^ m) - m;
return r;
}
HALIDE_ALWAYS_INLINE int32x16_t halide_xtensa_convert_i16_high_i32(const int16x32_t& src) {
const int32x16_t m = int32x16_t(1U << (16 - 1));
int32x16_t x = IVP_MOVN_2X32_FROMNX16(IVP_SELNX16I(int16x32_t(0), src, IVP_SELI_16B_INTERLEAVE_1_HI));
int32x16_t r = (x ^ m) - m;
return r;
}
HALIDE_ALWAYS_INLINE int32x16_t halide_xtensa_convert_u16_low_i32(const uint16x32_t& src) {
return IVP_MOVN_2X32_FROMNX16(IVP_SELNX16UI(uint16x32_t(0), src, IVP_SELI_16B_INTERLEAVE_1_LO));
}
HALIDE_ALWAYS_INLINE int32x16_t halide_xtensa_convert_u16_high_i32(const uint16x32_t& src) {
return IVP_MOVN_2X32_FROMNX16(IVP_SELNX16UI(uint16x32_t(0), src, IVP_SELI_16B_INTERLEAVE_1_HI));
}
HALIDE_ALWAYS_INLINE uint32x16_t halide_xtensa_convert_u16_low_u32(const uint16x32_t& src) {
return IVP_MOVN_2X32_FROMNX16(IVP_SELNX16UI(uint16x32_t(0), src, IVP_SELI_16B_INTERLEAVE_1_LO));
}
HALIDE_ALWAYS_INLINE uint32x16_t halide_xtensa_convert_u16_high_u32(const uint16x32_t& src) {
return IVP_MOVN_2X32_FROMNX16(IVP_SELNX16UI(uint16x32_t(0), src, IVP_SELI_16B_INTERLEAVE_1_HI));
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_convert_i32_u16(const int32x16_t& src0, const int32x16_t& src1) {
xb_vecNx48 wide = IVP_CVT48SNX32(src1, src0);
return xb_vecNx16_rtor_xb_vecNx16U(IVP_PACKLNX48(wide));
}
HALIDE_ALWAYS_INLINE int8x64_t halide_xtensa_convert_concat_i16_to_i8(const int16x32_t& a, const int16x32_t& b) {
xb_vec2Nx24 wide = IVP_CVT24S2NX16(b, a);
return IVP_PACKL2NX24(wide);
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_sat_narrow_u8(const int16x64_t& a) {
xb_vec2Nx24 wide = IVP_CVT24S2NX16(a.native_vector[1], a.native_vector[0]);
return IVP_PACKVRU2NX24(wide, 0);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_sat_narrow_i16(const int32x32_t& a) {
xb_vecNx48 wide = IVP_CVT48SNX32(a.native_vector[1], a.native_vector[0]);
return IVP_PACKVRNX48(wide, 0);
}
HALIDE_ALWAYS_INLINE int8x64_t halide_xtensa_sat_narrow_with_rounding_shift_i8(const int16x64_t& a, uint32_t shift) {
xb_vec2Nx24 wide = IVP_CVT24S2NX16(a.native_vector[1], a.native_vector[0]);
return IVP_PACKVR2NX24(wide, shift);
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_sat_narrow_with_rounding_shift_u8(const int16x64_t& a, uint32_t shift) {
xb_vec2Nx24 wide = IVP_CVT24S2NX16(a.native_vector[1], a.native_vector[0]);
return IVP_PACKVRU2NX24(wide, shift);
}
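// Rounding narrowing shift: the rounding term 1 << (shift - 1) is added to the 48-bit
// accumulator as a product of two power-of-two factors via multiply-accumulate, which
// keeps each factor representable in a 16-bit operand.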
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_narrow_with_rounding_shift_i16(const int32x32_t& a, uint32_t shift) {
xb_vecNx48 wide = convert_to_int48x32_t_from_int32x32_t(a);
// Add rounding factor.
int32_t half_shift_1 = (shift - 1) >> 1;
int32_t half_shift_2 = (shift - 1) - half_shift_1;
IVP_MULANX16(wide, int16x32_t(1 << half_shift_1), int16x32_t(1 << half_shift_2));
return IVP_PACKVRNRNX48(wide, shift);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_sat_narrow_with_rounding_shift_i16(const int32x32_t& a, uint32_t shift) {
xb_vecNx48 wide = convert_to_int48x32_t_from_int32x32_t(a);
return IVP_PACKVRNX48(wide, shift);
}
// TODO(vksnk): this is pretty inefficient.
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_sat_narrow_with_signed_rounding_shift_i16(const int32x32_t& a, int32_t shift) {
if (shift >= 0) {
return halide_xtensa_sat_narrow_with_rounding_shift_i16(a, (uint32_t)shift);
}
return halide_xtensa_sat_narrow_i16(
int32x32_t(int32x32_t::from_native_vector,
IVP_SLAN_2X32(a.native_vector[0], -shift),
IVP_SLAN_2X32(a.native_vector[1], -shift)));
}
HALIDE_ALWAYS_INLINE int32x16_t halide_xtensa_sat_narrow_with_rounding_shift_i32(const int64x16_t& a, uint32_t shift) {
return IVP_PACKVRN_2X64W(a, shift);
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_convert_concat_i16_to_u8(const int16x32_t& a, const int16x32_t& b) {
return IVP_SEL2NX8UI(IVP_MOV2NX8_FROMNX16(b), IVP_MOV2NX8_FROMNX16(a), IVP_SELI_8B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE int8x64_t halide_xtensa_convert_concat_u16_to_i8(const uint16x32_t& a, const uint16x32_t& b) {
xb_vec2Nx24 wide = IVP_CVT24U2NX16(xb_vecNx16U_rtor_xb_vecNx16(b), xb_vecNx16U_rtor_xb_vecNx16(a));
return IVP_PACKL2NX24(wide);
}
HALIDE_ALWAYS_INLINE uint8x64_t halide_xtensa_convert_concat_u16_to_u8(const uint16x32_t& a, const uint16x32_t& b) {
xb_vec2Nx24 wide = IVP_CVT24U2NX16(xb_vecNx16U_rtor_xb_vecNx16(b), xb_vecNx16U_rtor_xb_vecNx16(a));
return xb_vec2Nx8_rtor_xb_vec2Nx8U(IVP_PACKL2NX24(wide));
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_convert_i8_low_i16(const int8x64_t& src, int native_lanes, int total_lanes) {
const int16x32_t m = int16x32_t(1U << (8 - 1));
int16x32_t x = IVP_MOVNX16_FROM2NX8(IVP_SEL2NX8I(int8x64_t(0), src, IVP_SELI_8B_INTERLEAVE_1_LO));
int16x32_t r = (x ^ m) - m;
return r;
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_convert_i8_high_i16(const int8x64_t& src, int native_lanes, int total_lanes) {
const int16x32_t m = int16x32_t(1U << (8 - 1));
int16x32_t x = IVP_MOVNX16_FROM2NX8(IVP_SEL2NX8I(int8x64_t(0), src, IVP_SELI_8B_INTERLEAVE_1_HI));
int16x32_t r = (x ^ m) - m;
return r;
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_convert_u8_low_i16(const uint8x64_t& src, int native_lanes, int total_lanes) {
return IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(uint8x64_t(0), src, IVP_SELI_8B_INTERLEAVE_1_LO));
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_convert_u8_high_i16(const uint8x64_t& src, int native_lanes, int total_lanes) {
return IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(uint8x64_t(0), src, IVP_SELI_8B_INTERLEAVE_1_HI));
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_convert_u8_low_u16(const uint8x64_t& src, int native_lanes, int total_lanes) {
return IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(uint8x64_t(0), src, IVP_SELI_8B_INTERLEAVE_1_LO));
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_convert_u8_high_u16(const uint8x64_t& src, int native_lanes, int total_lanes) {
return IVP_MOVNX16_FROM2NX8U(IVP_SEL2NX8UI(uint8x64_t(0), src, IVP_SELI_8B_INTERLEAVE_1_HI));
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_convert_concat_i32_to_i16(const int32x16_t& a, const int32x16_t& b) {
return IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32(b), IVP_MOVNX16_FROMN_2X32(a), IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_convert_concat_i32_to_u16(const int32x16_t& a, const int32x16_t& b) {
return IVP_SELNX16UI(IVP_MOVNX16_FROMN_2X32(b), IVP_MOVNX16_FROMN_2X32(a), IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE int16x32_t halide_xtensa_convert_concat_u32_to_i16(const uint32x16_t& a, const uint32x16_t& b) {
return IVP_SELNX16I(IVP_MOVNX16_FROMN_2X32U(b), IVP_MOVNX16_FROMN_2X32U(a), IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_convert_concat_u32_to_u16(const uint32x16_t& a, const uint32x16_t& b) {
return IVP_SELNX16UI(IVP_MOVNX16_FROMN_2X32U(b), IVP_MOVNX16_FROMN_2X32U(a), IVP_SELI_16B_EXTRACT_1_OF_2_OFF_0);
}
HALIDE_ALWAYS_INLINE uint16x32_t halide_xtensa_convert_concat_u32_to_u16_zzz(const uint32x16_t& a, const uint32x16_t& b) {
return IVP_SELNX16UI(IVP_MOVNX16_FROMN_2X32U(b), IVP_MOVNX16_FROMN_2X32U(a), IVP_SELI_16B_EXTRACT_1_OF_2_OFF_1);
}
HALIDE_ALWAYS_INLINE uint32x16_t halide_xtensa_convert_i48_low_u32(const int48x32_t& src, int native_lanes, int total_lanes) {
return xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32UNX48L(src));
}
HALIDE_ALWAYS_INLINE uint32x16_t halide_xtensa_convert_i48_high_u32(const int48x32_t& src, int native_lanes, int total_lanes) {
return xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32UNX48H(src));
}
HALIDE_ALWAYS_INLINE uint1x32_t halide_xtensa_concat_from_native(const uint1x16_t& a, const uint1x16_t& b) {
return IVP_JOINBN_2(b, a);
}
HALIDE_ALWAYS_INLINE uint1x64_t halide_xtensa_concat_from_native(const uint1x32_t& a, const uint1x32_t& b) {
return IVP_JOINBN(b, a);
}
HALIDE_ALWAYS_INLINE uint1x64_t halide_xtensa_concat_from_native(const uint1x16_t& a, const uint1x16_t& b, const uint1x16_t& c, const uint1x16_t& d) {
return halide_xtensa_concat_from_native(halide_xtensa_concat_from_native(a, b), halide_xtensa_concat_from_native(c, d));
}
HALIDE_ALWAYS_INLINE float32x32_t halide_xtensa_concat_from_native(const float32x16_t& a, const float32x16_t& b) {
return float32x32_t(float32x32_t::from_native_vector, a, b);
}
// TODO(vksnk): DMA support is disabled by default, because iDMA is not part of the cstub,
// so we need to get the git repo compiling with xt-tools first (b/173159625).
#ifdef __cplusplus
extern "C" {
#endif
extern void *halide_tcm_malloc(void *user_context, size_t x);
extern void halide_tcm_free(void *user_context, void *ptr);
extern int halide_init_dma();
extern int32_t halide_xtensa_copy_1d(void* dst, int32_t dst_base, void* src, int32_t src_base, int extent, int item_size);
extern int32_t halide_xtensa_wait_for_copy(int32_t id);
extern int halide_release_dma();
#ifdef __cplusplus
} // extern "C"
#endif
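// RAII helper emitted into the generated code: acquires the DMA engine in the
// constructor and releases it in the destructor.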
class ScopedDmaInitializer {
int status_;
public:
ScopedDmaInitializer() {
status_ = halide_init_dma();
}
~ScopedDmaInitializer() {
halide_release_dma();
}
};
)INLINE_CODE";
// Fix: on at least one config (our arm32 buildbot running gcc 5.4),
// emitting this long text string was regularly garbled in a predictable
// pattern; flushing the stream before or after heals it. Since C++
// codegen is rarely on a compilation critical path, we'll just band-aid
// it in this way.
stream << std::flush;
stream << native_typedef_decl;
stream << std::flush;
std::set<Type> native_vector_types = {
Type(Type::Int, 8, 64),
Type(Type::UInt, 8, 64),
Type(Type::Int, 16, 32),
Type(Type::UInt, 16, 32),
Type(Type::Int, 32, 16),
Type(Type::UInt, 32, 16),
Type(Type::Int, 24, 64),
Type(Type::UInt, 24, 64),
Type(Type::Int, 48, 32),
Type(Type::UInt, 48, 32),
Type(Type::Int, 64, 16),
Type(Type::Float, 16, 32),
Type(Type::Float, 32, 16),
};
std::set<Type> predefined_vectors = {
Int(8, 4),
Int(8, 128),
UInt(8, 4),
UInt(8, 8),
UInt(8, 128),
UInt(8, 192),
Int(8, 256),
UInt(8, 256),
Int(16, 64),
UInt(16, 64),
Int(16, 128),
UInt(16, 128),
Int(24, 128),
UInt(24, 128),
Int(32, 32),
UInt(32, 32),
Int(32, 64),
UInt(32, 64),
Float(32, 32),
Int(48, 32),
UInt(48, 32),
Int(48, 64),
UInt(48, 64),
};
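// For vector types wider than (but a multiple of) a native vector, emit a typedef built
// on MultipleOfNativeVector; anything left over falls through to the generic CodeGen_C
// vector typedefs below.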
std::set<Type> multiple_of_native_types;
for (const auto &type : vector_types) {
if (predefined_vectors.count(type) > 0) {
continue;
}
for (const auto &native_vector : native_vector_types) {
if ((native_vector.code() == type.code()) && (native_vector.bits() == type.bits()) && (type.lanes() > native_vector.lanes()) && (type.lanes() % native_vector.lanes() == 0)) {
stream << "using " << print_type(type) << " = MultipleOfNativeVector<" << print_type(native_vector) << ", " << type.lanes() / native_vector.lanes() << ">;\n";
multiple_of_native_types.insert(type);
break;
}
}
}
std::set<Type> filtered_vector_types;
for (const auto &t : vector_types) {
if ((native_vector_types.count(t) > 0) || (predefined_vectors.count(t) > 0) || (multiple_of_native_types.count(t) > 0)) {
continue;
}
filtered_vector_types.insert(t);
}
CodeGen_C::add_vector_typedefs(filtered_vector_types);
}
}
string CodeGen_Xtensa::print_assignment(Type t, const std::string &rhs) {
auto cached = cache.find(rhs);
if (cached == cache.end()) {
id = unique_name('_');
const char *const_flag = output_kind == CPlusPlusImplementation ? "const " : "";
if (t.is_handle()) {
// Don't print void *, which might lose useful type information; just use auto.
stream << get_indent() << "auto * __restrict ";
} else {
stream << get_indent() << print_type(t, AppendSpace);
}
stream << const_flag << id << " = " << rhs << ";\n";
cache[rhs] = id;
} else {
id = cached->second;
}
return id;
}
std::string CodeGen_Xtensa::print_type(Type t, AppendSpaceIfNeeded space_option) {
if (t.bits() == 1 && t.is_vector()) {
return "uint1x" + std::to_string(t.lanes()) + "_t" + (space_option == AppendSpace ? " " : "");
} else if (t.is_float() && t.is_vector()) {
return "float" + std::to_string(t.bits()) + "x" + std::to_string(t.lanes()) + "_t" + (space_option == AppendSpace ? " " : "");
}
return CodeGen_C::print_type(t, space_option);
}
void CodeGen_Xtensa::visit(const IntImm *op) {
if (op->type.is_int() && (op->type.bits() <= 32)) {
id = std::to_string(op->value);
} else {
static const char *const suffixes[3] = {
"ll", // PlainC
"l", // OpenCL
"", // HLSL
};
print_assignment(op->type, "(" + print_type(op->type) + ")(" + std::to_string(op->value) + suffixes[(int)integer_suffix_style] + ")");
}
}
void CodeGen_Xtensa::visit(const Mul *op) {
int bits;
if (is_const_power_of_two_integer(op->b, &bits)) {
print_expr(Call::make(op->type, Call::shift_left, {op->a, Expr(bits)}, Call::PureIntrinsic));
} else {
if (is_native_xtensa_vector<int16_t>(op->type)) {
string sa = print_expr(op->a);
string sb = print_expr(op->b);
print_assignment(op->type, "IVP_MULNX16PACKL(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<int32_t>(op->type)) {
string sa = print_expr(op->a);
string sb = print_expr(op->b);
print_assignment(op->type, "IVP_PACKLN_2X64W(IVP_MULN_2X32(" + sa + ", " + sb + "))");
} else {
visit_binop(op->type, op->a, op->b, "*");
}
}
}
string CodeGen_Xtensa::print_xtensa_call(const Call *op) {
ostringstream rhs;
vector<string> args(op->args.size());
if (op->name == "halide_xtensa_copy_1d") {
internal_assert(op->args.size() >= 3);
const Variable *dest = op->args[0].as<Variable>();
internal_assert(dest != nullptr);
args[0] = print_name(dest->name);
args[1] = print_expr(op->args[1]);
const Variable *src = op->args[2].as<Variable>();
internal_assert(src != nullptr);
args[2] = print_name(src->name);
for (size_t i = 3; i < op->args.size(); i++) {
args[i] = print_expr(op->args[i]);
}
rhs << op->name << "(" << with_commas(args) << ")";
return rhs.str();
}
if (op->name == "halide_xtensa_widening_load") {
internal_assert(op->args.size() == 3);
const Variable *src = op->args[0].as<Variable>();
internal_assert(src != nullptr);
args[0] = print_name(src->name);
args[1] = print_expr(op->args[1]);
// We only use the args[2] argument to get the type of the load.
rhs << "widening_load<" << print_type(op->type) << ", " << print_type(op->args[2].type()) << ">(" << args[0] << ", " << args[1] << ")";
return rhs.str();
}
for (size_t i = 0; i < op->args.size(); i++) {
args[i] = print_expr(op->args[i]);
}
if (op->name == "halide_xtensa_pad_to_native" || op->name == "halide_xtensa_slice_from_padded") {
internal_assert(op->args.size() == 2);
// TODO(vksnk): bools are tricky, because they are bitmasks, so they need to be
// handled differently.
if (op->type.is_bool()) {
internal_assert((op->type.lanes() == 64 && op->args[0].type().lanes() == 32) || (op->type.lanes() == 32 && op->args[0].type().lanes() == 16) || (op->type.lanes() == 64 && op->args[0].type().lanes() == 16)) << Expr(op);
}
rhs << op->name << "<" << print_type(op->args[0].type()) << ", "
<< print_type(op->type) << ", " << print_type(op->type.element_of())
<< ", " << op->args[0].type().lanes() << ", " << op->type.lanes()
<< ">(" << args[0] << ", " << args[1] << ")";
return rhs.str();
}
if (op->name == "halide_xtensa_slice_to_native" && !op->type.is_bool()) {
Type native_vector_type = get_native_xtensa_vector(op->type);
int vector_count = op->type.lanes() / native_vector_type.lanes();
if (vector_count == 1) {
rhs << args[0] << ".native_vector[" << args[1] << "]";
} else {
rhs << print_type(op->type) << "(" << print_type(op->type) << "::from_native_vector, ";
std::vector<std::string> native_vectors;
for (int ix = 0; ix < vector_count; ix++) {
native_vectors.push_back(args[0] + ".native_vector[" + args[1] + " * " + std::to_string(vector_count) + " + " + std::to_string(ix) + "]");
}
rhs << with_commas(native_vectors) << ")";
}
return rhs.str();
}
if (op->name == "halide_xtensa_concat_from_native" && !op->type.is_bool()) {
rhs << print_type(op->type) << "(" << print_type(op->type) << "::from_native_vector, " << with_commas(args) << ")";
return rhs.str();
}
if ((op->name.find("halide_xtensa_slice_right") == 0) || (op->name.find("halide_xtensa_slice_left") == 0)) {
string intrinsic_name;
string shift_define;
string direction = (op->name.find("halide_xtensa_slice_right") == 0) ? "RIGHT_" : "LEFT_";
if (is_native_xtensa_vector<int8_t>(op->type)) {
intrinsic_name = "IVP_SEL2NX8I";
shift_define = "IVP_SELI_8B_ROTATE_";
} else if (is_native_xtensa_vector<uint8_t>(op->type)) {
intrinsic_name = "IVP_SEL2NX8UI";
shift_define = "IVP_SELI_8B_ROTATE_";
} else if (is_native_xtensa_vector<int16_t>(op->type)) {
intrinsic_name = "IVP_SELNX16I";
shift_define = "IVP_SELI_16B_ROTATE_";
} else if (is_native_xtensa_vector<uint16_t>(op->type)) {
intrinsic_name = "IVP_SELNX16UI";
shift_define = "IVP_SELI_16B_ROTATE_";
} else if (is_native_xtensa_vector<int32_t>(op->type)) {
intrinsic_name = "IVP_SELN_2X32I";
shift_define = "IVP_SELI_32B_ROTATE_";
} else if (is_native_xtensa_vector<uint32_t>(op->type)) {
intrinsic_name = "IVP_SELN_2X32UI";
shift_define = "IVP_SELI_32B_ROTATE_";
} else if (is_native_xtensa_vector<float>(op->type)) {
intrinsic_name = "IVP_SELN_2XF32I";
shift_define = "IVP_SELI_32B_ROTATE_";
} else {
internal_assert(false) << "Unsupported type for slicing";
}
rhs << intrinsic_name << "(" << args[0] << ".native_vector[1], " << args[0] << ".native_vector[0], " << shift_define << direction << args[1] << ")";
return rhs.str();
}
// absd needs an extra cast to an unsigned type.
if (op->name == "halide_xtensa_absd_i16") {
rhs << "xb_vecNx16_rtor_xb_vecNx16U(IVP_ABSSUBNX16(" << args[0] + ", " + args[1] + "))";
return rhs.str();
} else if (op->name == "halide_xtensa_narrow_i48_with_shift_u16") {
rhs << "xb_vecNx16_rtor_xb_vecNx16U(IVP_PACKVRNRNX48(" << args[0] + ", " + args[1] + "))";
return rhs.str();
} else if (op->name == "halide_xtensa_convert_i48_low_u32") {
rhs << "xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32UNX48L(" << args[0] + "))";
return rhs.str();
} else if (op->name == "halide_xtensa_convert_i48_high_u32") {
rhs << "xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_CVT32UNX48H(" << args[0] + "))";
return rhs.str();
}
if (op->name == "halide_xtensa_extract_i32" || op->name == "halide_xtensa_extract_u32") {
rhs << "IVP_EXTRN_2X32(IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(" << args[0] + ")), " + args[1] + ")";
return rhs.str();
}
if (op->name == "halide_xtensa_dual_extract_i32") {
rhs << "IVP_DEXTRPRN_2X32("
<< "IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(" + args[0] + ")), "
<< "IVP_MOVN_2X32_FROMNX16(IVP_MOVNX16_FROM2NX8(" + args[1] + ")), "
<< args[2] + ", " + args[3] + ")";
return rhs.str();
}
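// The remaining intrinsics map 1:1 onto a single IVP instruction (or a helper with the
// same signature); anything not in this table is emitted as a call under its original name.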
string op_name = op->name;
std::map<string, string> op_name_to_intrinsic = {
{"halide_xtensa_sat_add_i16", "IVP_ADDSNX16"},
{"halide_xtensa_sat_sub_i16", "IVP_SUBSNX16"},
{"halide_xtensa_avg_i8", "IVP_AVG2NX8"},
{"halide_xtensa_avg_u8", "IVP_AVGU2NX8"},
{"halide_xtensa_avg_i16", "IVP_AVGNX16"},
{"halide_xtensa_avg_u16", "IVP_AVGUNX16"},
{"halide_xtensa_avg_round_i8", "IVP_AVGR2NX8"},
{"halide_xtensa_avg_round_u8", "IVP_AVGRU2NX8U"},
{"halide_xtensa_avg_round_i16", "IVP_AVGRNX16"},
{"halide_xtensa_avg_round_u16", "IVP_AVGRUNX16U"},
{"halide_xtensa_widen_mul_i48", "IVP_MULNX16"},
{"halide_xtensa_widen_mul_u48", "IVP_MULUUNX16"},
{"halide_xtensa_widen_mul_ui48", "IVP_MULUSNX16"},
{"halide_xtensa_widen_pair_mul_u48", "IVP_MULUUPNX16"},
{"halide_xtensa_convert_i48_low_i32", "IVP_CVT32SNX48L"},
{"halide_xtensa_convert_i48_high_i32", "IVP_CVT32SNX48H"},
{"halide_xtensa_convert_i48_low_u32", "IVP_CVT32UNX48L"},
{"halide_xtensa_convert_i48_high_u32", "IVP_CVT32UNX48H"},
{"halide_xtensa_convert_to_int32x16_t_from_uint1x16_t", "convert_to_int32x16_t_from_uint1x16_t"},
{"halide_xtensa_narrow_i48_with_shift_i16", "IVP_PACKVRNRNX48"},
{"halide_xtensa_sat_narrow_i48_with_shift_i16", "IVP_PACKVRNX48"},
{"halide_xtensa_full_reduce_add_i8", "IVP_RADD2NX8"},
{"halide_xtensa_full_reduce_add_i16", "IVP_RADDNX16"},
{"halide_xtensa_full_reduce_add_i32", "IVP_RADDN_2X32"},
{"halide_xtensa_full_reduce_min_u8", "IVP_RMINU2NX8U"},
{"halide_xtensa_full_reduce_min_u16", "IVP_RMINUNX16U"},
{"halide_xtensa_full_reduce_min_u32", "IVP_RMINUN_2X32U"},
{"halide_xtensa_full_reduce_min_i8", "IVP_RMIN2NX8"},
{"halide_xtensa_full_reduce_min_i16", "IVP_RMINNX16"},
{"halide_xtensa_full_reduce_min_i32", "IVP_RMINN_2X32"},
{"halide_xtensa_full_reduce_max_u8", "IVP_RMAXU2NX8U"},
{"halide_xtensa_full_reduce_max_u16", "IVP_RMAXUNX16U"},
{"halide_xtensa_full_reduce_max_u32", "IVP_RMAXUN_2X32U"},
{"halide_xtensa_full_reduce_max_i8", "IVP_RMAX2NX8"},
{"halide_xtensa_full_reduce_max_i16", "IVP_RMAXNX16"},
{"halide_xtensa_full_reduce_max_i32", "IVP_RMAXN_2X32"},
{"halide_xtensa_sat_left_shift_i16", "IVP_SLSNX16"},
{"halide_xtensa_sat_left_shift_i32", "IVP_SLSN_2X32"},
};
if (op_name_to_intrinsic.count(op_name) > 0) {
op_name = op_name_to_intrinsic[op_name];
}
rhs << op_name << "(" << with_commas(args) << ")";
return rhs.str();
}
void CodeGen_Xtensa::visit(const Div *op) {
int bits;
if (is_const_power_of_two_integer(op->b, &bits)) {
print_expr(Call::make(op->type, Call::shift_right, {op->a, Expr(bits)}, Call::PureIntrinsic));
} else if (is_native_xtensa_vector<float>(op->type)) {
ostringstream rhs;
rhs << "IVP_DIVN_2XF32(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
print_assignment(op->type, rhs.str());
} else {
string sa = print_expr(op->a);
string sb = print_expr(op->b);
if (is_native_xtensa_vector<int32_t>(op->type)) {
print_assignment(op->type, "(common_int32x16_t)" + sa + " / (common_int32x16_t)" + sb);
} else {
print_assignment(op->type, sa + " / " + sb);
}
}
}
void CodeGen_Xtensa::visit(const Mod *op) {
if (is_native_xtensa_vector<int32_t>(op->type)) {
string sa = print_expr(op->a);
string sb = print_expr(op->b);
print_assignment(op->type, "(common_int32x16_t)" + sa + " % (common_int32x16_t)" + sb);
} else {
CodeGen_C::visit(op);
}
}
void CodeGen_Xtensa::visit(const Max *op) {
if (op->type.is_scalar()) {
print_expr(Call::make(op->type, "::halide_cpp_max<" + print_type(op->type) + ">", {op->a, op->b}, Call::Extern));
} else {
ostringstream rhs;
if (is_native_xtensa_vector<int8_t>(op->type)) {
rhs << "IVP_MAX2NX8(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<uint8_t>(op->type)) {
rhs << "IVP_MAXU2NX8(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<int16_t>(op->type)) {
rhs << "IVP_MAXNX16(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<uint16_t>(op->type)) {
rhs << "IVP_MAXUNX16U(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<int32_t>(op->type)) {
rhs << "IVP_MAXN_2X32(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<uint32_t>(op->type)) {
rhs << "IVP_MAXUN_2X32(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<float>(op->type)) {
rhs << "IVP_MAXN_2XF32(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else {
rhs << print_type(op->type) << "::max(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
}
print_assignment(op->type, rhs.str());
}
}
void CodeGen_Xtensa::visit(const Min *op) {
if (op->type.is_scalar()) {
print_expr(Call::make(op->type, "::halide_cpp_min<" + print_type(op->type) + ">", {op->a, op->b}, Call::Extern));
} else {
ostringstream rhs;
if (is_native_xtensa_vector<int8_t>(op->type)) {
rhs << "IVP_MIN2NX8(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<uint8_t>(op->type)) {
rhs << "IVP_MINU2NX8(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<int16_t>(op->type)) {
rhs << "IVP_MINNX16(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<uint16_t>(op->type)) {
rhs << "IVP_MINUNX16U(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<int32_t>(op->type)) {
rhs << "IVP_MINN_2X32(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<uint32_t>(op->type)) {
rhs << "IVP_MINUN_2X32(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else if (is_native_xtensa_vector<float>(op->type)) {
rhs << "IVP_MINN_2XF32(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
} else {
rhs << print_type(op->type) << "::min(" << print_expr(op->a) << ", " << print_expr(op->b) << ")";
}
print_assignment(op->type, rhs.str());
}
}
void CodeGen_Xtensa::visit(const Select *op) {
ostringstream rhs;
string type = print_type(op->type);
string true_val = print_expr(op->true_value);
string false_val = print_expr(op->false_value);
string cond = print_expr(op->condition);
if (op->condition.type().is_scalar()) {
rhs << "(" << type << ")"
<< "(" << cond
<< " ? " << true_val
<< " : " << false_val
<< ")";
} else {
if (is_native_xtensa_vector<int8_t>(op->type)) {
rhs << "IVP_MOV2NX8T(" << true_val << ", " << false_val << ", " << cond << ")";
} else if (is_native_xtensa_vector<uint8_t>(op->type)) {
rhs << "IVP_MOV2NX8UT(" << true_val << ", " << false_val << ", " << cond << ")";
} else if (is_native_xtensa_vector<int16_t>(op->type)) {
rhs << "IVP_MOVNX16T(" << true_val << ", " << false_val << ", " << cond << ")";
} else if (is_native_xtensa_vector<uint16_t>(op->type)) {
rhs << "IVP_MOVNX16UT(" << true_val << ", " << false_val << ", " << cond << ")";
} else if (is_native_xtensa_vector<int32_t>(op->type)) {
rhs << "IVP_MOVN_2X32T(" << true_val << ", " << false_val << ", " << cond << ")";
} else if (is_native_xtensa_vector<uint32_t>(op->type)) {
rhs << "IVP_MOVN_2X32UT(" << true_val << ", " << false_val << ", " << cond << ")";
} else if (is_native_xtensa_vector<float>(op->type)) {
rhs << "IVP_MOVN_2XF32T(" << true_val << ", " << false_val << ", " << cond << ")";
} else {
rhs << type << "::select(" << cond << ", " << true_val << ", " << false_val << ")";
}
}
print_assignment(op->type, rhs.str());
}
void CodeGen_Xtensa::visit(const Ramp *op) {
Type vector_type = op->type.with_lanes(op->lanes);
string id_base = print_expr(op->base);
string id_stride = print_expr(op->stride);
if (is_const_one(op->stride)) {
if (is_native_xtensa_vector<int32_t>(op->type)) {
print_assignment(vector_type, "/* ramp */ int32x16_t(" + id_base + ") + IVP_SEQN_2X32()");
} else {
// If it's wide enough, split it here into a concat of smaller ramps.
if (op->type.is_int() && (op->type.bits() == 32) && (op->type.lanes() % 16 == 0) && (op->type.lanes() / 16 > 4)) {
int split_to = op->type.lanes() / 16;
std::vector<Expr> concat_args;
for (int ix = 0; ix < split_to; ix++) {
Expr r = Ramp::make(op->base + op->stride * (16 * ix), op->stride, 16);
concat_args.push_back(std::move(r));
}
Expr concat = Call::make(op->type,
"halide_xtensa_concat_from_native",
concat_args, Call::PureExtern);
concat.accept(this);
} else {
print_assignment(vector_type, "dense_ramp<" + print_type(vector_type) + ">(" + id_base + ")");
}
}
} else {
if (is_native_xtensa_vector<int32_t>(op->type)) {
print_assignment(vector_type, "/* ramp */ int32x16_t(" + id_base + ") + IVP_PACKLN_2X64W(IVP_SEQN_2X32() * int32x16_t(" + id_stride + "))");
} else if ((op->type.lanes() == 32 || op->type.lanes() == 64) && op->type.is_int_or_uint() && op->type.bits() == 32) {
print_assignment(vector_type, "ramp<" + print_type(vector_type) + ">(" + id_base + ", " + id_stride + ")");
} else {
print_assignment(vector_type, print_type(vector_type) + "_ops::ramp(" + id_base + ", " + id_stride + ")");
}
}
}
void CodeGen_Xtensa::visit(const Broadcast *op) {
Type vector_type = op->type.with_lanes(op->lanes);
string rhs;
if (op->type.is_int() && ((op->type.bits() == 24) || (op->type.bits() == 48)) && is_const(op->value)) {
// Assigning a constant to a wide vector is tricky.
if (is_const_zero(op->value)) {
if (op->type.bits() == 24) {
rhs = "IVP_ZERO2NX24()";
} else if (op->type.bits() == 48) {
rhs = "IVP_ZERONX48()";
}
} else {
rhs = std::to_string(op->value.as<IntImm>()->value);
}
} else if (op->type.is_int_or_uint() && op->type.bits() == 8 && ((op->type.lanes() == 4) || (op->type.lanes() == 8))) {
string id_value = print_expr(op->value);
rhs = "broadcast<" + print_type(op->type) + ", " + print_type(op->value.type()) + ">(" + id_value + ")";
} else {
string id_value = print_expr(op->value);
if (is_native_vector_type(op->type)) {
// TODO(vksnk): why is this extra cast to scalar needed?
rhs = print_type(vector_type) + "((" + print_type(op->type.with_lanes(1)) + ")" + id_value + ")";
} else if (op->lanes > 1) {
if (op->type.is_bool()) {
// TODO(vksnk): figure out how to broadcast bool.
if (op->type.lanes() == 16) {
rhs = id_value + "? (int32x16_t(1) == int32x16_t(1)) : (int32x16_t(1) == int32x16_t(0))";
} else if (op->type.lanes() == 32) {
rhs = id_value + "? (int16x32_t(1) == int16x32_t(1)) : (int16x32_t(1) == int16x32_t(0))";
} else if (op->type.lanes() == 64) {
rhs = id_value + "? (int8x64_t(1) == int8x64_t(1)) : (int8x64_t(1) == int8x64_t(0))";
}
} else {
rhs = id_value;
}
} else {
rhs = id_value;
}
}
print_assignment(vector_type, rhs);
}
void CodeGen_Xtensa::visit(const LE *op) {
string sa = print_expr(op->a);
string sb = print_expr(op->b);
if (is_native_xtensa_vector<int8_t>(op->a.type())) {
print_assignment(op->type, "IVP_LE2NX8(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint8_t>(op->a.type())) {
print_assignment(op->type, "IVP_LEU2NX8U(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<int16_t>(op->a.type())) {
print_assignment(op->type, "IVP_LENX16(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint16_t>(op->a.type())) {
print_assignment(op->type, "IVP_LEUNX16U(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<int32_t>(op->a.type())) {
print_assignment(op->type, "IVP_LEN_2X32(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint32_t>(op->a.type())) {
print_assignment(op->type, "IVP_LEUN_2X32U(" + sa + ", " + sb + ")");
} else {
CodeGen_C::visit(op);
}
}
void CodeGen_Xtensa::visit(const LT *op) {
string sa = print_expr(op->a);
string sb = print_expr(op->b);
if (is_native_xtensa_vector<int8_t>(op->a.type())) {
print_assignment(op->type, "IVP_LT2NX8(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint8_t>(op->a.type())) {
print_assignment(op->type, "IVP_LTU2NX8U(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<int16_t>(op->a.type())) {
print_assignment(op->type, "IVP_LTNX16(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint16_t>(op->a.type())) {
print_assignment(op->type, "IVP_LTUNX16U(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<int32_t>(op->a.type())) {
print_assignment(op->type, "IVP_LTN_2X32(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint32_t>(op->a.type())) {
print_assignment(op->type, "IVP_LTUN_2X32U(" + sa + ", " + sb + ")");
} else {
CodeGen_C::visit(op);
}
}
void CodeGen_Xtensa::visit(const GT *op) {
string sa = print_expr(op->a);
string sb = print_expr(op->b);
if (is_native_xtensa_vector<int8_t>(op->a.type())) {
print_assignment(op->type, "IVP_GT2NX8(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint8_t>(op->a.type())) {
print_assignment(op->type, "IVP_GTU2NX8U(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<int16_t>(op->a.type())) {
print_assignment(op->type, "IVP_GTNX16(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint16_t>(op->a.type())) {
print_assignment(op->type, "IVP_GTUNX16U(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<int32_t>(op->a.type())) {
print_assignment(op->type, "IVP_GTN_2X32(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint32_t>(op->a.type())) {
print_assignment(op->type, "IVP_GTUN_2X32U(" + sa + ", " + sb + ")");
} else {
CodeGen_C::visit(op);
}
}
void CodeGen_Xtensa::visit(const Or *op) {
string sa = print_expr(op->a);
string sb = print_expr(op->b);
if (op->a.type().is_bool() && op->type.is_vector()) {
if (op->a.type().lanes() == 16) {
print_assignment(op->type, "IVP_ORBN_2(" + sa + ", " + sb + ")");
} else if (op->a.type().lanes() == 32) {
print_assignment(op->type, "IVP_ORBN(" + sa + ", " + sb + ")");
} else if (op->a.type().lanes() == 64) {
print_assignment(op->type, "IVP_ORB2N(" + sa + ", " + sb + ")");
} else {
internal_assert(false) << "Unhandled boolean type in the || op\n";
}
} else {
CodeGen_C::visit(op);
}
}
void CodeGen_Xtensa::visit(const EQ *op) {
string sa = print_expr(op->a);
string sb = print_expr(op->b);
if (is_native_xtensa_vector<int8_t>(op->a.type())) {
print_assignment(op->type, "IVP_EQ2NX8(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint8_t>(op->a.type())) {
print_assignment(op->type, "IVP_EQ2NX8U(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<int16_t>(op->a.type())) {
print_assignment(op->type, "IVP_EQNX16(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint16_t>(op->a.type())) {
print_assignment(op->type, "IVP_EQNX16U(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<int32_t>(op->a.type())) {
print_assignment(op->type, "IVP_EQN_2X32(" + sa + ", " + sb + ")");
} else if (is_native_xtensa_vector<uint32_t>(op->a.type())) {
print_assignment(op->type, "IVP_EQN_2X32U(" + sa + ", " + sb + ")");
} else {
CodeGen_C::visit(op);
}
}
void CodeGen_Xtensa::visit(const Load *op) {
// TODO: We could replicate the logic in the llvm codegen which decides whether
// the vector access can be aligned. Doing so would also require introducing
// aligned type equivalents for all the vector types.
ostringstream rhs;
Type t = op->type;
string name = print_name(op->name);
// If we're loading a contiguous ramp into a vector, just load the vector
Expr dense_ramp_base = strided_ramp_base(op->index, 1);
if (!is_const_one(op->predicate)) {
const Call *pred = op->predicate.as<Call>();
if (pred && (pred->name == "clamped_dense_ramp") && dense_ramp_base.defined()) {
internal_assert(t.is_vector());
// The number of elements is the difference between the upper bound and the base
// of the ramp, plus one (because the predicate is <=).
Expr count = simplify(pred->args[1] - pred->args[0] + 1);
string id_ramp_base = print_expr(dense_ramp_base);
string id_count = print_expr(count);
rhs << "load_variable"
<< "<" << print_type(t) << ", "
<< print_type(t.element_of()) << ", " << t.lanes()
<< ">(" << name << ", " << id_ramp_base << ", " << id_count << ")";
} else {
string id_index = print_expr(op->index);
string id_predicate = print_expr(op->predicate);
rhs << "load_predicated<" << print_type(t) << ", "
<< print_type(op->index.type()) << ", "
<< print_type(op->predicate.type()) << ", "
<< print_type(t.element_of()) << ", " << t.lanes()
<< ">(" << name << ", " << id_index << ", " << id_predicate << ")";
}
} else if (dense_ramp_base.defined()) {
internal_assert(t.is_vector());
std::string op_name;
// TODO(vksnk): generalize this!
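// Native vector width in lanes for this element size: 24-bit (3-byte) and 48-bit
// (6-byte) elements live in the wide accumulator registers, which hold 64 and 32
// lanes respectively.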
int native_lanes = (64 / op->type.element_of().bytes());
if (op->type.element_of().bytes() == 3) {
native_lanes = 64;
}
if (op->type.element_of().bytes() == 6) {
native_lanes = 32;
}
bool is_aligned_load = (op->alignment.modulus % native_lanes == 0) && (op->alignment.remainder % native_lanes == 0);
if (external_buffers.count(op->name) > 0) {
is_aligned_load = is_aligned_load && (op->param.host_alignment() % 64 == 0);
}
if (is_aligned_load) {
op_name = "aligned_load";
} else {
op_name = "load";
}
string id_ramp_base = print_expr(dense_ramp_base);
rhs << op_name << "<" << print_type(t) << ", "
<< print_type(t.element_of()) << ", " << t.lanes()
<< ">(" << name << ", " << id_ramp_base << ")";
} else if (op->index.type().is_vector()) {
// If index is a vector, gather vector elements.
internal_assert(t.is_vector());
// NOTE(vksnk): strided_load may be a good idea, but needs more work.
// const Ramp* maybe_ramp = op->index.as<Ramp>();
// if (maybe_ramp && is_const(maybe_ramp->stride)) {
// string id_index_base = print_expr(maybe_ramp->base);
// string id_index_stride = print_expr(maybe_ramp->stride);
// rhs << print_type(t) + "_strided_load(" << name << ", "
// << id_index_base << ", " << id_index_stride << ")";
// } else {
string id_index = print_expr(op->index);
rhs << "gather_load<" << print_type(t) << ", "
<< print_type(Int(32, t.lanes())) << ", "
<< print_type(t.element_of()) << ", " << t.lanes()
<< ">(" << name << ", " << id_index << ")";
// }
} else {
string id_index = print_expr(op->index);
bool type_cast_needed = !(allocations.contains(op->name) &&
allocations.get(op->name).type.element_of() == t.element_of());
if (type_cast_needed) {
rhs << "((const " << print_type(t.element_of()) << " *)" << name << ")";
} else {
rhs << name;
}
rhs << "[" << id_index << "]";
}
print_assignment(t, rhs.str());
}
void CodeGen_Xtensa::visit(const Store *op) {
Type t = op->value.type();
if (inside_atomic_mutex_node) {
user_assert(t.is_scalar())
<< "The vectorized atomic operation for the store" << op->name
<< " is lowered into a mutex lock, which does not support vectorization.\n";
}
// Issue atomic store if we are in the designated producer.
if (emit_atomic_stores) {
stream << "#if defined(_OPENMP)\n";
stream << "#pragma omp atomic\n";
stream << "#else\n";
stream << "#error \"Atomic stores in the C backend are only supported in compilers that support OpenMP.\"\n";
stream << "#endif\n";
}
bool is_narrowing = false;
bool is_sat_narrowing = false;
Expr value = op->value;
if (const Cast *cast = value.as<Cast>()) {
if (cast->value.type().is_vector() && cast->type.is_int_or_uint() && cast->value.type().is_int_or_uint() && (cast->value.type().bits() == value.type().bits() * 2)) {
is_narrowing = true;
value = cast->value;
}
}
if (const Call *call = value.as<Call>()) {
// TODO: more checks for this one are needed.
if (call->name == "halide_xtensa_slice_from_padded") {
if (const Cast *cast = call->args[0].as<Cast>()) {
if (cast->value.type().is_vector() && cast->type.is_int_or_uint() && cast->value.type().is_int_or_uint() && (cast->value.type().bits() == value.type().bits() * 2)) {
if (const Call *inner_call = cast->value.as<Call>()) {
if (inner_call->name == "halide_xtensa_pad_to_native") {
is_narrowing = true;
value = inner_call->args[0];
}
}
}
}
}
// TODO(vksnk): disabled for now, because the corresponding implementation
// is missing.
// if (call->name.find("halide_xtensa_sat_narrow_i") == 0) {
// is_sat_narrowing = true;
// value = call->args[0];
// }
}
string id_value = print_expr(value);
string name = print_name(op->name);
// TODO: We could replicate the logic in the llvm codegen which decides whether
// the vector access can be aligned. Doing so would also require introducing
// aligned type equivalents for all the vector types.
// If we're writing a contiguous ramp, just store the vector.
Expr dense_ramp_base = strided_ramp_base(op->index, 1);
if (!is_const_one(op->predicate)) {
const Call *pred = op->predicate.as<Call>();
if (pred && (pred->name == "clamped_dense_ramp") && dense_ramp_base.defined()) {
// The number of elements is the difference between the upper bound and the base
// of the ramp, plus one (because the predicate is <=).
Expr count = simplify(pred->args[1] - pred->args[0] + 1);
internal_assert(op->value.type().is_vector());
string id_ramp_base = print_expr(dense_ramp_base);
string id_count = print_expr(count);
string op_name = "store_variable";
if (is_narrowing) {
op_name = op_name + "_narrowing";
}
if (is_sat_narrowing) {
op_name = op_name + "_narrowing_sat";
}
stream << get_indent() << op_name << "<";
if (is_narrowing) {
stream << print_type(value.type());
} else {
stream << print_type(t);
}
stream << ", " << print_type(t.element_of()) << ", " << t.lanes()
<< ">(" << id_value << ", " << name << ", " << id_ramp_base << ", " << id_count << ");\n";
} else {
string id_index = print_expr(op->index);
string id_predicate = print_expr(op->predicate);
stream << get_indent() << "store_predicated<" << print_type(t) << ", "
<< print_type(op->index.type()) << ", "
<< print_type(op->predicate.type()) << ", "
<< print_type(t.element_of()) << ", " << t.lanes()
<< ">(" << id_value << ", " << name << ", " << id_index << ", " << id_predicate << ");\n";
}
} else if (dense_ramp_base.defined()) {
internal_assert(op->value.type().is_vector());
string op_name;
// TODO(vksnk): generalize this!
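// Same native-lane computation as in the Load visitor: 24- and 48-bit elements map to
// the 64- and 32-lane wide accumulator registers.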
int native_lanes = (64 / op->value.type().element_of().bytes());
if (op->value.type().element_of().bytes() == 3) {
native_lanes = 64;
}
if (op->value.type().element_of().bytes() == 6) {
native_lanes = 32;
}
bool is_aligned_store = (op->alignment.modulus % native_lanes == 0) && (op->alignment.remainder % native_lanes == 0);
if (external_buffers.count(op->name) > 0) {
is_aligned_store = is_aligned_store && (op->param.host_alignment() % 64 == 0);
}
if (is_aligned_store) {
op_name = "aligned_store";
} else {
op_name = "store";
}
if (is_narrowing) {
op_name = op_name + "_narrowing";
}
if (is_sat_narrowing) {
op_name = op_name + "_narrowing_sat";
}
string id_ramp_base = print_expr(dense_ramp_base);
stream << get_indent() << op_name << "<";
if (is_narrowing) {
stream << print_type(value.type());
} else {
stream << print_type(t);
}
stream << ", " << print_type(t.element_of()) << ", " << t.lanes()
<< ">(" << id_value << ", " << name << ", " << id_ramp_base << ");\n";
} else if (op->index.type().is_vector()) {
// If index is a vector, scatter vector elements.
internal_assert(t.is_vector());
string id_index = print_expr(op->index);
stream << get_indent() << "store_scatter<" << print_type(t) << ", "
<< print_type(op->index.type()) << ", "
<< print_type(t.element_of()) << ", " << t.lanes()
<< ">(" << id_value << ", " << name << ", " << id_index << ");\n";
} else {
bool type_cast_needed =
t.is_handle() ||
!allocations.contains(op->name) ||
allocations.get(op->name).type != t;
string id_index = print_expr(op->index);
stream << get_indent();
if (type_cast_needed) {
stream << "((" << print_type(t) << " *)" << name << ")";
} else {
stream << name;
}
stream << "[" << id_index << "] = " << id_value << ";\n";
}
cache.clear();
}
void CodeGen_Xtensa::visit(const Call *op) {
ostringstream rhs;
// Handle intrinsics first
if (op->is_intrinsic(Call::shift_left)) {
internal_assert(op->args.size() == 2);
string a0 = print_expr(op->args[0]);
const int64_t *bits = as_const_int(op->args[1]);
if (is_native_xtensa_vector<uint8_t>(op->type) && bits) {
rhs << "IVP_SLLI2NX8U(" << a0 << ", " << std::to_string(*bits) << ")";
} else if (is_native_xtensa_vector<uint16_t>(op->type) && bits) {
rhs << "IVP_SLLINX16U(" << a0 << ", " << std::to_string(*bits) << ")";
} else if (is_native_xtensa_vector<uint32_t>(op->type) && bits) {
rhs << "IVP_SLLIN_2X32U(" << a0 << ", " << std::to_string(*bits) << ")";
} else {
string a1 = print_expr(op->args[1]);
if (is_native_xtensa_vector<uint8_t>(op->type)) {
rhs << "IVP_SLL2NX8U(" << a0 << ", xb_vec2Nx8U_rtor_xb_vec2Nx8(" << a1 << "))";
} else if (is_native_xtensa_vector<int8_t>(op->type)) {
rhs << "IVP_SLA2NX8(" << a0 << ", " << a1 << ")";
} else if (is_native_xtensa_vector<uint16_t>(op->type)) {
rhs << "IVP_SLLNX16U(" << a0 << ", xb_vecNx16U_rtor_xb_vecNx16(" << a1 << "))";
} else if (is_native_xtensa_vector<int16_t>(op->type)) {
rhs << "IVP_SLANX16(" << a0 << ", " << a1 << ")";
} else if (is_native_xtensa_vector<uint32_t>(op->type)) {
rhs << "IVP_SLLN_2X32U(" << a0 << ", xb_vecN_2x32Uv_rtor_xb_vecN_2x32v( " << a1 << "))";
} else if (is_native_xtensa_vector<int32_t>(op->type)) {
rhs << "IVP_SLAN_2X32(" << a0 << ", " << a1 << ")";
} else {
if (op->args[1].type().is_uint()) {
string a0 = print_expr(op->args[0]);
string a1 = print_expr(op->args[1]);
rhs << a0 << " << " << a1;
} else {
rhs << print_expr(lower_signed_shift_left(op->args[0], op->args[1]));
}
}
}
} else if (op->is_intrinsic(Call::shift_right)) {
internal_assert(op->args.size() == 2);
string a0 = print_expr(op->args[0]);
const int64_t *bits = as_const_int(op->args[1]);
if (is_native_xtensa_vector<uint8_t>(op->type) && bits) {
rhs << "IVP_SRLI2NX8U(" << a0 << ", " << std::to_string(*bits) << ")";
} else if (is_native_xtensa_vector<int8_t>(op->type) && bits) {
rhs << "IVP_SRAI2NX8U(" << a0 << ", " << std::to_string(*bits) << ")";
} else if (is_native_xtensa_vector<int16_t>(op->type) && bits) {
rhs << "IVP_SRAINX16(" << a0 << ", " << std::to_string(*bits) << ")";
} else if (is_native_xtensa_vector<uint16_t>(op->type) && bits) {
rhs << "IVP_SRLINX16U(" << a0 << ", " << std::to_string(*bits) << ")";
} else if (is_native_xtensa_vector<int32_t>(op->type) && bits) {
rhs << "IVP_SRAIN_2X32(" << a0 << ", " << std::to_string(*bits) << ")";
} else if (is_native_xtensa_vector<uint32_t>(op->type) && bits) {
rhs << "IVP_SRLIN_2X32U(" << a0 << ", " << std::to_string(*bits) << ")";
} else {
string a1 = print_expr(op->args[1]);
if (is_native_xtensa_vector<uint8_t>(op->type)) {
rhs << "IVP_SRL2NX8(" << a0 << ", " << a1 << ")";
} else if (is_native_xtensa_vector<int8_t>(op->type)) {
rhs << "IVP_SRA2NX8(" << a0 << ", " << a1 << ")";
} else if (is_native_xtensa_vector<uint16_t>(op->type)) {
rhs << "IVP_SRLNX16(" << a0 << ", " << a1 << ")";
} else if (is_native_xtensa_vector<int16_t>(op->type)) {
rhs << "IVP_SRANX16(" << a0 << ", " << a1 << ")";
} else if (is_native_xtensa_vector<uint32_t>(op->type)) {
rhs << "IVP_SRLN_2X32(" << a0 << ", " << a1 << ")";
} else if (is_native_xtensa_vector<int32_t>(op->type)) {
rhs << "IVP_SRAN_2X32(" << a0 << ", (int32x16_t)" << a1 << ")";
} else {
if (op->args[1].type().is_uint()) {
string a0 = print_expr(op->args[0]);
string a1 = print_expr(op->args[1]);
rhs << a0 << " >> " << a1;
} else {
rhs << print_expr(lower_signed_shift_right(op->args[0], op->args[1]));
}
}
}
} else if (op->is_intrinsic(Call::count_leading_zeros)) {
internal_assert(op->args.size() == 1);
if (is_native_xtensa_vector<int16_t>(op->type) || is_native_xtensa_vector<uint16_t>(op->type)) {
// TODO(vksnk): it seems that Halide's count_leading_zeros always matches the IVP_NSAUN* semantics?
string intrins_name = op->type.is_int() ? "(IVP_NSAUNX16(" : "xb_vecNx16_rtor_xb_vecNx16U(IVP_NSAUNX16U(";
rhs << intrins_name << print_expr(op->args[0]) << "))";
} else if (is_native_xtensa_vector<int32_t>(op->type) || is_native_xtensa_vector<uint32_t>(op->type)) {
// TODO(vksnk): it seems that Halide's count_leading_zeros always matches the IVP_NSAUN* semantics?
string intrins_name = op->type.is_int() ? "(IVP_NSAUN_2X32(" : "xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(IVP_NSAUN_2X32U(";
rhs << intrins_name << print_expr(op->args[0]) << "))";
} else if (op->args[0].type().is_vector()) {
rhs << print_type(op->type) << "::count_leading_zeros(" << print_expr(op->args[0]) << ")";
} else {
string a0 = print_expr(op->args[0]);
rhs << "halide_" << op->name << "(" << a0 << ")";
}
} else if (op->is_intrinsic(Call::prefetch)) {
user_error << "Prefetch is not supported by Xtensa backend." << Expr(op) << "\n";
} else if (op->name == "sqrt_f32") {
string a0 = print_expr(op->args[0]);
if (is_native_xtensa_vector<float>(op->type)) {
rhs << "IVP_FSQRTN_2XF32(" << a0 << ")";
} else {
rhs << "sqrtf(" << a0 << ")";
}
} else if (op->name == "round_f32") {
string a0 = print_expr(op->args[0]);
if (is_native_xtensa_vector<float>(op->type)) {
rhs << "IVP_FIRINTN_2XF32(" << a0 << ")";
} else {
rhs << "nearbyint(" << a0 << ")";
}
} else if (op->name == "floor_f32") {
string a0 = print_expr(op->args[0]);
if (is_native_xtensa_vector<float>(op->type)) {
rhs << "IVP_FIFLOORN_2XF32(" << a0 << ")";
} else {
rhs << "floor_f32(" << a0 << ")";
}
} else if (op->name.find("halide_xtensa_") == 0) {
rhs << print_xtensa_call(op);
} else {
CodeGen_C::visit(op);
return;
}
print_assignment(op->type, rhs.str());
}
void CodeGen_Xtensa::visit(const Cast *op) {
const Type &t = op->type;
const Expr &e = op->value;
string value = print_expr(e);
string type = print_type(t);
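// Casts between same-width signed/unsigned native vector types are pure reinterpretations,
// so they only need the matching _rtor_ helper.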
if ((is_native_xtensa_vector<int16_t>(t) || is_native_xtensa_vector<uint16_t>(t)) && (is_native_xtensa_vector<int16_t>(e.type()) || is_native_xtensa_vector<uint16_t>(e.type()))) {
if (e.type().is_int()) {
id = print_assignment(t, "xb_vecNx16_rtor_xb_vecNx16U(" + value + ")");
} else {
id = print_assignment(t, "xb_vecNx16U_rtor_xb_vecNx16(" + value + ")");
}
} else if ((is_native_xtensa_vector<int32_t>(t) || is_native_xtensa_vector<uint32_t>(t)) && (is_native_xtensa_vector<int32_t>(e.type()) || is_native_xtensa_vector<uint32_t>(e.type()))) {
if (e.type().is_int()) {
id = print_assignment(t, "xb_vecN_2x32v_rtor_xb_vecN_2x32Uv(" + value + ")");
} else {
id = print_assignment(t, "xb_vecN_2x32Uv_rtor_xb_vecN_2x32v(" + value + ")");
}
} else if (t.is_vector() &&
t.lanes() == e.type().lanes() &&
t != e.type()) {
id = print_assignment(t, "convert_to_" + type + "_from_" + print_type(e.type()) + "(" + value + ")");
} else {
id = print_assignment(t, "(" + type + ")(" + value + ")");
}
}
void CodeGen_Xtensa::visit(const For *op) {
current_loop_level++;
string id_min = print_expr(op->min);
string id_extent = print_expr(op->extent);
if (op->for_type == ForType::Parallel) {
stream << get_indent() << "#pragma omp parallel for\n";
} else {
internal_assert(op->for_type == ForType::Serial)
<< "Can only emit serial or parallel for loops to C\n";
}
// NOTE(vksnk): poor man's profiling below.
// if (current_loop_level == 1) {
// open_scope();
// stream << get_indent() << "int cycles_start, cycles_stop, cyclesAV; (void)cycles_stop; (void)cyclesAV;\n";
// stream << get_indent() << "cycles_start = GetCycleCount();\n";
// }
// if (current_loop_level == 1) {
// stream << get_indent() << "cycles_start = GetCycleCount();\n";
// }
stream << get_indent() << "for (int "
<< print_name(op->name)
<< " = " << id_min
<< "; "
<< print_name(op->name)
<< " < " << id_min
<< " + " << id_extent
<< "; "
<< print_name(op->name)
<< "++)\n";
open_scope();
op->body.accept(this);
close_scope("for " + print_name(op->name));
// NOTE(vksnk): Second part of the poor man's profiling below.
// if (current_loop_level == 1) {
// stream << get_indent() << "cycles_stop = GetCycleCount();\n";
// stream << get_indent() << "cyclesAV = cycles_stop - cycles_start;\n";
// stream << get_indent() << "printf(\"" << op->name << ": %d\\n\", cyclesAV);\n";
// }
// if (current_loop_level == 1) {
// close_scope("profiler" + print_name(op->name));
// }
current_loop_level--;
}
void CodeGen_Xtensa::visit(const Shuffle *op) {
internal_assert(!op->vectors.empty());
for (size_t i = 1; i < op->vectors.size(); i++) {
internal_assert(op->vectors[0].type() == op->vectors[i].type());
}
internal_assert(op->type.lanes() == (int)op->indices.size());
const int max_index = (int)(op->vectors[0].type().lanes() * op->vectors.size());
for (int i : op->indices) {
internal_assert(i >= -1 && i < max_index);
}
// Generate intrinsics for the interleave op.
if (op->is_interleave() && (is_native_vector_type(op->vectors[0].type()) || (op->vectors[0].type().is_bool() && op->vectors[0].type().lanes() == 64))) {
string type_suffix = suffix_for_type(op->type);
Expr call = Call::make(op->type, "halide_xtensa_interleave" + type_suffix,
op->vectors, Call::PureExtern);
call.accept(this);
return;
}
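// Stride-1 slices that start within the first few lanes map to the "_right"
// slice intrinsics; slices that end within the last few lanes map to the
// "_left" variants, with the offset re-expressed as a distance from the end of
// the vector.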
if (op->is_slice() && (op->slice_stride() == 1) &&
(is_native_xtensa_vector<int8_t>(op->type) || is_native_xtensa_vector<uint8_t>(op->type) ||
is_native_xtensa_vector<int16_t>(op->type) || is_native_xtensa_vector<uint16_t>(op->type) ||
is_native_xtensa_vector<int32_t>(op->type) || is_native_xtensa_vector<uint32_t>(op->type) ||
is_native_xtensa_vector<float>(op->type))) {
string type_suffix = suffix_for_type(op->type);
string function_name = "halide_xtensa_slice";
int slice_begin = op->slice_begin();
if (op->slice_begin() < 5) {
function_name += "_right";
}
if ((op->type.lanes() - op->slice_begin() < 5) && (op->type.lanes() > op->slice_begin())) {
function_name += "_left";
slice_begin = op->type.lanes() - op->slice_begin();
}
Expr call = Call::make(op->type, function_name + type_suffix,
{op->vectors[0], slice_begin}, Call::PureExtern);
call.accept(this);
return;
}
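// A stride-2 slice of a double-width native vector that keeps exactly half of
// the lanes is an even/odd deinterleave.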
if (op->vectors.size() == 1 && is_double_native_vector_type(op->vectors[0].type())) {
if (op->is_slice() && (op->slice_begin() < 2) && (op->slice_stride() == 2) && ((int)op->indices.size() == op->vectors[0].type().lanes() / 2)) {
string type_suffix = suffix_for_type(op->type);
string function_name = std::string("halide_xtensa_deinterleave") + ((op->slice_begin() == 0) ? "_even" : "_odd");
Expr call = Call::make(op->type, function_name + type_suffix,
{op->vectors[0]}, Call::PureExtern);
call.accept(this);
return;
}
}
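// Concatenation of native vectors maps directly onto
// halide_xtensa_concat_from_native.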
if (op->is_concat() && is_native_vector_type(op->vectors[0].type())) {
Expr call = Call::make(op->type, "halide_xtensa_concat_from_native", op->vectors, Call::PureExtern);
call.accept(this);
return;
}
std::vector<string> vecs;
for (Expr v : op->vectors) {
vecs.push_back(print_expr(v));
}
string src = vecs[0];
Type src_type = op->vectors[0].type();
if (op->vectors.size() > 1) {
ostringstream rhs;
rhs << "concat<"
<< print_type(op->type) << ", "
<< print_type(op->vectors[0].type()) << ", "
<< print_type(op->type.element_of()) << ", "
<< op->type.lanes() << ", "
<< op->vectors[0].type().lanes()
<< ">(" << with_commas(vecs) << ")";
src = print_assignment(op->type, rhs.str());
src_type = src_type.with_lanes(src_type.lanes() * op->vectors.size());
}
ostringstream rhs;
if (op->type.is_scalar()) {
rhs << src << "[" << op->indices[0] << "]";
} else if (op->is_concat()) {
// Do nothing if it's just concat.
return;
} else if (op->type.bits() == 24 && op->vectors[0].type().lanes() == 128 && op->type.is_int()) {
if (op->is_slice() && op->slice_begin() == 0 && op->slice_stride() == 1 && op->indices.size() == 64) {
rhs << src << ".native_vector[0]";
}
if (op->is_slice() && op->slice_begin() == 64 &&
op->slice_stride() == 1 && op->indices.size() == 64) {
rhs << src << ".native_vector[1]";
}
} else {
string indices_name = unique_name('_');
stream << get_indent() << "const int32_t " << indices_name << "[" << op->indices.size() << "] = { " << with_commas(op->indices) << " };\n";
rhs << "shuffle"
<< "<"
<< print_type(src_type) << ", "
<< print_type(op->type) << ", "
<< print_type(op->type.element_of()) << ", " << src_type.lanes()
<< ", " << op->type.lanes()
<< ">(" << src << ", " << indices_name << ")";
}
print_assignment(op->type, rhs.str());
}
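// Allocations are placed on the stack (small constant-size buffers), in TCM
// via halide_tcm_malloc for MemoryType::VTCM, or on the heap via
// halide_malloc. Dynamically sized allocations are checked for 32-bit signed
// overflow before the size is used, and heap/TCM allocations register a
// scoped HalideFreeHelper so the memory is released on every exit path.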
void CodeGen_Xtensa::visit(const Allocate *op) {
open_scope();
string op_name = print_name(op->name);
string op_type = print_type(op->type, AppendSpace);
// Sufficiently small constant-size allocations (see can_allocation_fit_on_stack) may be placed on the stack.
bool on_stack = false;
int32_t constant_size;
string size_id;
Type size_id_type;
if (op->new_expr.defined()) {
Allocation alloc;
alloc.type = op->type;
allocations.push(op->name, alloc);
heap_allocations.push(op->name);
stream << op_type << "*" << op_name << " = (" << print_expr(op->new_expr) << ");\n";
} else {
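// No explicit new_expr: size the allocation ourselves, either from a
// compile-time-constant extent product or from a runtime size computation
// guarded by overflow checks.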
constant_size = op->constant_allocation_size();
if (constant_size > 0) {
int64_t stack_bytes = constant_size * op->type.bytes();
if (stack_bytes > ((int64_t(1) << 31) - 1)) {
user_error << "Total size for allocation "
<< op->name << " is constant but exceeds 2^31 - 1.\n";
} else {
size_id_type = Int(32);
size_id = print_expr(make_const(size_id_type, constant_size));
if (op->memory_type == MemoryType::Stack ||
op->memory_type == MemoryType::Register ||
(op->memory_type == MemoryType::Auto &&
can_allocation_fit_on_stack(stack_bytes))) {
on_stack = true;
}
}
} else {
// Check that the allocation is not scalar (if it were scalar
// it would have constant size).
internal_assert(!op->extents.empty());
size_id = print_assignment(Int(64), print_expr(op->extents[0]));
size_id_type = Int(64);
for (size_t i = 1; i < op->extents.size(); i++) {
// Make the code a little less cluttered for two-dimensional case
string new_size_id_rhs;
string next_extent = print_expr(op->extents[i]);
if (i > 1) {
new_size_id_rhs = "(" + size_id + " > ((int64_t(1) << 31) - 1)) ? " + size_id + " : (" + size_id + " * " + next_extent + ")";
} else {
new_size_id_rhs = size_id + " * " + next_extent;
}
size_id = print_assignment(Int(64), new_size_id_rhs);
}
stream << get_indent() << "if (("
<< size_id << " > ((int64_t(1) << 31) - 1)) || (("
<< size_id << " * sizeof("
<< op_type << ")) > ((int64_t(1) << 31) - 1)))\n";
open_scope();
stream << get_indent();
// TODO: call halide_error_buffer_allocation_too_large() here instead
// TODO: call create_assertion() so that NoAssertions works
stream << "halide_error(_ucon, "
<< "\"32-bit signed overflow computing size of allocation " << op->name << "\\n\");\n";
stream << get_indent() << "return -1;\n";
close_scope("overflow test " + op->name);
}
// Check the condition to see if this allocation should actually be created.
// If the allocation is on the stack, the only condition we can respect is
// unconditional false (otherwise a non-constant-sized array declaration
// will be generated).
if (!on_stack || is_const_zero(op->condition)) {
Expr conditional_size = Select::make(op->condition,
Variable::make(size_id_type, size_id),
make_const(size_id_type, 0));
conditional_size = simplify(conditional_size);
size_id = print_assignment(Int(64), print_expr(conditional_size));
}
Allocation alloc;
alloc.type = op->type;
allocations.push(op->name, alloc);
stream << get_indent() << op_type;
if (on_stack) {
stream << "__attribute__((aligned(64))) " << op_name
<< "[" << size_id << "];\n";
} else if (op->memory_type == MemoryType::VTCM) {
stream << "*"
<< "__attribute__((aligned(64))) "
<< " __restrict "
<< op_name
<< " = ("
<< op_type
<< " *)halide_tcm_malloc(_ucon, sizeof("
<< op_type
<< ")*" << size_id << ");\n";
} else {
stream << "*"
<< "__attribute__((aligned(64))) "
<< " __restrict "
<< op_name
<< " = ("
<< op_type
<< " *)halide_malloc(_ucon, sizeof("
<< op_type
<< ")*" << size_id << ");\n";
heap_allocations.push(op->name);
}
}
if (!on_stack) {
create_assertion(op_name, Call::make(Int(32), "halide_error_out_of_memory", {}, Call::Extern));
string free_function = op->free_function.empty() ?
(op->memory_type != MemoryType::VTCM ? "halide_free" : "halide_tcm_free") :
op->free_function;
stream << get_indent();
stream << "HalideFreeHelper " << op_name << "_free(_ucon, "
<< op_name << ", " << free_function << ");\n";
}
op->body.accept(this);
// Free the memory if it was allocated on the heap and there is no matching
// Free node.
print_heap_free(op->name);
if (allocations.contains(op->name)) {
allocations.pop(op->name);
}
close_scope("alloc " + print_name(op->name));
}
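// A value bound to a clamped_dense_ramp call is substituted directly into the
// body instead of being materialized, presumably so that later loads and
// stores can still pattern-match the dense ramp.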
void CodeGen_Xtensa::visit(const Let *op) {
const auto *call = op->value.as<Call>();
if (call && (call->name == "clamped_dense_ramp")) {
Expr body = substitute(op->name, call, op->body);
body.accept(this);
return;
}
return CodeGen_C::visit(op);
}
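// Same clamped_dense_ramp forwarding as in the Let case, but for statements.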
void CodeGen_Xtensa::visit(const LetStmt *op) {
const auto *call = op->value.as<Call>();
if (call && (call->name == "clamped_dense_ramp")) {
Stmt body = substitute(op->name, call, op->body);
body.accept(this);
return;
}
return CodeGen_C::visit(op);
}
} // namespace Internal
} // namespace Halide