// This file is a part of Julia. License is MIT: http://julialang.org/license

// This is in implementation of the Julia intrinsic functions against boxed types
// excluding the c interface (ccall, cglobal, llvmcall)
//
// this file assumes a little-endian processor, although that isn't too hard to fix
// it also assumes two's complement negative numbers, which might be a bit harder to fix
//
// TODO: add half-float support

#include "julia.h"
#include "julia_internal.h"
#include "APInt-C.h"

const unsigned int host_char_bit = 8;

// run time version of box/unbox intrinsic
JL_DLLEXPORT jl_value_t *jl_reinterpret(jl_value_t *ty, jl_value_t *v)
{
    JL_TYPECHK(reinterpret, datatype, ty);
    if (!jl_is_leaf_type(ty) || !jl_is_bitstype(ty))
        jl_error("reinterpret: target type not a leaf bitstype");
    if (!jl_is_bitstype(jl_typeof(v)))
        jl_error("reinterpret: value not a bitstype");
    if (jl_datatype_size(jl_typeof(v)) != jl_datatype_size(ty))
        jl_error("reinterpret: argument size does not match size of target type");
    if (ty == jl_typeof(v))
        return v;
    if (ty == (jl_value_t*)jl_bool_type)
        return *(uint8_t*)jl_data_ptr(v) & 1 ? jl_true : jl_false;
    return jl_new_bits(ty, jl_data_ptr(v));
}

// run time version of pointerref intrinsic (warning: i is not rooted)
JL_DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i, jl_value_t *align)
{
    JL_TYPECHK(pointerref, pointer, p);
    JL_TYPECHK(pointerref, long, i)
    JL_TYPECHK(pointerref, long, align);
    // TODO: alignment
    jl_value_t *ety = jl_tparam0(jl_typeof(p));
    if (ety == (jl_value_t*)jl_any_type) {
        jl_value_t **pp = (jl_value_t**)(jl_unbox_long(p) + (jl_unbox_long(i)-1)*sizeof(void*));
        return *pp;
    }
    else {
        if (!jl_is_datatype(ety))
            jl_error("pointerref: invalid pointer");
        size_t nb = LLT_ALIGN(jl_datatype_size(ety), ((jl_datatype_t*)ety)->layout->alignment);
        char *pp = (char*)jl_unbox_long(p) + (jl_unbox_long(i)-1)*nb;
        return jl_new_bits(ety, pp);
    }
}

// run time version of pointerset intrinsic (warning: x is not gc-rooted)
JL_DLLEXPORT jl_value_t *jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *i, jl_value_t *align)
{
    JL_TYPECHK(pointerset, pointer, p);
    JL_TYPECHK(pointerset, long, i);
    JL_TYPECHK(pointerref, long, align);
    // TODO: alignment
    jl_value_t *ety = jl_tparam0(jl_typeof(p));
    if (ety == (jl_value_t*)jl_any_type) {
        jl_value_t **pp = (jl_value_t**)(jl_unbox_long(p) + (jl_unbox_long(i)-1)*sizeof(void*));
        *pp = x;
    }
    else {
        if (!jl_is_datatype(ety))
            jl_error("pointerset: invalid pointer");
        size_t nb = LLT_ALIGN(jl_datatype_size(ety), ((jl_datatype_t*)ety)->layout->alignment);
        char *pp = (char*)jl_unbox_long(p) + (jl_unbox_long(i)-1)*nb;
        if (jl_typeof(x) != ety)
            jl_error("pointerset: type mismatch in assign");
        jl_assign_bits(pp, x);
    }
    return p;
}

static inline char signbitbyte(void *a, unsigned bytes)
{
    // sign bit of an signed number of n bytes, as a byte
    return (((signed char*)a)[bytes - 1] < 0) ? ~0 : 0;
}

static inline char usignbitbyte(void *a, unsigned bytes)
{
    // sign bit of an unsigned number
    return 0;
}

static inline unsigned select_by_size(unsigned sz)
{
    /* choose the right sized function specialization */
    switch (sz) {
    default: return 0;
    case  1: return 1;
    case  2: return 2;
    case  4: return 3;
    case  8: return 4;
    case 16: return 5;
    }
}

#define SELECTOR_FUNC(intrinsic) \
    typedef intrinsic##_t select_##intrinsic##_t[6]; \
    static inline intrinsic##_t select_##intrinsic(unsigned sz, const select_##intrinsic##_t list) \
    { \
        intrinsic##_t thunk = list[select_by_size(sz)]; \
        if (!thunk) thunk = list[0]; \
        return thunk; \
    }

#define fp_select(a, func) \
    sizeof(a) == sizeof(float) ? func##f((float)a) : func(a)
#define fp_select2(a, b, func) \
    sizeof(a) == sizeof(float) ? func##f(a, b) : func(a, b)

// fast-function generators //

// integer input
// OP::Function macro(input)
// name::unique string
// nbits::number of bits
// c_type::c_type corresponding to nbits
#define un_iintrinsic_ctype(OP, name, nbits, c_type) \
static inline void jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pr) \
{ \
    c_type a = *(c_type*)pa; \
    *(c_type*)pr = OP(a); \
}

// integer input, unsigned output
// OP::Function macro(input)
// name::unique string
// nbits::number of bits
// c_type::c_type corresponding to nbits
#define uu_iintrinsic_ctype(OP, name, nbits, c_type) \
static inline unsigned jl_##name##nbits(unsigned runtime_nbits, void *pa) \
{ \
    c_type a = *(c_type*)pa; \
    return OP(a); \
}

// floating point
// OP::Function macro(output pointer, input)
// name::unique string
// nbits::number of bits in the *input*
// c_type::c_type corresponding to nbits
#define un_fintrinsic_ctype(OP, name, c_type) \
static inline void name(unsigned osize, void *pa, void *pr) \
{ \
    c_type a = *(c_type*)pa; \
    OP((c_type*)pr, a); \
}

// float or integer inputs
// OP::Function macro(inputa, inputb)
// name::unique string
// nbits::number of bits
// c_type::c_type corresponding to nbits
#define bi_intrinsic_ctype(OP, name, nbits, c_type) \
static void jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb, void *pr) \
{ \
    c_type a = *(c_type*)pa; \
    c_type b = *(c_type*)pb; \
    *(c_type*)pr = (c_type)OP(a, b); \
}

// float or integer inputs, bool output
// OP::Function macro(inputa, inputb)
// name::unique string
// nbits::number of bits
// c_type::c_type corresponding to nbits
#define bool_intrinsic_ctype(OP, name, nbits, c_type) \
static int jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb) \
{ \
    c_type a = *(c_type*)pa; \
    c_type b = *(c_type*)pb; \
    return OP(a, b); \
}

// integer inputs, with precondition test
// OP::Function macro(inputa, inputb)
// name::unique string
// nbits::number of bits
// c_type::c_type corresponding to nbits
#define checked_intrinsic_ctype(CHECK_OP, OP, name, nbits, c_type) \
static int jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb, void *pr) \
{ \
    c_type a = *(c_type*)pa; \
    c_type b = *(c_type*)pb; \
    if (CHECK_OP(a, b)) \
        return 1; \
    *(c_type*)pr = (c_type)OP(a, b); \
    return 0; \
}

// float inputs
// OP::Function macro(inputa, inputb, inputc)
// name::unique string
// nbits::number of bits
// c_type::c_type corresponding to nbits
#define ter_intrinsic_ctype(OP, name, nbits, c_type) \
static void jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb, void *pc, void *pr) \
{ \
    c_type a = *(c_type*)pa; \
    c_type b = *(c_type*)pb; \
    c_type c = *(c_type*)pc; \
    *(c_type*)pr = (c_type)OP(a, b, c); \
}


// unary operator generator //

typedef void (*intrinsic_1_t)(unsigned, void*, void*);
SELECTOR_FUNC(intrinsic_1)
#define un_iintrinsic(name, u) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a) \
{ \
    return jl_iintrinsic_1(jl_typeof(a), a, #name, u##signbitbyte, jl_intrinsiclambda_ty1, name##_list); \
}
#define un_iintrinsic_fast(LLVMOP, OP, name, u) \
un_iintrinsic_ctype(OP, name, 8, u##int##8_t) \
un_iintrinsic_ctype(OP, name, 16, u##int##16_t) \
un_iintrinsic_ctype(OP, name, 32, u##int##32_t) \
un_iintrinsic_ctype(OP, name, 64, u##int##64_t) \
static const select_intrinsic_1_t name##_list = { \
    LLVMOP, \
    jl_##name##8, \
    jl_##name##16, \
    jl_##name##32, \
    jl_##name##64, \
}; \
un_iintrinsic(name, u)
#define un_iintrinsic_slow(LLVMOP, name, u) \
static const select_intrinsic_1_t name##_list = { \
    LLVMOP \
}; \
un_iintrinsic(name, u)

typedef unsigned (*intrinsic_u1_t)(unsigned, void*);
SELECTOR_FUNC(intrinsic_u1)
#define uu_iintrinsic(name, u) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a) \
{ \
    return jl_iintrinsic_1(jl_typeof(a), a, #name, u##signbitbyte, jl_intrinsiclambda_u1, name##_list); \
}
#define uu_iintrinsic_fast(LLVMOP, OP, name, u) \
uu_iintrinsic_ctype(OP, name, 8, u##int##8_t) \
uu_iintrinsic_ctype(OP, name, 16, u##int##16_t) \
uu_iintrinsic_ctype(OP, name, 32, u##int##32_t) \
uu_iintrinsic_ctype(OP, name, 64, u##int##64_t) \
static const select_intrinsic_u1_t name##_list = { \
    LLVMOP, \
    jl_##name##8, \
    jl_##name##16, \
    jl_##name##32, \
    jl_##name##64, \
}; \
uu_iintrinsic(name, u)
#define uu_iintrinsic_slow(LLVMOP, name, u) \
static const select_intrinsic_u1_t name##_list = { \
    LLVMOP \
}; \
uu_iintrinsic(name, u)

static inline
jl_value_t *jl_iintrinsic_1(jl_value_t *ty, jl_value_t *a, const char *name,
                            char (*getsign)(void*, unsigned),
                            jl_value_t *(*lambda1)(jl_value_t*, void*, unsigned, unsigned, const void*), const void *list)
{
    if (!jl_is_bitstype(jl_typeof(a)))
        jl_errorf("%s: value is not a bitstype", name);
    if (!jl_is_bitstype(ty))
        jl_errorf("%s: type is not a bitstype", name);
    void *pa = jl_data_ptr(a);
    unsigned isize = jl_datatype_size(jl_typeof(a));
    unsigned isize2 = next_power_of_two(isize);
    unsigned osize = jl_datatype_size(ty);
    unsigned osize2 = next_power_of_two(osize);
    if (isize2 > osize2)
        osize2 = isize2;
    if (osize2 > isize || isize2 > isize) {
        /* if needed, round type up to a real c-type and set/clear the unused bits */
        void *pa2;
        pa2 = alloca(osize2);
        /* TODO: this memcpy assumes little-endian,
         * for big-endian, need to align the copy to the other end */ \
        memcpy(pa2, pa, isize);
        memset((char*)pa2 + isize, getsign(pa, isize), osize2 - isize);
        pa = pa2;
    }
    jl_value_t *newv = lambda1(ty, pa, osize, osize2, list);
    if (ty == (jl_value_t*)jl_bool_type)
        return *(uint8_t*)jl_data_ptr(newv) & 1 ? jl_true : jl_false;
    return newv;
}

static inline jl_value_t *jl_intrinsiclambda_ty1(jl_value_t *ty, void *pa, unsigned osize, unsigned osize2, const void *voidlist)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    jl_value_t *newv = jl_gc_alloc(ptls, ((jl_datatype_t*)ty)->size, ty);
    intrinsic_1_t op = select_intrinsic_1(osize2, (const intrinsic_1_t*)voidlist);
    op(osize * host_char_bit, pa, jl_data_ptr(newv));
    return newv;
}

static inline jl_value_t *jl_intrinsiclambda_u1(jl_value_t *ty, void *pa, unsigned osize, unsigned osize2, const void *voidlist)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    jl_value_t *newv = jl_gc_alloc(ptls, ((jl_datatype_t*)ty)->size, ty);
    intrinsic_u1_t op = select_intrinsic_u1(osize2, (const intrinsic_u1_t*)voidlist);
    unsigned cnt = op(osize * host_char_bit, pa);
    // TODO: the following memset/memcpy assumes little-endian
    // for big-endian, need to copy from the other end of cnt
    if (osize > sizeof(unsigned)) {
        // perform zext, if needed
        memset((char*)jl_data_ptr(newv) + sizeof(unsigned), 0, osize - sizeof(unsigned));
        osize = sizeof(unsigned);
    }
    memcpy(jl_data_ptr(newv), &cnt, osize);
    return newv;
}

// conversion operator

typedef void (*intrinsic_cvt_t)(unsigned, void*, unsigned, void*);
typedef unsigned (*intrinsic_cvt_check_t)(unsigned, unsigned, void*);
#define cvt_iintrinsic_checked(LLVMOP, check_op, name) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *ty, jl_value_t *a) \
{ \
    return jl_intrinsic_cvt(ty, a, #name, LLVMOP, check_op); \
}
#define cvt_iintrinsic(LLVMOP, name) \
    cvt_iintrinsic_checked(LLVMOP, NULL, name) \

static inline jl_value_t *jl_intrinsic_cvt(jl_value_t *ty, jl_value_t *a, const char *name, intrinsic_cvt_t op, intrinsic_cvt_check_t check_op)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    jl_value_t *aty = jl_typeof(a);
    if (!jl_is_bitstype(aty))
        jl_errorf("%s: value is not a bitstype", name);
    if (!jl_is_bitstype(ty))
        jl_errorf("%s: type is not a bitstype", name);
    void *pa = jl_data_ptr(a);
    unsigned isize = jl_datatype_size(aty);
    unsigned osize = jl_datatype_size(ty);
    if (check_op && check_op(isize, osize, pa))
        jl_throw(jl_inexact_exception);
    jl_value_t *newv = jl_gc_alloc(ptls, ((jl_datatype_t*)ty)->size, ty);
    op(aty == (jl_value_t*)jl_bool_type ? 1 : isize * host_char_bit, pa,
            osize * host_char_bit, jl_data_ptr(newv));
    if (ty == (jl_value_t*)jl_bool_type)
        return *(uint8_t*)jl_data_ptr(newv) & 1 ? jl_true : jl_false;
    return newv;
}

// floating point

#define un_fintrinsic_withtype(OP, name) \
un_fintrinsic_ctype(OP, jl_##name##32, float) \
un_fintrinsic_ctype(OP, jl_##name##64, double) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *ty, jl_value_t *a) \
{ \
    return jl_fintrinsic_1(ty, a, #name, jl_##name##32, jl_##name##64); \
}

#define un_fintrinsic(OP, name) \
un_fintrinsic_withtype(OP, name##_withtype) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a) \
{ \
    return jl_##name##_withtype(jl_typeof(a), a); \
}

typedef void (fintrinsic_op1)(unsigned, void*, void*);

static inline jl_value_t *jl_fintrinsic_1(jl_value_t *ty, jl_value_t *a, const char *name, fintrinsic_op1 *floatop, fintrinsic_op1 *doubleop)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    if (!jl_is_bitstype(jl_typeof(a)))
        jl_errorf("%s: value is not a bitstype", name);
    if (!jl_is_bitstype(ty))
        jl_errorf("%s: type is not a bitstype", name);
    unsigned sz2 = jl_datatype_size(ty);
    jl_value_t *newv = jl_gc_alloc(ptls, sz2, ty);
    void *pa = jl_data_ptr(a), *pr = jl_data_ptr(newv);
    unsigned sz = jl_datatype_size(jl_typeof(a));
    switch (sz) {
    /* choose the right size c-type operation based on the input */
    case 4:
        floatop(sz2 * host_char_bit, pa, pr);
        break;
    case 8:
        doubleop(sz2 * host_char_bit, pa, pr);
        break;
    default:
        jl_errorf("%s: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64", name);
    }
    return newv;
}

// binary operator generator //

// integer

typedef void (*intrinsic_2_t)(unsigned, void*, void*, void*);
SELECTOR_FUNC(intrinsic_2)
#define bi_iintrinsic(name, u, cvtb) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
{ \
    return jl_iintrinsic_2(a, b, #name, u##signbitbyte, jl_intrinsiclambda_2, name##_list, cvtb); \
}
#define bi_iintrinsic_cnvtb_fast(LLVMOP, OP, name, u, cvtb) \
bi_intrinsic_ctype(OP, name, 8, u##int##8_t) \
bi_intrinsic_ctype(OP, name, 16, u##int##16_t) \
bi_intrinsic_ctype(OP, name, 32, u##int##32_t) \
bi_intrinsic_ctype(OP, name, 64, u##int##64_t) \
static const select_intrinsic_2_t name##_list = { \
    LLVMOP, \
    jl_##name##8, \
    jl_##name##16, \
    jl_##name##32, \
    jl_##name##64, \
}; \
bi_iintrinsic(name, u, cvtb)
#define bi_iintrinsic_fast(LLVMOP, OP, name, u) \
    bi_iintrinsic_cnvtb_fast(LLVMOP, OP, name, u, 0)

typedef int (*intrinsic_cmp_t)(unsigned, void*, void*);
SELECTOR_FUNC(intrinsic_cmp)
#define cmp_iintrinsic(name, u) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
{ \
    return jl_iintrinsic_2(a, b, #name, u##signbitbyte, jl_intrinsiclambda_cmp, name##_list, 0); \
}
#define bool_iintrinsic_fast(LLVMOP, OP, name, u) \
bool_intrinsic_ctype(OP, name, 8, u##int##8_t) \
bool_intrinsic_ctype(OP, name, 16, u##int##16_t) \
bool_intrinsic_ctype(OP, name, 32, u##int##32_t) \
bool_intrinsic_ctype(OP, name, 64, u##int##64_t) \
static const select_intrinsic_cmp_t name##_list = { \
    LLVMOP, \
    jl_##name##8, \
    jl_##name##16, \
    jl_##name##32, \
    jl_##name##64, \
}; \
cmp_iintrinsic(name, u)

typedef int (*intrinsic_checked_t)(unsigned, void*, void*, void*);
SELECTOR_FUNC(intrinsic_checked)
#define checked_iintrinsic(name, u) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
{ \
    return jl_iintrinsic_2(a, b, #name, u##signbitbyte, jl_intrinsiclambda_checked, name##_list, 0); \
}
#define checked_iintrinsic_fast(LLVMOP, CHECK_OP, OP, name, u) \
checked_intrinsic_ctype(CHECK_OP, OP, name, 8, u##int##8_t) \
checked_intrinsic_ctype(CHECK_OP, OP, name, 16, u##int##16_t) \
checked_intrinsic_ctype(CHECK_OP, OP, name, 32, u##int##32_t) \
checked_intrinsic_ctype(CHECK_OP, OP, name, 64, u##int##64_t) \
static const select_intrinsic_checked_t name##_list = { \
    LLVMOP, \
    jl_##name##8, \
    jl_##name##16, \
    jl_##name##32, \
    jl_##name##64, \
}; \
checked_iintrinsic(name, u)
#define checked_iintrinsic_slow(LLVMOP, name, u) \
static const select_intrinsic_checked_t name##_list = { \
    LLVMOP \
}; \
checked_iintrinsic(name, u)

static inline
jl_value_t *jl_iintrinsic_2(jl_value_t *a, jl_value_t *b, const char *name,
                            char (*getsign)(void*, unsigned),
                            jl_value_t *(*lambda2)(jl_value_t*, void*, void*, unsigned, unsigned, const void*),
                            const void *list, int cvtb)
{
    jl_value_t *ty = jl_typeof(a);
    jl_value_t *tyb = jl_typeof(b);
    if (tyb != ty) {
        if (!cvtb)
            jl_errorf("%s: types of a and b must match", name);
        if (!jl_is_bitstype(tyb))
            jl_errorf("%s: b is not a bitstypes", name);
    }
    if (!jl_is_bitstype(ty))
        jl_errorf("%s: a is not a bitstypes", name);
    void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b);
    unsigned sz = jl_datatype_size(ty);
    unsigned sz2 = next_power_of_two(sz);
    unsigned szb = jl_datatype_size(tyb);
    if (sz2 > sz) {
        /* round type up to the appropriate c-type and set/clear the unused bits */
        void *pa2 = alloca(sz2);
        memcpy(pa2, pa, sz);
        memset((char*)pa2 + sz, getsign(pa, sz), sz2 - sz);
        pa = pa2;
    }
    if (sz2 > szb) {
        /* round type up to the appropriate c-type and set/clear/truncate the unused bits */
        void *pb2 = alloca(sz2);
        memcpy(pb2, pb, szb);
        memset((char*)pb2 + szb, getsign(pb, sz), sz2 - szb);
        pb = pb2;
    }
    jl_value_t *newv = lambda2(ty, pa, pb, sz, sz2, list);
    return newv;
}

static inline jl_value_t *jl_intrinsiclambda_2(jl_value_t *ty, void *pa, void *pb, unsigned sz, unsigned sz2, const void *voidlist)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    jl_value_t *newv = jl_gc_alloc(ptls, ((jl_datatype_t*)ty)->size, ty);
    intrinsic_2_t op = select_intrinsic_2(sz2, (const intrinsic_2_t*)voidlist);
    op(sz * host_char_bit, pa, pb, jl_data_ptr(newv));
    if (ty == (jl_value_t*)jl_bool_type)
        return *(uint8_t*)jl_data_ptr(newv) & 1 ? jl_true : jl_false;
    return newv;
}

static inline jl_value_t *jl_intrinsiclambda_cmp(jl_value_t *ty, void *pa, void *pb, unsigned sz, unsigned sz2, const void *voidlist)
{
    intrinsic_cmp_t op = select_intrinsic_cmp(sz2, (const intrinsic_cmp_t*)voidlist);
    int cmp = op(sz * host_char_bit, pa, pb);
    return cmp ? jl_true : jl_false;
}

static inline jl_value_t *jl_intrinsiclambda_checked(jl_value_t *ty, void *pa, void *pb, unsigned sz, unsigned sz2, const void *voidlist)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    jl_value_t *newv = jl_gc_alloc(ptls, ((jl_datatype_t*)ty)->size, ty);
    intrinsic_checked_t op = select_intrinsic_checked(sz2, (const intrinsic_checked_t*)voidlist);
    int ovflw = op(sz * host_char_bit, pa, pb, jl_data_ptr(newv));
    if (ovflw)
        jl_throw(jl_overflow_exception);
    if (ty == (jl_value_t*)jl_bool_type)
        return *(uint8_t*)jl_data_ptr(newv) & 1 ? jl_true : jl_false;
    return newv;
}

// floating point

#define bi_fintrinsic(OP, name) \
    bi_intrinsic_ctype(OP, name, 32, float) \
    bi_intrinsic_ctype(OP, name, 64, double) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
{ \
    jl_ptls_t ptls = jl_get_ptls_states();\
    jl_value_t *ty = jl_typeof(a); \
    if (jl_typeof(b) != ty) \
        jl_error(#name ": types of a and b must match"); \
    if (!jl_is_bitstype(ty)) \
        jl_error(#name ": values are not bitstypes"); \
    int sz = jl_datatype_size(ty); \
    jl_value_t *newv = jl_gc_alloc(ptls, sz, ty);          \
    void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pr = jl_data_ptr(newv); \
    switch (sz) { \
    /* choose the right size c-type operation */ \
    case 4: \
        jl_##name##32(32, pa, pb, pr); \
        break; \
    case 8: \
        jl_##name##64(64, pa, pb, pr); \
        break; \
    default: \
        jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \
    } \
    return newv; \
}

#define bool_fintrinsic(OP, name) \
    bool_intrinsic_ctype(OP, name, 32, float) \
    bool_intrinsic_ctype(OP, name, 64, double) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
{ \
    jl_value_t *ty = jl_typeof(a); \
    if (jl_typeof(b) != ty) \
        jl_error(#name ": types of a and b must match"); \
    if (!jl_is_bitstype(ty)) \
        jl_error(#name ": values are not bitstypes"); \
    void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b); \
    int sz = jl_datatype_size(ty); \
    int cmp; \
    switch (sz) { \
    /* choose the right size c-type operation */ \
    case 4: \
        cmp = jl_##name##32(32, pa, pb); \
        break; \
    case 8: \
        cmp = jl_##name##64(64, pa, pb); \
        break; \
    default: \
        jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \
    } \
    return cmp ? jl_true : jl_false; \
}

#define ter_fintrinsic(OP, name) \
    ter_intrinsic_ctype(OP, name, 32, float) \
    ter_intrinsic_ctype(OP, name, 64, double) \
JL_DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c) \
{ \
    jl_ptls_t ptls = jl_get_ptls_states();\
    jl_value_t *ty = jl_typeof(a); \
    if (jl_typeof(b) != ty || jl_typeof(c) != ty) \
        jl_error(#name ": types of a, b, and c must match"); \
    if (!jl_is_bitstype(ty)) \
        jl_error(#name ": values are not bitstypes"); \
    int sz = jl_datatype_size(ty);                                      \
    jl_value_t *newv = jl_gc_alloc(ptls, sz, ty);                       \
    void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pc = jl_data_ptr(c), *pr = jl_data_ptr(newv); \
    switch (sz) { \
    /* choose the right size c-type operation */ \
    case 4: \
        jl_##name##32(32, pa, pb, pc, pr); \
        break; \
    case 8: \
        jl_##name##64(64, pa, pb, pc, pr); \
        break; \
    default: \
        jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \
    } \
    return newv; \
}

// arithmetic
#define neg(a) -a
#define neg_float(pr, a) *pr = -a
un_iintrinsic_fast(LLVMNeg, neg, neg_int, u)
#define add(a,b) a + b
bi_iintrinsic_fast(LLVMAdd, add, add_int, u)
#define sub(a,b) a - b
bi_iintrinsic_fast(LLVMSub, sub, sub_int, u)
#define mul(a,b) a * b
bi_iintrinsic_fast(LLVMMul, mul, mul_int, u)
#define div(a,b) a / b
bi_iintrinsic_fast(LLVMSDiv, div, sdiv_int,  )
bi_iintrinsic_fast(LLVMUDiv, div, udiv_int, u)
#define rem(a,b) a % b
bi_iintrinsic_fast(LLVMSRem, rem, srem_int,  )
bi_iintrinsic_fast(LLVMURem, rem, urem_int, u)
#define smod(a,b) ((a < 0) == (b < 0)) ? a % b : (b + (a % b)) % b
bi_iintrinsic_fast(jl_LLVMSMod, smod, smod_int,  )
#define frem(a, b) \
    fp_select2(a, b, fmod)

un_fintrinsic(neg_float,neg_float)
bi_fintrinsic(add,add_float)
bi_fintrinsic(sub,sub_float)
bi_fintrinsic(mul,mul_float)
bi_fintrinsic(div,div_float)
bi_fintrinsic(frem,rem_float)

// ternary operators //
#define fma(a, b, c) \
    sizeof(a) == sizeof(float) ? fmaf(a, b, c) : fma(a, b, c)
#define muladd(a, b, c) a * b + c
ter_fintrinsic(fma,fma_float)
ter_fintrinsic(muladd,muladd_float)

// same-type comparisons
#define eq(a,b) a == b
bool_iintrinsic_fast(LLVMICmpEQ, eq, eq_int, u)
#define ne(a,b) a != b
bool_iintrinsic_fast(LLVMICmpNE, ne, ne_int, u)
#define lt(a,b) a < b
bool_iintrinsic_fast(LLVMICmpSLT, lt, slt_int,  )
bool_iintrinsic_fast(LLVMICmpULT, lt, ult_int, u)
#define le(a,b) a <= b
bool_iintrinsic_fast(LLVMICmpSLE, le, sle_int,  )
bool_iintrinsic_fast(LLVMICmpULE, le, ule_int, u)

typedef union {
    float f;
    int32_t d;
    uint32_t ud;
} bits32;
typedef union {
    double f;
    int64_t d;
    uint64_t ud;
} bits64;

#define fpiseq_n(c_type, nbits) \
static inline int fpiseq##nbits(c_type a, c_type b) { \
    bits##nbits ua, ub; \
    ua.f = a; \
    ub.f = b; \
    return (isnan(a) && isnan(b)) || ua.d == ub.d; \
}
fpiseq_n(float, 32)
fpiseq_n(double, 64)
#define fpiseq(a,b) \
    sizeof(a) == sizeof(float) ? fpiseq32(a, b) : fpiseq64(a, b)

#define fpislt_n(c_type, nbits) \
static inline int fpislt##nbits(c_type a, c_type b) { \
    bits##nbits ua, ub; \
    ua.f = a; \
    ub.f = b; \
    if (!isnan(a) && isnan(b)) \
        return 1; \
    if (isnan(a) || isnan(b)) \
        return 0; \
    if (ua.d >= 0 && ua.d < ub.d) \
        return 1; \
    if (ua.d < 0 && ua.ud > ub.ud) \
        return 1; \
    return 0; \
}
fpislt_n(float, 32)
fpislt_n(double, 64)
#define fpislt(a, b) \
    sizeof(a) == sizeof(float) ? fpislt32(a, b) : fpislt64(a, b)

bool_fintrinsic(eq,eq_float)
bool_fintrinsic(ne,ne_float)
bool_fintrinsic(lt,lt_float)
bool_fintrinsic(le,le_float)
bool_fintrinsic(fpiseq,fpiseq)
bool_fintrinsic(fpislt,fpislt)

// bitwise operators
#define and_op(a,b) a & b
bi_iintrinsic_fast(LLVMAnd, and_op, and_int, u)
#define or_op(a,b) a | b
bi_iintrinsic_fast(LLVMOr, or_op, or_int, u)
#define xor_op(a,b) a ^ b
bi_iintrinsic_fast(LLVMXor, xor_op, xor_int, u)
#define shl_op(a,b) b >= 8 * sizeof(a) ? 0 : a << b
bi_iintrinsic_cnvtb_fast(LLVMShl, shl_op, shl_int, u, 1)
#define lshr_op(a,b) (b >= 8 * sizeof(a)) ? 0 : a >> b
bi_iintrinsic_cnvtb_fast(LLVMLShr, lshr_op, lshr_int, u, 1)
#define ashr_op(a,b) \
        /* if ((signed)a > 0) [in two's complement] ? ... : ...) */ \
        (a >> (host_char_bit * sizeof(a) - 1)) ? ~(b >= 8 * sizeof(a) ? 0 : (~a) >> b) : (b >= 8 * sizeof(a) ? 0 : a >> b)
bi_iintrinsic_cnvtb_fast(LLVMAShr, ashr_op, ashr_int, u, 1)
//#define bswap_op(a) __builtin_bswap(a)
//un_iintrinsic_fast(LLVMByteSwap, bswap_op, bswap_int, u)
un_iintrinsic_slow(LLVMByteSwap, bswap_int, u)
//#define ctpop_op(a) __builtin_ctpop(a)
//uu_iintrinsic_fast(LLVMCountPopulation, ctpop_op, ctpop_int, u)
uu_iintrinsic_slow(LLVMCountPopulation, ctpop_int, u)
//#define ctlz_op(a) __builtin_ctlz(a)
//uu_iintrinsic_fast(LLVMCountLeadingZeros, ctlz_op, ctlz_int, u)
uu_iintrinsic_slow(LLVMCountLeadingZeros, ctlz_int, u)
//#define cttz_op(a) __builtin_cttz(a)
//uu_iintrinsic_fast(LLVMCountTrailingZeros, cttz_op, cttz_int, u)
uu_iintrinsic_slow(LLVMCountTrailingZeros, cttz_int, u)
#define not_op(a) ~a
un_iintrinsic_fast(LLVMFlipAllBits, not_op, not_int, u)

// conversions
cvt_iintrinsic(LLVMTrunc, trunc_int)
cvt_iintrinsic(LLVMSExt, sext_int)
cvt_iintrinsic(LLVMZExt, zext_int)
cvt_iintrinsic(LLVMSItoFP, sitofp)
cvt_iintrinsic(LLVMUItoFP, uitofp)
cvt_iintrinsic(LLVMFPtoSI, fptosi)
cvt_iintrinsic(LLVMFPtoUI, fptoui)

#define fpcvt(pr, a) \
        if (osize == 32) \
            *(float*)pr = a; \
        else if (osize == 64) \
            *(double*)pr = a; \
        else \
            jl_error("fptrunc/fpext: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
un_fintrinsic_withtype(fpcvt,fptrunc)
un_fintrinsic_withtype(fpcvt,fpext)

JL_DLLEXPORT jl_value_t *jl_fptoui_auto(jl_value_t *a)
{
    jl_datatype_t *ty;
    switch (jl_datatype_size(jl_typeof(a))) {
        case 4:
            ty = jl_uint32_type;
            break;
        case 8:
            ty = jl_uint64_type;
            break;
        default:
            jl_error("fptoui: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
    }
    return jl_fptoui((jl_value_t*)ty, a);
}
JL_DLLEXPORT jl_value_t *jl_fptosi_auto(jl_value_t *a)
{
    jl_datatype_t *ty;
    switch (jl_datatype_size(jl_typeof(a))) {
        case 4:
            ty = jl_int32_type;
            break;
        case 8:
            ty = jl_int64_type;
            break;
        default:
            jl_error("fptoui: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
    }
    return jl_fptosi((jl_value_t*)ty, a);
}

// checked conversion
static inline int all_eq(char *p, char n, char v)
{
    // computes p[0:n] == v
    while (n--)
        if (*p++ != v)
            return 0;
    return 1;
}
static unsigned check_trunc_sint(unsigned isize, unsigned osize, void *pa)
{
    return !all_eq((char*)pa + osize, isize - osize, signbitbyte(pa, isize)); // TODO: assumes little-endian
}
cvt_iintrinsic_checked(LLVMTrunc, check_trunc_sint, checked_trunc_sint)
static unsigned check_trunc_uint(unsigned isize, unsigned osize, void *pa)
{
    return !all_eq((char*)pa + osize, isize - osize, 0); // TODO: assumes little-endian
}
cvt_iintrinsic_checked(LLVMTrunc, check_trunc_uint, checked_trunc_uint)

#define checked_fptosi(pr, a) \
        if (!LLVMFPtoSI_exact(sizeof(a) * host_char_bit, pa, osize, pr)) \
            jl_throw(jl_inexact_exception);
un_fintrinsic_withtype(checked_fptosi, checked_fptosi)
#define checked_fptoui(pr, a) \
        if (!LLVMFPtoUI_exact(sizeof(a) * host_char_bit, pa, osize, pr)) \
            jl_throw(jl_inexact_exception);
un_fintrinsic_withtype(checked_fptoui, checked_fptoui)

JL_DLLEXPORT jl_value_t *jl_check_top_bit(jl_value_t *a)
{
    jl_value_t *ty = jl_typeof(a);
    if (!jl_is_bitstype(ty))
        jl_error("check_top_bit: value is not a bitstype");
    if (signbitbyte(jl_data_ptr(a), jl_datatype_size(ty)))
        jl_throw(jl_inexact_exception);
    return a;
}

// checked arithmetic
#define check_sadd_int(a,b) \
        /* this test is a reduction of (b > 0) ? (a + b > typemax(a)) : (a + b < typemin(a)) ==> overflow \
         * where (a - a) == (typeof(a))0 */ \
        (b > 0) ? (a > ~((a - a + 1) << (8 * sizeof(a) - 1)) - b) : (a < ((a - a + 1) << (8 * sizeof(a) - 1)) - b)
checked_iintrinsic_fast(LLVMAdd_sov, check_sadd_int, add, checked_sadd_int,  )
#define check_uadd_int(a,b) \
        /* this test checks for (a + b) > typemax(a) ==> overflow */ \
        a >= -b
checked_iintrinsic_fast(LLVMAdd_uov, check_uadd_int, add, checked_uadd_int, u)
#define check_ssub_int(a,b) check_sadd_int(a,-b)
checked_iintrinsic_fast(LLVMSub_sov, check_ssub_int, sub, checked_ssub_int,  )
#define check_usub_int(a,b) \
        /* this test checks for (a - b) < 0 ==> overflow */ \
        a < b
checked_iintrinsic_fast(LLVMSub_uov, check_usub_int, sub, checked_usub_int, u)
checked_iintrinsic_slow(LLVMMul_sov, checked_smul_int,  )
checked_iintrinsic_slow(LLVMMul_uov, checked_umul_int, u)
checked_iintrinsic_slow(LLVMDiv_sov, checked_sdiv_int,  )
checked_iintrinsic_slow(LLVMDiv_uov, checked_udiv_int, u)
checked_iintrinsic_slow(LLVMRem_sov, checked_srem_int,  )
checked_iintrinsic_slow(LLVMRem_uov, checked_urem_int, u)

// functions
#define flipsign(a, b) \
        (b >= 0) ? a : -a
bi_iintrinsic_fast(jl_LLVMFlipSign, flipsign, flipsign_int,  )
#define abs_float(pr, a) *pr = fp_select(a, fabs)
#define ceil_float(pr, a) *pr = fp_select(a, ceil)
#define floor_float(pr, a) *pr = fp_select(a, floor)
#define trunc_float(pr, a) *pr = fp_select(a, trunc)
#define rint_float(pr, a) *pr = fp_select(a, rint)
#define sqrt_float(pr, a) \
        if (a < 0) \
            jl_throw(jl_domain_exception); \
        *pr = fp_select(a, sqrt)
#define copysign_float(a, b) \
        fp_select2(a, b, copysign)

un_fintrinsic(abs_float,abs_float)
bi_fintrinsic(copysign_float,copysign_float)
un_fintrinsic(ceil_float,ceil_llvm)
un_fintrinsic(floor_float,floor_llvm)
un_fintrinsic(trunc_float,trunc_llvm)
un_fintrinsic(rint_float,rint_llvm)
un_fintrinsic(sqrt_float,sqrt_llvm)

JL_DLLEXPORT jl_value_t *jl_powi_llvm(jl_value_t *a, jl_value_t *b)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    jl_value_t *ty = jl_typeof(a);
    if (!jl_is_bitstype(ty))
        jl_error("powi_llvm: a is not a bitstype");
    if (!jl_is_bitstype(jl_typeof(b)) || jl_datatype_size(jl_typeof(b)) != 4)
        jl_error("powi_llvm: b is not a 32-bit bitstype");
    int sz = jl_datatype_size(ty);
    jl_value_t *newv = jl_gc_alloc(ptls, sz, ty);
    void *pa = jl_data_ptr(a), *pr = jl_data_ptr(newv);
    switch (sz) {
    /* choose the right size c-type operation */
    case 4:
        *(float*)pr = powf(*(float*)pa, (float)jl_unbox_int32(b));
        break;
    case 8:
        *(double*)pr = pow(*(double*)pa, (double)jl_unbox_int32(b));
        break;
    default:
        jl_error("powi_llvm: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
    }
    return newv;
}

JL_DLLEXPORT jl_value_t *jl_select_value(jl_value_t *isfalse, jl_value_t *a, jl_value_t *b)
{
    JL_TYPECHK(isfalse, bool, isfalse);
    return (isfalse == jl_false ? b : a);
}

JL_DLLEXPORT jl_value_t *jl_arraylen(jl_value_t *a)
{
    return jl_box_long(jl_array_len((jl_array_t*)a));
}