https://github.com/JuliaLang/julia
Raw File
Tip revision: 0d623ab60f8b501dbd7457cec5fdef26793e5281 authored by Jeff Bezanson on 10 January 2024, 04:01:02 UTC
give consistent results for `methods` with and without limiting matches
Tip revision: 0d623ab
float.jl
# This file is a part of Julia. License is MIT: https://julialang.org/license

const IEEEFloat = Union{Float16, Float32, Float64}

## floating point traits ##

"""
    Inf16

Positive infinity of type [`Float16`](@ref).
"""
const Inf16 = bitcast(Float16, 0x7c00)
"""
    NaN16

A not-a-number value of type [`Float16`](@ref).
"""
const NaN16 = bitcast(Float16, 0x7e00)
"""
    Inf32

Positive infinity of type [`Float32`](@ref).
"""
const Inf32 = bitcast(Float32, 0x7f800000)
"""
    NaN32

A not-a-number value of type [`Float32`](@ref).
"""
const NaN32 = bitcast(Float32, 0x7fc00000)
const Inf64 = bitcast(Float64, 0x7ff0000000000000)
const NaN64 = bitcast(Float64, 0x7ff8000000000000)

const Inf = Inf64
"""
    Inf, Inf64

Positive infinity of type [`Float64`](@ref).

See also: [`isfinite`](@ref), [`typemax`](@ref), [`NaN`](@ref), [`Inf32`](@ref).

# Examples
```jldoctest
julia> π/0
Inf

julia> +1.0 / -0.0
-Inf

julia> ℯ^-Inf
0.0
```
"""
Inf, Inf64

const NaN = NaN64
"""
    NaN, NaN64

A not-a-number value of type [`Float64`](@ref).

See also: [`isnan`](@ref), [`missing`](@ref), [`NaN32`](@ref), [`Inf`](@ref).

# Examples
```jldoctest
julia> 0/0
NaN

julia> Inf - Inf
NaN

julia> NaN == NaN, isequal(NaN, NaN), NaN === NaN
(false, true, true)
```
"""
NaN, NaN64

# bit patterns
reinterpret(::Type{Unsigned}, x::Float64) = reinterpret(UInt64, x)
reinterpret(::Type{Unsigned}, x::Float32) = reinterpret(UInt32, x)
reinterpret(::Type{Unsigned}, x::Float16) = reinterpret(UInt16, x)
reinterpret(::Type{Signed}, x::Float64) = reinterpret(Int64, x)
reinterpret(::Type{Signed}, x::Float32) = reinterpret(Int32, x)
reinterpret(::Type{Signed}, x::Float16) = reinterpret(Int16, x)

sign_mask(::Type{Float64}) =        0x8000_0000_0000_0000
exponent_mask(::Type{Float64}) =    0x7ff0_0000_0000_0000
exponent_one(::Type{Float64}) =     0x3ff0_0000_0000_0000
exponent_half(::Type{Float64}) =    0x3fe0_0000_0000_0000
significand_mask(::Type{Float64}) = 0x000f_ffff_ffff_ffff

sign_mask(::Type{Float32}) =        0x8000_0000
exponent_mask(::Type{Float32}) =    0x7f80_0000
exponent_one(::Type{Float32}) =     0x3f80_0000
exponent_half(::Type{Float32}) =    0x3f00_0000
significand_mask(::Type{Float32}) = 0x007f_ffff

sign_mask(::Type{Float16}) =        0x8000
exponent_mask(::Type{Float16}) =    0x7c00
exponent_one(::Type{Float16}) =     0x3c00
exponent_half(::Type{Float16}) =    0x3800
significand_mask(::Type{Float16}) = 0x03ff

mantissa(x::T) where {T} = reinterpret(Unsigned, x) & significand_mask(T)

for T in (Float16, Float32, Float64)
    @eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T)))
    @eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1)
    @eval exponent_bias(::Type{$T}) = $(Int(exponent_one(T) >> significand_bits(T)))
    # maximum float exponent
    @eval exponent_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)) - exponent_bias(T) - 1)
    # maximum float exponent without bias
    @eval exponent_raw_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)))
end

"""
    exponent_max(T)

Maximum [`exponent`](@ref) value for a floating point number of type `T`.

# Examples
```jldoctest
julia> Base.exponent_max(Float64)
1023
```

Note, `exponent_max(T) + 1` is a possible value of the exponent field
with bias, which might be used as sentinel value for `Inf` or `NaN`.
"""
function exponent_max end

"""
    exponent_raw_max(T)

Maximum value of the [`exponent`](@ref) field for a floating point number of type `T` without bias,
i.e. the maximum integer value representable by [`exponent_bits(T)`](@ref) bits.
"""
function exponent_raw_max end

"""
IEEE 754 definition of the minimum exponent.
"""
ieee754_exponent_min(::Type{T}) where {T<:IEEEFloat} = Int(1 - exponent_max(T))::Int

exponent_min(::Type{Float16}) = ieee754_exponent_min(Float16)
exponent_min(::Type{Float32}) = ieee754_exponent_min(Float32)
exponent_min(::Type{Float64}) = ieee754_exponent_min(Float64)

function ieee754_representation(
    ::Type{F}, sign_bit::Bool, exponent_field::Integer, significand_field::Integer
) where {F<:IEEEFloat}
    T = uinttype(F)
    ret::T = sign_bit
    ret <<= exponent_bits(F)
    ret |= exponent_field
    ret <<= significand_bits(F)
    ret |= significand_field
end

# ±floatmax(T)
function ieee754_representation(
    ::Type{F}, sign_bit::Bool, ::Val{:omega}
) where {F<:IEEEFloat}
    ieee754_representation(F, sign_bit, exponent_raw_max(F) - 1, significand_mask(F))
end

# NaN or an infinity
function ieee754_representation(
    ::Type{F}, sign_bit::Bool, significand_field::Integer, ::Val{:nan}
) where {F<:IEEEFloat}
    ieee754_representation(F, sign_bit, exponent_raw_max(F), significand_field)
end

# NaN with default payload
function ieee754_representation(
    ::Type{F}, sign_bit::Bool, ::Val{:nan}
) where {F<:IEEEFloat}
    ieee754_representation(F, sign_bit, one(uinttype(F)) << (significand_bits(F) - 1), Val(:nan))
end

# Infinity
function ieee754_representation(
    ::Type{F}, sign_bit::Bool, ::Val{:inf}
) where {F<:IEEEFloat}
    ieee754_representation(F, sign_bit, false, Val(:nan))
end

# Subnormal or zero
function ieee754_representation(
    ::Type{F}, sign_bit::Bool, significand_field::Integer, ::Val{:subnormal}
) where {F<:IEEEFloat}
    ieee754_representation(F, sign_bit, false, significand_field)
end

# Zero
function ieee754_representation(
    ::Type{F}, sign_bit::Bool, ::Val{:zero}
) where {F<:IEEEFloat}
    ieee754_representation(F, sign_bit, false, Val(:subnormal))
end

"""
    uabs(x::Integer)

Return the absolute value of `x`, possibly returning a different type should the
operation be susceptible to overflow. This typically arises when `x` is a two's complement
signed integer, so that `abs(typemin(x)) == typemin(x) < 0`, in which case the result of
`uabs(x)` will be an unsigned integer of the same size.
"""
uabs(x::Integer) = abs(x)
uabs(x::BitSigned) = unsigned(abs(x))

## conversions to floating-point ##

# TODO: deprecate in 2.0
Float16(x::Integer) = convert(Float16, convert(Float32, x)::Float32)

for t1 in (Float16, Float32, Float64)
    for st in (Int8, Int16, Int32, Int64)
        @eval begin
            (::Type{$t1})(x::($st)) = sitofp($t1, x)
            promote_rule(::Type{$t1}, ::Type{$st}) = $t1
        end
    end
    for ut in (Bool, UInt8, UInt16, UInt32, UInt64)
        @eval begin
            (::Type{$t1})(x::($ut)) = uitofp($t1, x)
            promote_rule(::Type{$t1}, ::Type{$ut}) = $t1
        end
    end
end

Bool(x::Real) = x==0 ? false : x==1 ? true : throw(InexactError(:Bool, Bool, x))

promote_rule(::Type{Float64}, ::Type{UInt128}) = Float64
promote_rule(::Type{Float64}, ::Type{Int128}) = Float64
promote_rule(::Type{Float32}, ::Type{UInt128}) = Float32
promote_rule(::Type{Float32}, ::Type{Int128}) = Float32
promote_rule(::Type{Float16}, ::Type{UInt128}) = Float16
promote_rule(::Type{Float16}, ::Type{Int128}) = Float16

function Float64(x::UInt128)
    if x < UInt128(1) << 104 # Can fit it in two 52 bits mantissas
        low_exp = 0x1p52
        high_exp = 0x1p104
        low_bits = (x % UInt64) & Base.significand_mask(Float64)
        low_value = reinterpret(Float64, reinterpret(UInt64, low_exp) | low_bits) - low_exp
        high_bits = ((x >> 52) % UInt64)
        high_value = reinterpret(Float64, reinterpret(UInt64, high_exp) | high_bits) - high_exp
        low_value + high_value
    else # Large enough that low bits only affect rounding, pack low bits
        low_exp = 0x1p76
        high_exp = 0x1p128
        low_bits = ((x >> 12) % UInt64) >> 12 | (x % UInt64) & 0xFFFFFF
        low_value = reinterpret(Float64, reinterpret(UInt64, low_exp) | low_bits) - low_exp
        high_bits = ((x >> 76) % UInt64)
        high_value = reinterpret(Float64, reinterpret(UInt64, high_exp) | high_bits) - high_exp
        low_value + high_value
    end
end

function Float64(x::Int128)
    sign_bit = ((x >> 127) % UInt64) << 63
    ux = uabs(x)
    if ux < UInt128(1) << 104 # Can fit it in two 52 bits mantissas
        low_exp = 0x1p52
        high_exp = 0x1p104
        low_bits = (ux % UInt64) & Base.significand_mask(Float64)
        low_value = reinterpret(Float64, reinterpret(UInt64, low_exp) | low_bits) - low_exp
        high_bits = ((ux >> 52) % UInt64)
        high_value = reinterpret(Float64, reinterpret(UInt64, high_exp) | high_bits) - high_exp
        reinterpret(Float64, sign_bit | reinterpret(UInt64, low_value + high_value))
    else # Large enough that low bits only affect rounding, pack low bits
        low_exp = 0x1p76
        high_exp = 0x1p128
        low_bits = ((ux >> 12) % UInt64) >> 12 | (ux % UInt64) & 0xFFFFFF
        low_value = reinterpret(Float64, reinterpret(UInt64, low_exp) | low_bits) - low_exp
        high_bits = ((ux >> 76) % UInt64)
        high_value = reinterpret(Float64, reinterpret(UInt64, high_exp) | high_bits) - high_exp
        reinterpret(Float64, sign_bit | reinterpret(UInt64, low_value + high_value))
    end
end

function Float32(x::UInt128)
    x == 0 && return 0f0
    n = top_set_bit(x) # ndigits0z(x,2)
    if n <= 24
        y = ((x % UInt32) << (24-n)) & 0x007f_ffff
    else
        y = ((x >> (n-25)) % UInt32) & 0x00ff_ffff # keep 1 extra bit
        y = (y+one(UInt32))>>1 # round, ties up (extra leading bit in case of next exponent)
        y &= ~UInt32(trailing_zeros(x) == (n-25)) # fix last bit to round to even
    end
    d = ((n+126) % UInt32) << 23
    reinterpret(Float32, d + y)
end

function Float32(x::Int128)
    x == 0 && return 0f0
    s = ((x >>> 96) % UInt32) & 0x8000_0000 # sign bit
    x = abs(x) % UInt128
    n = top_set_bit(x) # ndigits0z(x,2)
    if n <= 24
        y = ((x % UInt32) << (24-n)) & 0x007f_ffff
    else
        y = ((x >> (n-25)) % UInt32) & 0x00ff_ffff # keep 1 extra bit
        y = (y+one(UInt32))>>1 # round, ties up (extra leading bit in case of next exponent)
        y &= ~UInt32(trailing_zeros(x) == (n-25)) # fix last bit to round to even
    end
    d = ((n+126) % UInt32) << 23
    reinterpret(Float32, s | d + y)
end

# TODO: optimize
Float16(x::UInt128) = convert(Float16, Float64(x))
Float16(x::Int128)  = convert(Float16, Float64(x))

Float16(x::Float32) = fptrunc(Float16, x)
Float16(x::Float64) = fptrunc(Float16, x)
Float32(x::Float64) = fptrunc(Float32, x)

Float32(x::Float16) = fpext(Float32, x)
Float64(x::Float32) = fpext(Float64, x)
Float64(x::Float16) = fpext(Float64, x)

AbstractFloat(x::Bool)    = Float64(x)
AbstractFloat(x::Int8)    = Float64(x)
AbstractFloat(x::Int16)   = Float64(x)
AbstractFloat(x::Int32)   = Float64(x)
AbstractFloat(x::Int64)   = Float64(x) # LOSSY
AbstractFloat(x::Int128)  = Float64(x) # LOSSY
AbstractFloat(x::UInt8)   = Float64(x)
AbstractFloat(x::UInt16)  = Float64(x)
AbstractFloat(x::UInt32)  = Float64(x)
AbstractFloat(x::UInt64)  = Float64(x) # LOSSY
AbstractFloat(x::UInt128) = Float64(x) # LOSSY

Bool(x::Float16) = x==0 ? false : x==1 ? true : throw(InexactError(:Bool, Bool, x))

"""
    float(x)

Convert a number or array to a floating point data type.

See also: [`complex`](@ref), [`oftype`](@ref), [`convert`](@ref).

# Examples
```jldoctest
julia> float(1:1000)
1.0:1.0:1000.0

julia> float(typemax(Int32))
2.147483647e9
```
"""
float(x) = AbstractFloat(x)

"""
    float(T::Type)

Return an appropriate type to represent a value of type `T` as a floating point value.
Equivalent to `typeof(float(zero(T)))`.

# Examples
```jldoctest
julia> float(Complex{Int})
ComplexF64 (alias for Complex{Float64})

julia> float(Int)
Float64
```
"""
float(::Type{T}) where {T<:Number} = typeof(float(zero(T)))
float(::Type{T}) where {T<:AbstractFloat} = T
float(::Type{Union{}}, slurp...) = Union{}(0.0)

"""
    unsafe_trunc(T, x)

Return the nearest integral value of type `T` whose absolute value is
less than or equal to the absolute value of `x`. If the value is not representable by `T`,
an arbitrary value will be returned.
See also [`trunc`](@ref).

# Examples
```jldoctest
julia> unsafe_trunc(Int, -2.2)
-2

julia> unsafe_trunc(Int, NaN)
-9223372036854775808
```
"""
function unsafe_trunc end

for Ti in (Int8, Int16, Int32, Int64)
    @eval begin
        unsafe_trunc(::Type{$Ti}, x::IEEEFloat) = fptosi($Ti, x)
    end
end
for Ti in (UInt8, UInt16, UInt32, UInt64)
    @eval begin
        unsafe_trunc(::Type{$Ti}, x::IEEEFloat) = fptoui($Ti, x)
    end
end

function unsafe_trunc(::Type{UInt128}, x::Float64)
    xu = reinterpret(UInt64,x)
    k = Int(xu >> 52) & 0x07ff - 1075
    xu = (xu & 0x000f_ffff_ffff_ffff) | 0x0010_0000_0000_0000
    if k <= 0
        UInt128(xu >> -k)
    else
        UInt128(xu) << k
    end
end
function unsafe_trunc(::Type{Int128}, x::Float64)
    copysign(unsafe_trunc(UInt128,x) % Int128, x)
end

function unsafe_trunc(::Type{UInt128}, x::Float32)
    xu = reinterpret(UInt32,x)
    k = Int(xu >> 23) & 0x00ff - 150
    xu = (xu & 0x007f_ffff) | 0x0080_0000
    if k <= 0
        UInt128(xu >> -k)
    else
        UInt128(xu) << k
    end
end
function unsafe_trunc(::Type{Int128}, x::Float32)
    copysign(unsafe_trunc(UInt128,x) % Int128, x)
end

unsafe_trunc(::Type{UInt128}, x::Float16) = unsafe_trunc(UInt128, Float32(x))
unsafe_trunc(::Type{Int128}, x::Float16) = unsafe_trunc(Int128, Float32(x))

# matches convert methods
# also determines trunc, floor, ceil
round(::Type{Signed},   x::IEEEFloat, r::RoundingMode) = round(Int, x, r)
round(::Type{Unsigned}, x::IEEEFloat, r::RoundingMode) = round(UInt, x, r)
round(::Type{Integer},  x::IEEEFloat, r::RoundingMode) = round(Int, x, r)

round(x::IEEEFloat, ::RoundingMode{:ToZero})  = trunc_llvm(x)
round(x::IEEEFloat, ::RoundingMode{:Down})    = floor_llvm(x)
round(x::IEEEFloat, ::RoundingMode{:Up})      = ceil_llvm(x)
round(x::IEEEFloat, ::RoundingMode{:Nearest}) = rint_llvm(x)

## floating point promotions ##
promote_rule(::Type{Float32}, ::Type{Float16}) = Float32
promote_rule(::Type{Float64}, ::Type{Float16}) = Float64
promote_rule(::Type{Float64}, ::Type{Float32}) = Float64

widen(::Type{Float16}) = Float32
widen(::Type{Float32}) = Float64

## floating point arithmetic ##
-(x::IEEEFloat) = neg_float(x)

+(x::T, y::T) where {T<:IEEEFloat} = add_float(x, y)
-(x::T, y::T) where {T<:IEEEFloat} = sub_float(x, y)
*(x::T, y::T) where {T<:IEEEFloat} = mul_float(x, y)
/(x::T, y::T) where {T<:IEEEFloat} = div_float(x, y)

muladd(x::T, y::T, z::T) where {T<:IEEEFloat} = muladd_float(x, y, z)

# TODO: faster floating point div?
# TODO: faster floating point fld?
# TODO: faster floating point mod?

function unbiased_exponent(x::T) where {T<:IEEEFloat}
    return (reinterpret(Unsigned, x) & exponent_mask(T)) >> significand_bits(T)
end

function explicit_mantissa_noinfnan(x::T) where {T<:IEEEFloat}
    m = mantissa(x)
    issubnormal(x) || (m |= significand_mask(T) + uinttype(T)(1))
    return m
end

function _to_float(number::U, ep) where {U<:Unsigned}
    F = floattype(U)
    S = signed(U)
    epint = unsafe_trunc(S,ep)
    lz::signed(U) = unsafe_trunc(S, Core.Intrinsics.ctlz_int(number) - U(exponent_bits(F)))
    number <<= lz
    epint -= lz
    bits = U(0)
    if epint >= 0
        bits = number & significand_mask(F)
        bits |= ((epint + S(1)) << significand_bits(F)) & exponent_mask(F)
    else
        bits = (number >> -epint) & significand_mask(F)
    end
    return reinterpret(F, bits)
end

@assume_effects :terminates_locally :nothrow function rem_internal(x::T, y::T) where {T<:IEEEFloat}
    xuint = reinterpret(Unsigned, x)
    yuint = reinterpret(Unsigned, y)
    if xuint <= yuint
        if xuint < yuint
            return x
        end
        return zero(T)
    end

    e_x = unbiased_exponent(x)
    e_y = unbiased_exponent(y)
    # Most common case where |y| is "very normal" and |x/y| < 2^EXPONENT_WIDTH
    if e_y > (significand_bits(T)) && (e_x - e_y) <= (exponent_bits(T))
        m_x = explicit_mantissa_noinfnan(x)
        m_y = explicit_mantissa_noinfnan(y)
        d = urem_int((m_x << (e_x - e_y)),  m_y)
        iszero(d) && return zero(T)
        return _to_float(d, e_y - uinttype(T)(1))
    end
    # Both are subnormals
    if e_x == 0 && e_y == 0
        return reinterpret(T, urem_int(xuint, yuint) & significand_mask(T))
    end

    m_x = explicit_mantissa_noinfnan(x)
    e_x -= uinttype(T)(1)
    m_y = explicit_mantissa_noinfnan(y)
    lz_m_y = uinttype(T)(exponent_bits(T))
    if e_y > 0
        e_y -= uinttype(T)(1)
    else
        m_y = mantissa(y)
        lz_m_y = Core.Intrinsics.ctlz_int(m_y)
    end

    tz_m_y = Core.Intrinsics.cttz_int(m_y)
    sides_zeroes_cnt = lz_m_y + tz_m_y

    # n>0
    exp_diff = e_x - e_y
    # Shift hy right until the end or n = 0
    right_shift = min(exp_diff, tz_m_y)
    m_y >>= right_shift
    exp_diff -= right_shift
    e_y += right_shift
    # Shift hx left until the end or n = 0
    left_shift = min(exp_diff, uinttype(T)(exponent_bits(T)))
    m_x <<= left_shift
    exp_diff -= left_shift

    m_x = urem_int(m_x, m_y)
    iszero(m_x) && return zero(T)
    iszero(exp_diff) && return _to_float(m_x, e_y)

    while exp_diff > sides_zeroes_cnt
        exp_diff -= sides_zeroes_cnt
        m_x <<= sides_zeroes_cnt
        m_x = urem_int(m_x, m_y)
    end
    m_x <<= exp_diff
    m_x = urem_int(m_x, m_y)
    return _to_float(m_x, e_y)
end

function rem(x::T, y::T) where {T<:IEEEFloat}
    if isfinite(x) && !iszero(x) && isfinite(y) && !iszero(y)
        return copysign(rem_internal(abs(x), abs(y)), x)
    elseif isinf(x) || isnan(y) || iszero(y)  # y can still be Inf
        return T(NaN)
    else
        return x
    end
end

function mod(x::T, y::T) where {T<:AbstractFloat}
    r = rem(x,y)
    if r == 0
        copysign(r,y)
    elseif (r > 0) ⊻ (y > 0)
        r+y
    else
        r
    end
end

## floating point comparisons ##
==(x::T, y::T) where {T<:IEEEFloat} = eq_float(x, y)
!=(x::T, y::T) where {T<:IEEEFloat} = ne_float(x, y)
<( x::T, y::T) where {T<:IEEEFloat} = lt_float(x, y)
<=(x::T, y::T) where {T<:IEEEFloat} = le_float(x, y)

isequal(x::T, y::T) where {T<:IEEEFloat} = fpiseq(x, y)

# interpret as sign-magnitude integer
@inline function _fpint(x)
    IntT = inttype(typeof(x))
    ix = reinterpret(IntT, x)
    return ifelse(ix < zero(IntT), ix ⊻ typemax(IntT), ix)
end

@inline function isless(a::T, b::T) where T<:IEEEFloat
    (isnan(a) || isnan(b)) && return !isnan(a)

    return _fpint(a) < _fpint(b)
end

# Exact Float (Tf) vs Integer (Ti) comparisons
# Assumes:
# - typemax(Ti) == 2^n-1
# - typemax(Ti) can't be exactly represented by Tf:
#   => Tf(typemax(Ti)) == 2^n or Inf
# - typemin(Ti) can be exactly represented by Tf
#
# 1. convert y::Ti to float fy::Tf
# 2. perform Tf comparison x vs fy
# 3. if x == fy, check if (1) resulted in rounding:
#  a. convert fy back to Ti and compare with original y
#  b. unsafe_convert undefined behaviour if fy == Tf(typemax(Ti))
#     (but consequently x == fy > y)
for Ti in (Int64,UInt64,Int128,UInt128)
    for Tf in (Float32,Float64)
        @eval begin
            function ==(x::$Tf, y::$Ti)
                fy = ($Tf)(y)
                (x == fy) & (fy != $(Tf(typemax(Ti)))) & (y == unsafe_trunc($Ti,fy))
            end
            ==(y::$Ti, x::$Tf) = x==y

            function <(x::$Ti, y::$Tf)
                fx = ($Tf)(x)
                (fx < y) | ((fx == y) & ((fx == $(Tf(typemax(Ti)))) | (x < unsafe_trunc($Ti,fx)) ))
            end
            function <=(x::$Ti, y::$Tf)
                fx = ($Tf)(x)
                (fx < y) | ((fx == y) & ((fx == $(Tf(typemax(Ti)))) | (x <= unsafe_trunc($Ti,fx)) ))
            end

            function <(x::$Tf, y::$Ti)
                fy = ($Tf)(y)
                (x < fy) | ((x == fy) & (fy < $(Tf(typemax(Ti)))) & (unsafe_trunc($Ti,fy) < y))
            end
            function <=(x::$Tf, y::$Ti)
                fy = ($Tf)(y)
                (x < fy) | ((x == fy) & (fy < $(Tf(typemax(Ti)))) & (unsafe_trunc($Ti,fy) <= y))
            end
        end
    end
end
for op in (:(==), :<, :<=)
    @eval begin
        ($op)(x::Float16, y::Union{Int128,UInt128,Int64,UInt64}) = ($op)(Float64(x), Float64(y))
        ($op)(x::Union{Int128,UInt128,Int64,UInt64}, y::Float16) = ($op)(Float64(x), Float64(y))

        ($op)(x::Union{Float16,Float32}, y::Union{Int32,UInt32}) = ($op)(Float64(x), Float64(y))
        ($op)(x::Union{Int32,UInt32}, y::Union{Float16,Float32}) = ($op)(Float64(x), Float64(y))

        ($op)(x::Float16, y::Union{Int16,UInt16}) = ($op)(Float32(x), Float32(y))
        ($op)(x::Union{Int16,UInt16}, y::Float16) = ($op)(Float32(x), Float32(y))
    end
end


abs(x::IEEEFloat) = abs_float(x)

"""
    isnan(f) -> Bool

Test whether a number value is a NaN, an indeterminate value which is neither an infinity
nor a finite number ("not a number").

See also: [`iszero`](@ref), [`isone`](@ref), [`isinf`](@ref), [`ismissing`](@ref).
"""
isnan(x::AbstractFloat) = (x != x)::Bool
isnan(x::Number) = false

isfinite(x::AbstractFloat) = !isnan(x - x)
isfinite(x::Real) = decompose(x)[3] != 0
isfinite(x::Integer) = true

"""
    isinf(f) -> Bool

Test whether a number is infinite.

See also: [`Inf`](@ref), [`iszero`](@ref), [`isfinite`](@ref), [`isnan`](@ref).
"""
isinf(x::Real) = !isnan(x) & !isfinite(x)
isinf(x::IEEEFloat) = abs(x) === oftype(x, Inf)

const hx_NaN = hash_uint64(reinterpret(UInt64, NaN))
function hash(x::Float64, h::UInt)
    # see comments on trunc and hash(Real, UInt)
    if typemin(Int64) <= x < typemax(Int64)
        xi = fptosi(Int64, x)
        if isequal(xi, x)
            return hash(xi, h)
        end
    elseif typemin(UInt64) <= x < typemax(UInt64)
        xu = fptoui(UInt64, x)
        if isequal(xu, x)
            return hash(xu, h)
        end
    elseif isnan(x)
        return hx_NaN ⊻ h # NaN does not have a stable bit pattern
    end
    return hash_uint64(bitcast(UInt64, x)) - 3h
end

hash(x::Float32, h::UInt) = hash(Float64(x), h)

function hash(x::Float16, h::UInt)
    # see comments on trunc and hash(Real, UInt)
    if isfinite(x) # all finite Float16 fit in Int64
        xi = fptosi(Int64, x)
        if isequal(xi, x)
            return hash(xi, h)
        end
    elseif isnan(x)
        return hx_NaN ⊻ h # NaN does not have a stable bit pattern
    end
    return hash_uint64(bitcast(UInt64, Float64(x))) - 3h
end

## generic hashing for rational values ##
function hash(x::Real, h::UInt)
    # decompose x as num*2^pow/den
    num, pow, den = decompose(x)

    # handle special values
    num == 0 && den == 0 && return hash(NaN, h)
    num == 0 && return hash(ifelse(den > 0, 0.0, -0.0), h)
    den == 0 && return hash(ifelse(num > 0, Inf, -Inf), h)

    # normalize decomposition
    if den < 0
        num = -num
        den = -den
    end
    num_z = trailing_zeros(num)
    num >>= num_z
    den_z = trailing_zeros(den)
    den >>= den_z
    pow += num_z - den_z
    # If the real can be represented as an Int64, UInt64, or Float64, hash as those types.
    # To be an Integer the denominator must be 1 and the power must be non-negative.
    if den == 1
        # left = ceil(log2(num*2^pow))
        left = top_set_bit(abs(num)) + pow
        # 2^-1074 is the minimum Float64 so if the power is smaller, not a Float64
        if -1074 <= pow
            if 0 <= pow # if pow is non-negative, it is an integer
                left <= 63 && return hash(Int64(num) << Int(pow), h)
                left <= 64 && !signbit(num) && return hash(UInt64(num) << Int(pow), h)
            end # typemin(Int64) handled by Float64 case
            # 2^1024 is the maximum Float64 so if the power is greater, not a Float64
            # Float64s only have 53 mantisa bits (including implicit bit)
            left <= 1024 && left - pow <= 53 && return hash(ldexp(Float64(num), pow), h)
        end
    else
        h = hash_integer(den, h)
    end
    # handle generic rational values
    h = hash_integer(pow, h)
    h = hash_integer(num, h)
    return h
end

#=
`decompose(x)`: non-canonical decomposition of rational values as `num*2^pow/den`.

The decompose function is the point where rational-valued numeric types that support
hashing hook into the hashing protocol. `decompose(x)` should return three integer
values `num, pow, den`, such that the value of `x` is mathematically equal to

    num*2^pow/den

The decomposition need not be canonical in the sense that it just needs to be *some*
way to express `x` in this form, not any particular way – with the restriction that
`num` and `den` may not share any odd common factors. They may, however, have powers
of two in common – the generic hashing code will normalize those as necessary.

Special values:

 - `x` is zero: `num` should be zero and `den` should have the same sign as `x`
 - `x` is infinite: `den` should be zero and `num` should have the same sign as `x`
 - `x` is not a number: `num` and `den` should both be zero
=#

decompose(x::Integer) = x, 0, 1

function decompose(x::Float16)::NTuple{3,Int}
    isnan(x) && return 0, 0, 0
    isinf(x) && return ifelse(x < 0, -1, 1), 0, 0
    n = reinterpret(UInt16, x)
    s = (n & 0x03ff) % Int16
    e = ((n & 0x7c00) >> 10) % Int
    s |= Int16(e != 0) << 10
    d = ifelse(signbit(x), -1, 1)
    s, e - 25 + (e == 0), d
end

function decompose(x::Float32)::NTuple{3,Int}
    isnan(x) && return 0, 0, 0
    isinf(x) && return ifelse(x < 0, -1, 1), 0, 0
    n = reinterpret(UInt32, x)
    s = (n & 0x007fffff) % Int32
    e = ((n & 0x7f800000) >> 23) % Int
    s |= Int32(e != 0) << 23
    d = ifelse(signbit(x), -1, 1)
    s, e - 150 + (e == 0), d
end

function decompose(x::Float64)::Tuple{Int64, Int, Int}
    isnan(x) && return 0, 0, 0
    isinf(x) && return ifelse(x < 0, -1, 1), 0, 0
    n = reinterpret(UInt64, x)
    s = (n & 0x000fffffffffffff) % Int64
    e = ((n & 0x7ff0000000000000) >> 52) % Int
    s |= Int64(e != 0) << 52
    d = ifelse(signbit(x), -1, 1)
    s, e - 1075 + (e == 0), d
end


"""
    precision(num::AbstractFloat; base::Integer=2)
    precision(T::Type; base::Integer=2)

Get the precision of a floating point number, as defined by the effective number of bits in
the significand, or the precision of a floating-point type `T` (its current default, if
`T` is a variable-precision type like [`BigFloat`](@ref)).

If `base` is specified, then it returns the maximum corresponding
number of significand digits in that base.

!!! compat "Julia 1.8"
    The `base` keyword requires at least Julia 1.8.
"""
function precision end

_precision(::Type{Float16}) = 11
_precision(::Type{Float32}) = 24
_precision(::Type{Float64}) = 53
function _precision(x, base::Integer=2)
    base > 1 || throw(DomainError(base, "`base` cannot be less than 2."))
    p = _precision(x)
    return base == 2 ? Int(p) : floor(Int, p / log2(base))
end
precision(::Type{T}; base::Integer=2) where {T<:AbstractFloat} = _precision(T, base)
precision(::T; base::Integer=2) where {T<:AbstractFloat} = precision(T; base)


"""
    nextfloat(x::AbstractFloat, n::Integer)

The result of `n` iterative applications of `nextfloat` to `x` if `n >= 0`, or `-n`
applications of [`prevfloat`](@ref) if `n < 0`.
"""
function nextfloat(f::IEEEFloat, d::Integer)
    F = typeof(f)
    fumax = reinterpret(Unsigned, F(Inf))
    U = typeof(fumax)

    isnan(f) && return f
    fi = reinterpret(Signed, f)
    fneg = fi < 0
    fu = unsigned(fi & typemax(fi))

    dneg = d < 0
    da = uabs(d)
    if da > typemax(U)
        fneg = dneg
        fu = fumax
    else
        du = da % U
        if fneg ⊻ dneg
            if du > fu
                fu = min(fumax, du - fu)
                fneg = !fneg
            else
                fu = fu - du
            end
        else
            if fumax - fu < du
                fu = fumax
            else
                fu = fu + du
            end
        end
    end
    if fneg
        fu |= sign_mask(F)
    end
    reinterpret(F, fu)
end

"""
    nextfloat(x::AbstractFloat)

Return the smallest floating point number `y` of the same type as `x` such `x < y`. If no
such `y` exists (e.g. if `x` is `Inf` or `NaN`), then return `x`.

See also: [`prevfloat`](@ref), [`eps`](@ref), [`issubnormal`](@ref).
"""
nextfloat(x::AbstractFloat) = nextfloat(x,1)

"""
    prevfloat(x::AbstractFloat, n::Integer)

The result of `n` iterative applications of `prevfloat` to `x` if `n >= 0`, or `-n`
applications of [`nextfloat`](@ref) if `n < 0`.
"""
prevfloat(x::AbstractFloat, d::Integer) = nextfloat(x, -d)

"""
    prevfloat(x::AbstractFloat)

Return the largest floating point number `y` of the same type as `x` such `y < x`. If no
such `y` exists (e.g. if `x` is `-Inf` or `NaN`), then return `x`.
"""
prevfloat(x::AbstractFloat) = nextfloat(x,-1)

for Ti in (Int8, Int16, Int32, Int64, Int128, UInt8, UInt16, UInt32, UInt64, UInt128)
    for Tf in (Float16, Float32, Float64)
        if Ti <: Unsigned || sizeof(Ti) < sizeof(Tf)
            # Here `Tf(typemin(Ti))-1` is exact, so we can compare the lower-bound
            # directly. `Tf(typemax(Ti))+1` is either always exactly representable, or
            # rounded to `Inf` (e.g. when `Ti==UInt128 && Tf==Float32`).
            @eval begin
                function round(::Type{$Ti},x::$Tf,::RoundingMode{:ToZero})
                    if $(Tf(typemin(Ti))-one(Tf)) < x < $(Tf(typemax(Ti))+one(Tf))
                        return unsafe_trunc($Ti,x)
                    else
                        throw(InexactError(:round, $Ti, x, RoundToZero))
                    end
                end
                function (::Type{$Ti})(x::$Tf)
                    # When typemax(Ti) is not representable by Tf but typemax(Ti) + 1 is,
                    # then < Tf(typemax(Ti) + 1) is stricter than <= Tf(typemax(Ti)). Using
                    # the former causes us to throw on UInt64(Float64(typemax(UInt64))+1)
                    if ($(Tf(typemin(Ti))) <= x < $(Tf(typemax(Ti))+one(Tf))) && isinteger(x)
                        return unsafe_trunc($Ti,x)
                    else
                        throw(InexactError($(Expr(:quote,Ti.name.name)), $Ti, x))
                    end
                end
            end
        else
            # Here `eps(Tf(typemin(Ti))) > 1`, so the only value which can be truncated to
            # `Tf(typemin(Ti)` is itself. Similarly, `Tf(typemax(Ti))` is inexact and will
            # be rounded up. This assumes that `Tf(typemin(Ti)) > -Inf`, which is true for
            # these types, but not for `Float16` or larger integer types.
            @eval begin
                function round(::Type{$Ti},x::$Tf,::RoundingMode{:ToZero})
                    if $(Tf(typemin(Ti))) <= x < $(Tf(typemax(Ti)))
                        return unsafe_trunc($Ti,x)
                    else
                        throw(InexactError(:round, $Ti, x, RoundToZero))
                    end
                end
                function (::Type{$Ti})(x::$Tf)
                    if ($(Tf(typemin(Ti))) <= x < $(Tf(typemax(Ti)))) && isinteger(x)
                        return unsafe_trunc($Ti,x)
                    else
                        throw(InexactError($(Expr(:quote,Ti.name.name)), $Ti, x))
                    end
                end
            end
        end
    end
end

"""
    issubnormal(f) -> Bool

Test whether a floating point number is subnormal.

An IEEE floating point number is [subnormal](https://en.wikipedia.org/wiki/Subnormal_number)
when its exponent bits are zero and its significand is not zero.

# Examples
```jldoctest
julia> floatmin(Float32)
1.1754944f-38

julia> issubnormal(1.0f-37)
false

julia> issubnormal(1.0f-38)
true
```
"""
function issubnormal(x::T) where {T<:IEEEFloat}
    y = reinterpret(Unsigned, x)
    (y & exponent_mask(T) == 0) & (y & significand_mask(T) != 0)
end

ispow2(x::AbstractFloat) = !iszero(x) && frexp(x)[1] == 0.5
iseven(x::AbstractFloat) = isinteger(x) && (abs(x) > maxintfloat(x) || iseven(Integer(x)))
isodd(x::AbstractFloat) = isinteger(x) && abs(x) ≤ maxintfloat(x) && isodd(Integer(x))

@eval begin
    typemin(::Type{Float16}) = $(bitcast(Float16, 0xfc00))
    typemax(::Type{Float16}) = $(Inf16)
    typemin(::Type{Float32}) = $(-Inf32)
    typemax(::Type{Float32}) = $(Inf32)
    typemin(::Type{Float64}) = $(-Inf64)
    typemax(::Type{Float64}) = $(Inf64)
    typemin(x::T) where {T<:Real} = typemin(T)
    typemax(x::T) where {T<:Real} = typemax(T)

    floatmin(::Type{Float16}) = $(bitcast(Float16, 0x0400))
    floatmin(::Type{Float32}) = $(bitcast(Float32, 0x00800000))
    floatmin(::Type{Float64}) = $(bitcast(Float64, 0x0010000000000000))
    floatmax(::Type{Float16}) = $(bitcast(Float16, 0x7bff))
    floatmax(::Type{Float32}) = $(bitcast(Float32, 0x7f7fffff))
    floatmax(::Type{Float64}) = $(bitcast(Float64, 0x7fefffffffffffff))

    eps(::Type{Float16}) = $(bitcast(Float16, 0x1400))
    eps(::Type{Float32}) = $(bitcast(Float32, 0x34000000))
    eps(::Type{Float64}) = $(bitcast(Float64, 0x3cb0000000000000))
    eps() = eps(Float64)
end

eps(x::AbstractFloat) = isfinite(x) ? abs(x) >= floatmin(x) ? ldexp(eps(typeof(x)), exponent(x)) : nextfloat(zero(x)) : oftype(x, NaN)

function eps(x::T) where T<:IEEEFloat
    # For isfinite(x), toggling the LSB will produce either prevfloat(x) or
    # nextfloat(x) but will never change the sign or exponent.
    # For !isfinite(x), this will map Inf to NaN and NaN to NaN or Inf.
    y = reinterpret(T, reinterpret(Unsigned, x) ⊻ true)
    # The absolute difference between these values is eps(x). This is true even
    # for Inf/NaN values.
    return abs(x - y)
end

"""
    floatmin(T = Float64)

Return the smallest positive normal number representable by the floating-point
type `T`.

# Examples
```jldoctest
julia> floatmin(Float16)
Float16(6.104e-5)

julia> floatmin(Float32)
1.1754944f-38

julia> floatmin()
2.2250738585072014e-308
```
"""
floatmin(x::T) where {T<:AbstractFloat} = floatmin(T)

"""
    floatmax(T = Float64)

Return the largest finite number representable by the floating-point type `T`.

See also: [`typemax`](@ref), [`floatmin`](@ref), [`eps`](@ref).

# Examples
```jldoctest
julia> floatmax(Float16)
Float16(6.55e4)

julia> floatmax(Float32)
3.4028235f38

julia> floatmax()
1.7976931348623157e308

julia> typemax(Float64)
Inf
```
"""
floatmax(x::T) where {T<:AbstractFloat} = floatmax(T)

floatmin() = floatmin(Float64)
floatmax() = floatmax(Float64)

"""
    eps(::Type{T}) where T<:AbstractFloat
    eps()

Return the *machine epsilon* of the floating point type `T` (`T = Float64` by
default). This is defined as the gap between 1 and the next largest value representable by
`typeof(one(T))`, and is equivalent to `eps(one(T))`.  (Since `eps(T)` is a
bound on the *relative error* of `T`, it is a "dimensionless" quantity like [`one`](@ref).)

# Examples
```jldoctest
julia> eps()
2.220446049250313e-16

julia> eps(Float32)
1.1920929f-7

julia> 1.0 + eps()
1.0000000000000002

julia> 1.0 + eps()/2
1.0
```
"""
eps(::Type{<:AbstractFloat})

"""
    eps(x::AbstractFloat)

Return the *unit in last place* (ulp) of `x`. This is the distance between consecutive
representable floating point values at `x`. In most cases, if the distance on either side
of `x` is different, then the larger of the two is taken, that is

    eps(x) == max(x-prevfloat(x), nextfloat(x)-x)

The exceptions to this rule are the smallest and largest finite values
(e.g. `nextfloat(-Inf)` and `prevfloat(Inf)` for [`Float64`](@ref)), which round to the
smaller of the values.

The rationale for this behavior is that `eps` bounds the floating point rounding
error. Under the default `RoundNearest` rounding mode, if ``y`` is a real number and ``x``
is the nearest floating point number to ``y``, then

```math
|y-x| \\leq \\operatorname{eps}(x)/2.
```

See also: [`nextfloat`](@ref), [`issubnormal`](@ref), [`floatmax`](@ref).

# Examples
```jldoctest
julia> eps(1.0)
2.220446049250313e-16

julia> eps(prevfloat(2.0))
2.220446049250313e-16

julia> eps(2.0)
4.440892098500626e-16

julia> x = prevfloat(Inf)      # largest finite Float64
1.7976931348623157e308

julia> x + eps(x)/2            # rounds up
Inf

julia> x + prevfloat(eps(x)/2) # rounds down
1.7976931348623157e308
```
"""
eps(::AbstractFloat)


## byte order swaps for arbitrary-endianness serialization/deserialization ##
bswap(x::IEEEFloat) = bswap_int(x)

# integer size of float
uinttype(::Type{Float64}) = UInt64
uinttype(::Type{Float32}) = UInt32
uinttype(::Type{Float16}) = UInt16
inttype(::Type{Float64}) = Int64
inttype(::Type{Float32}) = Int32
inttype(::Type{Float16}) = Int16
# float size of integer
floattype(::Type{UInt64}) = Float64
floattype(::Type{UInt32}) = Float32
floattype(::Type{UInt16}) = Float16
floattype(::Type{Int64}) = Float64
floattype(::Type{Int32}) = Float32
floattype(::Type{Int16}) = Float16


## Array operations on floating point numbers ##

float(A::AbstractArray{<:AbstractFloat}) = A

function float(A::AbstractArray{T}) where T
    if !isconcretetype(T)
        error("`float` not defined on abstractly-typed arrays; please convert to a more specific type")
    end
    convert(AbstractArray{typeof(float(zero(T)))}, A)
end

float(r::StepRange) = float(r.start):float(r.step):float(last(r))
float(r::UnitRange) = float(r.start):float(last(r))
float(r::StepRangeLen{T}) where {T} =
    StepRangeLen{typeof(float(T(r.ref)))}(float(r.ref), float(r.step), length(r), r.offset)
function float(r::LinRange)
    LinRange(float(r.start), float(r.stop), length(r))
end
back to top