Revision c3861f5c7371fea2fc5d20404a7f77c927bbd61b authored by Shuhei Kadowaki on 14 August 2021, 12:49:21 UTC, committed by Shuhei Kadowaki on 23 August 2021, 15:09:13 UTC
Built on top of #41882, this PR further sorts out the constant-prop' interface,
in particular generalizing `force_const_prop` to `const_prop_config` so that
each heuristic can be turned on and off individually.

The main motivation here is that, in #41882, we want to force const-prop' on
`setproperty` even when its return type is already `Const`, so that the
subsequent inlining succeeds, by skipping all the `const_prop_xxx_heuristic` checks.
But I found we still want to apply `const_prop_entry_heuristic` to
`getproperty`, because if we already know a very accurate result for a
`getproperty` call, there is usually no real motivation for constant-prop', e.g.:
```julia
struct FZero end
Base.getproperty(::FZero, ::Symbol) = 0.0
getproperty(FZero(), :val) # const-prop' doesn't need to happen here
```

Now `force_const_prop(...) -> force::Bool` is refactored into
`const_prop_config(...) -> config::UInt8`, which turns each heuristic on or off
based on the value of `config`.
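
For illustration, here is a minimal sketch of how such a bit-flag config could be queried (the flag names and the `is_enabled` helper are hypothetical, not the actual `Core.Compiler` definitions):
```julia
# hypothetical bit flags, one per heuristic
const CONST_PROP_ENTRY_HEURISTIC    = UInt8(1) << 0
const CONST_PROP_ARGUMENT_HEURISTIC = UInt8(1) << 1

# hypothetical helper: is the given heuristic enabled in this config?
is_enabled(config::UInt8, flag::UInt8) = config & flag != 0x00

config = CONST_PROP_ENTRY_HEURISTIC | CONST_PROP_ARGUMENT_HEURISTIC
is_enabled(config, CONST_PROP_ENTRY_HEURISTIC)  # true
```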

I also included another refactoring that inlines `const_prop_rettype_heuristic`
into `const_prop_argument_heuristic`, because they really seem tightly coupled.
1 parent e1e4986
atomics.h
// This file is a part of Julia. License is MIT: https://julialang.org/license

#ifndef JL_ATOMICS_H
#define JL_ATOMICS_H

// Low-level atomic operations

#if defined(__i386__) && defined(__GNUC__) && !defined(__SSE2__)
#  error Julia can only be built for architectures above Pentium 4. Pass -march=pentium4, or set MARCH=pentium4 and ensure that -march is not passed separately with an older architecture.
#endif
#ifdef _COMPILER_MICROSOFT_
#  include <intrin.h>
#  include <type_traits>
#endif
#if defined(_CPU_X86_64_) || defined(_CPU_X86_)
#  include <immintrin.h>
#endif
#ifndef _OS_WINDOWS_
#  include <pthread.h>
#endif
#include <signal.h>

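// Memory orderings understood by the runtime.  The negative and zero values
// (unspecified, invalid, not-atomic) are Julia-specific markers; the remaining
// values follow the LLVM/C11 ordering names (unordered, monotonic, consume,
// acquire, release, acq_rel, seq_cst).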
enum jl_memory_order {
    jl_memory_order_unspecified = -2,
    jl_memory_order_invalid = -1,
    jl_memory_order_notatomic = 0,
    jl_memory_order_unordered,
    jl_memory_order_monotonic,
    jl_memory_order_consume,
    jl_memory_order_acquire,
    jl_memory_order_release,
    jl_memory_order_acq_rel,
    jl_memory_order_seq_cst
};

/**
 * Thread synchronization primitives:
 *
 * These roughly follow the c11/c++11 memory model and act as memory
 * barriers at both the compiler level and the hardware level.
 * The only exceptions are the GC safepoint and GC state transitions, for which
 * we use only a compiler (signal) barrier and use the signal handler to do the
 * synchronization in order to lower the mutator overhead as much as possible.
 *
 * We use the compiler intrinsics to implement a similar API to the c11/c++11
 * one instead of using it directly because we need interoperability between
 * code written in different languages. The current c++ standard (c++14) does
 * not allow using c11 atomic functions or types and there's currently no
 * guarantee that the two types are compatible (although most of them probably
 * are). We also need to access these atomic variables from the LLVM JIT code,
 * which is very hard unless the layout of the object is fully specified.
 */
#define jl_fence() __atomic_thread_fence(__ATOMIC_SEQ_CST)
#define jl_fence_release() __atomic_thread_fence(__ATOMIC_RELEASE)
#define jl_signal_fence() __atomic_signal_fence(__ATOMIC_SEQ_CST)


#  define jl_atomic_fetch_add_relaxed(obj, arg)         \
    __atomic_fetch_add(obj, arg, __ATOMIC_RELAXED)
#  define jl_atomic_fetch_add(obj, arg)                 \
    __atomic_fetch_add(obj, arg, __ATOMIC_SEQ_CST)
#  define jl_atomic_add_fetch(obj, arg)                 \
    __atomic_add_fetch(obj, arg, __ATOMIC_SEQ_CST)
#  define jl_atomic_fetch_and_relaxed(obj, arg)         \
    __atomic_fetch_and(obj, arg, __ATOMIC_RELAXED)
#  define jl_atomic_fetch_and(obj, arg)                 \
    __atomic_fetch_and(obj, arg, __ATOMIC_SEQ_CST)
#  define jl_atomic_fetch_or_relaxed(obj, arg)          \
    __atomic_fetch_or(obj, arg, __ATOMIC_RELAXED)
#  define jl_atomic_fetch_or(obj, arg)                  \
    __atomic_fetch_or(obj, arg, __ATOMIC_SEQ_CST)
#  define jl_atomic_cmpswap(obj, expected, desired)    \
    __atomic_compare_exchange_n(obj, expected, desired, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
#  define jl_atomic_cmpswap_relaxed(obj, expected, desired)    \
    __atomic_compare_exchange_n(obj, expected, desired, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED)
// TODO: Maybe add jl_atomic_cmpswap_weak for spin lock
#  define jl_atomic_exchange(obj, desired)              \
    __atomic_exchange_n(obj, desired, __ATOMIC_SEQ_CST)
#  define jl_atomic_exchange_relaxed(obj, desired)      \
    __atomic_exchange_n(obj, desired, __ATOMIC_RELAXED)
#  define jl_atomic_store(obj, val)                     \
    __atomic_store_n(obj, val, __ATOMIC_SEQ_CST)
#  define jl_atomic_store_relaxed(obj, val)             \
    __atomic_store_n(obj, val, __ATOMIC_RELAXED)
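
// Example (illustrative only; `refcount`, `flag`, and `expected` are
// hypothetical variables, not part of this header):
//
//     static int refcount, flag;
//     jl_atomic_fetch_add_relaxed(&refcount, 1);   // atomic increment, relaxed
//     int expected = 0;
//     if (jl_atomic_cmpswap(&flag, &expected, 1))  // set flag from 0 to 1
//         /* we won the race */;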

#  if defined(__clang__) || defined(__ICC) || defined(__INTEL_COMPILER) || \
    !(defined(_CPU_X86_) || defined(_CPU_X86_64_))
// ICC and Clang don't have this bug...
#    define jl_atomic_store_release(obj, val)           \
    __atomic_store_n(obj, val, __ATOMIC_RELEASE)
#  else
// Work around a GCC bug when using store with release order by using the
// stronger version instead.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67458
// fixed in https://gcc.gnu.org/git/?p=gcc.git&a=commit;h=d8c40eff56f69877b33c697ded756d50fde90c27
#    define jl_atomic_store_release(obj, val) do {      \
        jl_signal_fence();                              \
        __atomic_store_n(obj, val, __ATOMIC_RELEASE);   \
    } while (0)
#  endif
#  define jl_atomic_load(obj)                   \
    __atomic_load_n(obj, __ATOMIC_SEQ_CST)
#  define jl_atomic_load_acquire(obj)           \
    __atomic_load_n(obj, __ATOMIC_ACQUIRE)
#ifdef _COMPILER_TSAN_ENABLED_
// For the sake of tsan, give these loads consume ordering since they will act
// as such on the processors we support, while normally the compiler would
// upgrade this to acquire ordering, which is stronger (and slower) than we want.
#  define jl_atomic_load_relaxed(obj)           \
    __atomic_load_n(obj, __ATOMIC_CONSUME)
#else
#  define jl_atomic_load_relaxed(obj)           \
    __atomic_load_n(obj, __ATOMIC_RELAXED)
#endif
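
// Example (illustrative only; `data`, `ready`, and `use()` are hypothetical):
// publish a value with a release store and observe it with an acquire load.
//
//     // producer thread
//     data = 42;                            // plain store
//     jl_atomic_store_release(&ready, 1);   // orders the store to `data` before it
//
//     // consumer thread
//     if (jl_atomic_load_acquire(&ready))   // pairs with the release store above
//         use(data);                        // guaranteed to observe data == 42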

#ifdef __clang_analyzer__
// for the purposes of the analyzer, we can turn these into non-atomic expressions with similar properties
// (for the sake of the analyzer, we don't care if it is an exact match for behavior)

#undef jl_atomic_exchange
#undef jl_atomic_exchange_relaxed
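// Model exchange as: read the old value, store the new one, and yield the old
// value, so the analyzer sees the same data flow without the atomic builtin.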
#define jl_atomic_exchange(obj, desired) \
    (__extension__({ \
            __typeof__((obj)) p__analyzer__ = (obj); \
            __typeof__(*p__analyzer__) temp__analyzer__ = *p__analyzer__; \
            *p__analyzer__ = (desired); \
            temp__analyzer__; \
        }))
#define jl_atomic_exchange_relaxed jl_atomic_exchange

#undef jl_atomic_cmpswap
#undef jl_atomic_cmpswap_relaxed
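// Model cmpswap as: compare the current value against *expected, store
// `desired` on a match, otherwise write the current value back to *expected.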
#define jl_atomic_cmpswap(obj, expected, desired) \
    (__extension__({ \
            __typeof__((obj)) p__analyzer__ = (obj); \
            __typeof__(*p__analyzer__) temp__analyzer__ = *p__analyzer__; \
            __typeof__((expected)) x__analyzer__ = (expected); \
            if (temp__analyzer__ == *x__analyzer__) \
                *p__analyzer__ = (desired); \
            else \
                *x__analyzer__ = temp__analyzer__; \
            temp__analyzer__ == *x__analyzer__; \
        }))
#define jl_atomic_cmpswap_relaxed jl_atomic_cmpswap

#undef jl_atomic_store
#undef jl_atomic_store_release
#undef jl_atomic_store_relaxed
#define jl_atomic_store(obj, val)         (*(obj) = (val))
#define jl_atomic_store_release jl_atomic_store
#define jl_atomic_store_relaxed jl_atomic_store

#undef jl_atomic_load
#undef jl_atomic_load_acquire
#undef jl_atomic_load_relaxed
#define jl_atomic_load(obj)         (*(obj))
#define jl_atomic_load_acquire jl_atomic_load
#define jl_atomic_load_relaxed jl_atomic_load

#endif


#endif // JL_ATOMICS_H