// This file is a part of Julia. License is MIT: https://julialang.org/license

// Meant to be included in <julia.h>
#ifndef JULIA_THREADS_H
#define JULIA_THREADS_H

// threading ------------------------------------------------------------------

// WARNING: Threading support is incomplete and experimental
// Nonetheless, we define JL_THREAD and use it to give advance notice to
// maintainers of what eventual threading support will change.

// JULIA_ENABLE_THREADING is switched on in Make.inc if JULIA_THREADS is
// set (in Make.user)

#if defined(__i386__) && defined(__GNUC__) && !defined(__SSE2__)
#  error Julia can only be built for architectures above Pentium 4. Pass -march=pentium4, or set MARCH=pentium4 and ensure that -march is not passed separately with an older architecture.
#endif
#ifdef _COMPILER_MICROSOFT_
#  include <intrin.h>
#  include <type_traits>
#endif
#if defined(_CPU_X86_64_) || defined(_CPU_X86_)
#  include <immintrin.h>
#endif
#ifndef _OS_WINDOWS_
#  include <pthread.h>
#endif
#include <signal.h>

typedef struct {
    jl_taggedvalue_t *freelist;   // root of list of free objects
    jl_taggedvalue_t *newpages;   // root of list of chunks of free objects
    uint16_t osize;               // size of objects in this pool
} jl_gc_pool_t;

typedef struct {
    // variable for tracking weak references
    arraylist_t weak_refs;

    // variables for tracking malloc'd arrays
    struct _mallocarray_t *mallocarrays;
    struct _mallocarray_t *mafreelist;

    // variables for tracking big objects
    struct _bigval_t *big_objects;

    // variables for tracking "remembered set"
    arraylist_t rem_bindings;
    arraylist_t _remset[2]; // contains jl_value_t*
    // lower bound of the number of pointers inside remembered values
    int remset_nptr;
    arraylist_t *remset;
    arraylist_t *last_remset;

    // variables for allocating objects from pools
#ifdef _P64
#  define JL_GC_N_POOLS 41
#elif defined(_CPU_ARM_) || defined(_CPU_PPC_) || defined(_CPU_X86_)
#  define JL_GC_N_POOLS 42
#else
#  define JL_GC_N_POOLS 43
#endif
    jl_gc_pool_t norm_pools[JL_GC_N_POOLS];
} jl_thread_heap_t;

// Cache of thread local change to global metadata during GC
// This is sync'd after marking.
typedef struct {
    // thread local increment of `perm_scanned_bytes`
    size_t perm_scanned_bytes;
    // thread local increment of `scanned_bytes`
    size_t scanned_bytes;
    // Number of queued big objects (<= 1024)
    size_t nbig_obj;
    // Array of queued big objects to be moved between the young list
    // and the old list.
    // A set low bit means that the object should be moved from the old list
    // to the young list (`mark_reset_age`).
    // Objects can only be put into this list when the mark bit is flipped to
    // `1` (atomically). Combined with the sync after marking,
    // this makes sure that a single object can only appear once in
    // the lists (the mark bit cannot be flipped to `0` without sweeping).
    void *big_obj[1024];
} jl_gc_mark_cache_t;
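// Example (sketch, not part of the original header): the low-bit tag on
// `big_obj` entries described above can be manipulated with plain pointer
// arithmetic; the helper names here are hypothetical.
#if 0
static void *example_tag_reset_age(void *obj) // queue an old -> young move
{
    return (void*)((uintptr_t)obj | 1);
}
static void *example_untag(void *obj)         // recover the real pointer
{
    return (void*)((uintptr_t)obj & ~(uintptr_t)1);
}
static int example_needs_reset_age(void *obj) // was the low bit set?
{
    return (int)((uintptr_t)obj & 1);
}
#endif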
// This includes all the thread local states we care about for a thread.
#define JL_MAX_BT_SIZE 80000
typedef struct _jl_tls_states_t {
    struct _jl_gcframe_t *pgcstack;
    size_t world_age;
    struct _jl_value_t *exception_in_transit;
    volatile size_t *safepoint;
    // Whether it is safe to execute GC at the same time.
#define JL_GC_STATE_WAITING 1
    // gc_state = 1 means the thread is doing GC or is waiting for the GC to
    // finish.
#define JL_GC_STATE_SAFE 2
    // gc_state = 2 means the thread is running unmanaged code that can be
    // executed at the same time as the GC.
    volatile int8_t gc_state;
    volatile int8_t in_finalizer;
    int8_t disable_gc;
    volatile sig_atomic_t defer_signal;
    struct _jl_module_t *current_module;
    struct _jl_task_t *volatile current_task;
    struct _jl_task_t *root_task;
    void *stackbase;
    char *stack_lo;
    char *stack_hi;
    jl_jmp_buf *volatile jmp_target;
    jl_jmp_buf base_ctx; // base context of stack
    jl_jmp_buf *safe_restore;
    int16_t tid;
    size_t bt_size; // JL_MAX_BT_SIZE + 1 elements long
    uintptr_t *bt_data;
    // Atomically set by the sender, reset by the handler.
    volatile sig_atomic_t signal_request;
    // Allow the sigint to be raised asynchronously
    // this is limited to the few places we do synchronous IO
    // we can make this more general (similar to defer_signal) if necessary
    volatile sig_atomic_t io_wait;
    jl_thread_heap_t heap;
#ifndef _OS_WINDOWS_
    // These are only used on unix now
    pthread_t system_id;
    void *signal_stack;
#endif
    // Execution of certain impure
    // statements is prohibited from certain
    // callbacks (such as generated functions)
    // as it may make compilation undecidable
    int in_pure_callback;
    // Counter to disable finalizers **on the current thread**
    int finalizers_inhibited;
    arraylist_t finalizers;
    jl_gc_mark_cache_t gc_cache;
} jl_tls_states_t;
typedef jl_tls_states_t *jl_ptls_t;

// Update codegen version in `ccall.cpp` after changing either `pause` or `wake`
#ifdef __MIC__
#  define jl_cpu_pause() _mm_delay_64(100)
#  define jl_cpu_wake() ((void)0)
#  define JL_CPU_WAKE_NOOP 1
#elif defined(_CPU_X86_64_) || defined(_CPU_X86_)  /* !__MIC__ */
#  define jl_cpu_pause() _mm_pause()
#  define jl_cpu_wake() ((void)0)
#  define JL_CPU_WAKE_NOOP 1
#elif defined(_CPU_AARCH64_) || (defined(_CPU_ARM_) && __ARM_ARCH >= 7)
#  define jl_cpu_pause() __asm__ volatile ("wfe" ::: "memory")
#  define jl_cpu_wake() __asm__ volatile ("sev" ::: "memory")
#  define JL_CPU_WAKE_NOOP 0
#else
#  define jl_cpu_pause() ((void)0)
#  define jl_cpu_wake() ((void)0)
#  define JL_CPU_WAKE_NOOP 1
#endif

// Copied from libuv. Add `JL_CONST_FUNC` so that the compiler
// can optimize this better.
static inline unsigned long JL_CONST_FUNC jl_thread_self(void)
{
#ifdef _OS_WINDOWS_
    return (unsigned long)GetCurrentThreadId();
#else
    return (unsigned long)pthread_self();
#endif
}
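// Example (sketch, not part of the original header): jl_cpu_pause() and
// jl_cpu_wake() are meant for spin-wait loops like the one in jl_mutex_wait
// below; `example_flag` is hypothetical.
#if 0
static volatile int example_flag;
static void example_spin_until_set(void)
{
    while (!example_flag)
        jl_cpu_pause(); // yield pipeline resources while spinning
}
static void example_set_and_wake(void)
{
    example_flag = 1;
    jl_cpu_wake();      // wake threads parked in `wfe` on ARM
}
#endif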
/**
 * Thread synchronization primitives:
 *
 * These roughly follow the C11/C++11 memory model and act as memory
 * barriers at both the compiler level and the hardware level.
 * The only exception is the GC safepoint and GC state transitions, for which
 * we use only a compiler (signal) barrier and use the signal handler to do the
 * synchronization in order to lower the mutator overhead as much as possible.
 *
 * We use the compiler intrinsics to implement a similar API to the C11/C++11
 * one instead of using it directly because:
 *
 *     1. We support GCC 4.7, and GCC added support for C11 atomics in 4.9.
 *        Luckily, the __atomic intrinsics were added in GCC 4.7.
 *     2. (most importantly) we need interoperability between code written
 *        in different languages.
 *        The current C++ standard (C++14) does not allow using C11 atomic
 *        functions or types, and there's currently no guarantee that the two
 *        types are compatible (although most of them probably are).
 *        We also need to access these atomic variables from the LLVM JIT code,
 *        which is very hard unless the layout of the object is fully
 *        specified.
 */
#if defined(__GNUC__)
#  define jl_signal_fence() __atomic_signal_fence(__ATOMIC_SEQ_CST)
#  define jl_atomic_fetch_add_relaxed(obj, arg)         \
    __atomic_fetch_add(obj, arg, __ATOMIC_RELAXED)
#  define jl_atomic_fetch_add(obj, arg)                 \
    __atomic_fetch_add(obj, arg, __ATOMIC_SEQ_CST)
#  define jl_atomic_fetch_and_relaxed(obj, arg)         \
    __atomic_fetch_and(obj, arg, __ATOMIC_RELAXED)
#  define jl_atomic_fetch_and(obj, arg)                 \
    __atomic_fetch_and(obj, arg, __ATOMIC_SEQ_CST)
#  define jl_atomic_fetch_or_relaxed(obj, arg)          \
    __atomic_fetch_or(obj, arg, __ATOMIC_RELAXED)
#  define jl_atomic_fetch_or(obj, arg)                  \
    __atomic_fetch_or(obj, arg, __ATOMIC_SEQ_CST)
// Returns the original value of `obj`.
// Use the legacy __sync builtins for now; this can also be written using
// the __atomic builtins or C11 atomics with the GNU extension or C11 _Generic
#  define jl_atomic_compare_exchange(obj, expected, desired)    \
    __sync_val_compare_and_swap(obj, expected, desired)
#  define jl_atomic_exchange(obj, desired)              \
    __atomic_exchange_n(obj, desired, __ATOMIC_SEQ_CST)
#  define jl_atomic_exchange_relaxed(obj, desired)      \
    __atomic_exchange_n(obj, desired, __ATOMIC_RELAXED)
// TODO: Maybe add jl_atomic_compare_exchange_weak for spin lock
#  define jl_atomic_store(obj, val)                     \
    __atomic_store_n(obj, val, __ATOMIC_SEQ_CST)
#  if defined(__clang__) || defined(__ICC) || defined(__INTEL_COMPILER) || \
    !(defined(_CPU_X86_) || defined(_CPU_X86_64_))
// ICC and Clang don't have this bug...
#    define jl_atomic_store_release(obj, val)           \
    __atomic_store_n(obj, val, __ATOMIC_RELEASE)
#  else
// Work around a GCC bug when using store with release order by using the
// stronger version instead.
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67458
#    define jl_atomic_store_release(obj, val) do {      \
        jl_signal_fence();                              \
        __atomic_store_n(obj, val, __ATOMIC_RELEASE);   \
    } while (0)
#  endif
#  define jl_atomic_load(obj)                   \
    __atomic_load_n(obj, __ATOMIC_SEQ_CST)
#  define jl_atomic_load_acquire(obj)           \
    __atomic_load_n(obj, __ATOMIC_ACQUIRE)
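// Example (sketch, not part of the original header): since
// jl_atomic_compare_exchange returns the value of `obj` *before* the swap,
// success is detected by comparing that value against the expected one;
// `example_owner` is hypothetical (cf. jl_mutex_wait below).
#if 0
static volatile unsigned long example_owner;
static int example_try_acquire(unsigned long self)
{
    // succeeds iff `example_owner` was 0 at the time of the swap
    return jl_atomic_compare_exchange(&example_owner, 0, self) == 0;
}
#endif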
#elif defined(_COMPILER_MICROSOFT_)
#  define jl_signal_fence() _ReadWriteBarrier()

// add
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 1, T>::type
jl_atomic_fetch_add(T *obj, T2 arg)
{
    return (T)_InterlockedExchangeAdd8((volatile char*)obj, (char)arg);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 2, T>::type
jl_atomic_fetch_add(T *obj, T2 arg)
{
    return (T)_InterlockedExchangeAdd16((volatile short*)obj, (short)arg);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 4, T>::type
jl_atomic_fetch_add(T *obj, T2 arg)
{
    return (T)_InterlockedExchangeAdd((volatile LONG*)obj, (LONG)arg);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 8, T>::type
jl_atomic_fetch_add(T *obj, T2 arg)
{
    return (T)_InterlockedExchangeAdd64((volatile __int64*)obj, (__int64)arg);
}
#define jl_atomic_fetch_add_relaxed(obj, arg) jl_atomic_fetch_add(obj, arg)

// and
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 1, T>::type
jl_atomic_fetch_and(T *obj, T2 arg)
{
    return (T)_InterlockedAnd8((volatile char*)obj, (char)arg);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 2, T>::type
jl_atomic_fetch_and(T *obj, T2 arg)
{
    return (T)_InterlockedAnd16((volatile short*)obj, (short)arg);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 4, T>::type
jl_atomic_fetch_and(T *obj, T2 arg)
{
    return (T)_InterlockedAnd((volatile LONG*)obj, (LONG)arg);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 8, T>::type
jl_atomic_fetch_and(T *obj, T2 arg)
{
    return (T)_InterlockedAnd64((volatile __int64*)obj, (__int64)arg);
}
#define jl_atomic_fetch_and_relaxed(obj, arg) jl_atomic_fetch_and(obj, arg)

// or
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 1, T>::type
jl_atomic_fetch_or(T *obj, T2 arg)
{
    return (T)_InterlockedOr8((volatile char*)obj, (char)arg);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 2, T>::type
jl_atomic_fetch_or(T *obj, T2 arg)
{
    return (T)_InterlockedOr16((volatile short*)obj, (short)arg);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 4, T>::type
jl_atomic_fetch_or(T *obj, T2 arg)
{
    return (T)_InterlockedOr((volatile LONG*)obj, (LONG)arg);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 8, T>::type
jl_atomic_fetch_or(T *obj, T2 arg)
{
    return (T)_InterlockedOr64((volatile __int64*)obj, (__int64)arg);
}
#define jl_atomic_fetch_or_relaxed(obj, arg) jl_atomic_fetch_or(obj, arg)

// Returns the original value of `obj`
template<typename T, typename T2, typename T3>
static inline typename std::enable_if<sizeof(T) == 1, T>::type
jl_atomic_compare_exchange(volatile T *obj, T2 expected, T3 desired)
{
    return (T)_InterlockedCompareExchange8((volatile char*)obj,
                                           (char)desired, (char)expected);
}
template<typename T, typename T2, typename T3>
static inline typename std::enable_if<sizeof(T) == 2, T>::type
jl_atomic_compare_exchange(volatile T *obj, T2 expected, T3 desired)
{
    return (T)_InterlockedCompareExchange16((volatile short*)obj,
                                            (short)desired, (short)expected);
}
template<typename T, typename T2, typename T3>
static inline typename std::enable_if<sizeof(T) == 4, T>::type
jl_atomic_compare_exchange(volatile T *obj, T2 expected, T3 desired)
{
    return (T)_InterlockedCompareExchange((volatile LONG*)obj,
                                          (LONG)desired, (LONG)expected);
}
template<typename T, typename T2, typename T3>
static inline typename std::enable_if<sizeof(T) == 8, T>::type
jl_atomic_compare_exchange(volatile T *obj, T2 expected, T3 desired)
{
    return (T)_InterlockedCompareExchange64((volatile __int64*)obj,
                                            (__int64)desired, (__int64)expected);
}

// atomic exchange
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 1, T>::type
jl_atomic_exchange(volatile T *obj, T2 val)
{
    return _InterlockedExchange8((volatile char*)obj, (char)val);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 2, T>::type
jl_atomic_exchange(volatile T *obj, T2 val)
{
    return _InterlockedExchange16((volatile short*)obj, (short)val);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 4, T>::type
jl_atomic_exchange(volatile T *obj, T2 val)
{
    return _InterlockedExchange((volatile LONG*)obj, (LONG)val);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 8, T>::type
jl_atomic_exchange(volatile T *obj, T2 val)
{
    return _InterlockedExchange64((volatile __int64*)obj, (__int64)val);
}
#define jl_atomic_exchange_relaxed(obj, val) jl_atomic_exchange(obj, val)

// atomic stores
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 1, void>::type
jl_atomic_store(volatile T *obj, T2 val)
{
    _InterlockedExchange8((volatile char*)obj, (char)val);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 2, void>::type
jl_atomic_store(volatile T *obj, T2 val)
{
    _InterlockedExchange16((volatile short*)obj, (short)val);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 4, void>::type
jl_atomic_store(volatile T *obj, T2 val)
{
    _InterlockedExchange((volatile LONG*)obj, (LONG)val);
}
template<typename T, typename T2>
static inline typename std::enable_if<sizeof(T) == 8, void>::type
jl_atomic_store(volatile T *obj, T2 val)
{
    _InterlockedExchange64((volatile __int64*)obj, (__int64)val);
}
template<typename T, typename T2>
static inline void jl_atomic_store_release(volatile T *obj, T2 val)
{
    jl_signal_fence();
    *obj = (T)val;
}

// atomic loads
template<typename T>
static inline T jl_atomic_load(volatile T *obj)
{
    // Trick to generate cheaper instructions compared to `_InterlockedOr`
    // Note that we don't care whether the exchange succeeded or not...
    return jl_atomic_compare_exchange(obj, T(0), T(0));
}
template<typename T>
static inline T jl_atomic_load_acquire(volatile T *obj)
{
    T val = *obj;
    jl_signal_fence();
    return val;
}
#else
#  error "No atomic operations supported."
#endif
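// Example (sketch, not part of the original header): the release/acquire pair
// above implements the usual message-passing idiom; `example_data` and
// `example_ready` are hypothetical.
#if 0
static int example_data;
static volatile int example_ready;
static void example_publish(void)
{
    example_data = 42;
    jl_atomic_store_release(&example_ready, 1); // earlier writes become visible
}
static int example_consume(void)
{
    if (jl_atomic_load_acquire(&example_ready))
        return example_data; // guaranteed to observe 42
    return -1;
}
#endif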
#ifdef __cplusplus
extern "C" {
#endif

JL_DLLEXPORT int16_t jl_threadid(void);
JL_DLLEXPORT void *jl_threadgroup(void);
JL_DLLEXPORT void jl_threading_profile(void);
JL_DLLEXPORT void (jl_cpu_pause)(void);
JL_DLLEXPORT void (jl_cpu_wake)(void);

// Accessing the tls variables, gc safepoint and gc states
JL_DLLEXPORT JL_CONST_FUNC jl_ptls_t (jl_get_ptls_states)(void);

// This triggers a SegFault when we are in GC.
// Assign it to a variable to make sure the compiler emits the load
// and to avoid a Clang warning for -Wunused-volatile-lvalue.
#define jl_gc_safepoint_(ptls) do {                     \
        jl_signal_fence();                              \
        size_t safepoint_load = *ptls->safepoint;       \
        jl_signal_fence();                              \
        (void)safepoint_load;                           \
    } while (0)
#define jl_sigint_safepoint(ptls) do {                  \
        jl_signal_fence();                              \
        size_t safepoint_load = ptls->safepoint[-1];    \
        jl_signal_fence();                              \
        (void)safepoint_load;                           \
    } while (0)
#ifndef JULIA_ENABLE_THREADING
extern JL_DLLEXPORT jl_tls_states_t jl_tls_states;
#define jl_get_ptls_states() (&jl_tls_states)
#define jl_gc_state(ptls) ((int8_t)0)
STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state,
                                     int8_t old_state)
{
    (void)ptls;
    (void)state;
    return old_state;
}
#else // ifndef JULIA_ENABLE_THREADING
typedef jl_ptls_t (*jl_get_ptls_states_func)(void);
#if !defined(_OS_DARWIN_) && !defined(_OS_WINDOWS_)
JL_DLLEXPORT void jl_set_ptls_states_getter(jl_get_ptls_states_func f);
#endif
// Make sure jl_gc_state() is always an rvalue
#define jl_gc_state(ptls) ((int8_t)ptls->gc_state)
STATIC_INLINE int8_t jl_gc_state_set(jl_ptls_t ptls, int8_t state,
                                     int8_t old_state)
{
    ptls->gc_state = state;
    // A safepoint is required if we transition from a GC-safe region to a
    // non GC-safe region.
    if (old_state && !state)
        jl_gc_safepoint_(ptls);
    return old_state;
}
#endif // ifndef JULIA_ENABLE_THREADING
STATIC_INLINE int8_t jl_gc_state_save_and_set(jl_ptls_t ptls,
                                              int8_t state)
{
    return jl_gc_state_set(ptls, state, jl_gc_state(ptls));
}
#define jl_gc_unsafe_enter(ptls) jl_gc_state_save_and_set(ptls, 0)
#define jl_gc_unsafe_leave(ptls, state) ((void)jl_gc_state_set(ptls, (state), 0))
#define jl_gc_safe_enter(ptls) jl_gc_state_save_and_set(ptls, JL_GC_STATE_SAFE)
#define jl_gc_safe_leave(ptls, state) ((void)jl_gc_state_set(ptls, (state), JL_GC_STATE_SAFE))
JL_DLLEXPORT void (jl_gc_safepoint)(void);

#define JL_SIGATOMIC_BEGIN() do {               \
        jl_get_ptls_states()->defer_signal++;   \
        jl_signal_fence();                      \
    } while (0)
#define JL_SIGATOMIC_END() do {                                 \
        jl_signal_fence();                                      \
        if (--jl_get_ptls_states()->defer_signal == 0) {        \
            jl_sigint_safepoint(jl_get_ptls_states());          \
        }                                                       \
    } while (0)
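// Example (sketch, not part of the original header): a typical pattern for
// blocking calls using the GC-state helpers above; `example_blocking_io` is
// hypothetical.
#if 0
static void example_wait(void)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    int8_t state = jl_gc_safe_enter(ptls); // let the GC run while we block
    example_blocking_io();
    jl_gc_safe_leave(ptls, state);         // hits a safepoint on the way out
}
#endif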
// Recursive spin lock
typedef struct {
    volatile unsigned long owner;
    uint32_t count;
} jl_mutex_t;

JL_DLLEXPORT void jl_gc_enable_finalizers(jl_ptls_t ptls, int on);
static inline void jl_lock_frame_push(jl_mutex_t *lock);
static inline void jl_lock_frame_pop(void);

// JL_LOCK and jl_mutex_lock are GC safepoints while JL_LOCK_NOGC
// and jl_mutex_lock_nogc are not.
// Always use JL_LOCK unless nothing holding the lock can trigger a GC or GC
// safepoint. JL_LOCK_NOGC should only be needed for GC internal locks.
// The JL_LOCK* and JL_UNLOCK* macros are no-ops for a non-threading build,
// while the jl_mutex_* functions always lock and unlock the locks.
static inline void jl_mutex_wait(jl_mutex_t *lock, int safepoint)
{
    unsigned long self = jl_thread_self();
    unsigned long owner = jl_atomic_load_acquire(&lock->owner);
    if (owner == self) {
        lock->count++;
        return;
    }
    while (1) {
        if (owner == 0 &&
            jl_atomic_compare_exchange(&lock->owner, 0, self) == 0) {
            lock->count = 1;
            return;
        }
        if (safepoint) {
            jl_ptls_t ptls = jl_get_ptls_states();
            jl_gc_safepoint_(ptls);
        }
        jl_cpu_pause();
        owner = lock->owner;
    }
}
static inline void jl_mutex_lock_nogc(jl_mutex_t *lock)
{
    jl_mutex_wait(lock, 0);
}
static inline void jl_mutex_lock(jl_mutex_t *lock)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    JL_SIGATOMIC_BEGIN();
    jl_mutex_wait(lock, 1);
    jl_lock_frame_push(lock);
    jl_gc_enable_finalizers(ptls, 0);
}
/* Call this function for code that could be called from either a managed
   or an unmanaged thread */
static inline void jl_mutex_lock_maybe_nogc(jl_mutex_t *lock)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    if (ptls->safepoint) {
        jl_mutex_lock(lock);
    }
    else {
        jl_mutex_lock_nogc(lock);
    }
}
static inline void jl_mutex_unlock_nogc(jl_mutex_t *lock)
{
    assert(lock->owner == jl_thread_self() &&
           "Unlocking a lock in a different thread.");
    if (--lock->count == 0) {
        jl_atomic_store_release(&lock->owner, 0);
        jl_cpu_wake();
    }
}
static inline void jl_mutex_unlock(jl_mutex_t *lock)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    jl_mutex_unlock_nogc(lock);
    jl_gc_enable_finalizers(ptls, 1);
    jl_lock_frame_pop();
    JL_SIGATOMIC_END();
}
static inline void jl_mutex_unlock_maybe_nogc(jl_mutex_t *lock)
{
    jl_ptls_t ptls = jl_get_ptls_states();
    if (ptls->safepoint) {
        jl_mutex_unlock(lock);
    }
    else {
        jl_mutex_unlock_nogc(lock);
    }
}
static inline void jl_mutex_init(jl_mutex_t *lock)
{
    lock->owner = 0;
    lock->count = 0;
}

// Locks
#ifdef JULIA_ENABLE_THREADING
#define JL_MUTEX_INIT(m) jl_mutex_init(m)
#define JL_LOCK(m) jl_mutex_lock(m)
#define JL_UNLOCK(m) jl_mutex_unlock(m)
#define JL_LOCK_NOGC(m) jl_mutex_lock_nogc(m)
#define JL_UNLOCK_NOGC(m) jl_mutex_unlock_nogc(m)
#else // JULIA_ENABLE_THREADING
static inline void jl_mutex_check_type(jl_mutex_t *m)
{
    (void)m;
}
#define JL_MUTEX_INIT(m) jl_mutex_check_type(m)
#define JL_LOCK(m) do {                                         \
        JL_SIGATOMIC_BEGIN();                                   \
        jl_gc_enable_finalizers(jl_get_ptls_states(), 0);       \
        jl_mutex_check_type(m);                                 \
    } while (0)
#define JL_UNLOCK(m) do {                                       \
        jl_gc_enable_finalizers(jl_get_ptls_states(), 1);       \
        jl_mutex_check_type(m);                                 \
        JL_SIGATOMIC_END();                                     \
    } while (0)
#define JL_LOCK_NOGC(m) jl_mutex_check_type(m)
#define JL_UNLOCK_NOGC(m) jl_mutex_check_type(m)
#endif // JULIA_ENABLE_THREADING

#ifdef __cplusplus
}
#endif

#endif
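// Example (sketch, illustrative only): guarding a critical section with the
// macros above; `example_lock` and `example_do_work` are hypothetical.
#if 0
static jl_mutex_t example_lock = {0, 0};
static void example_critical_section(void)
{
    JL_LOCK(&example_lock);   // GC safepoint; disables finalizers
    example_do_work();
    JL_UNLOCK(&example_lock); // re-enables finalizers; checks deferred signals
}
#endif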