--- old/src/share/vm/runtime/orderAccess.hpp 2015-02-19 22:27:00.974867269 -0500 +++ new/src/share/vm/runtime/orderAccess.hpp 2015-02-19 22:26:59.518784854 -0500 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -29,11 +29,7 @@ // Memory Access Ordering Model // -// This interface is based on the JSR-133 Cookbook for Compiler Writers -// and on the IA64 memory model. It is the dynamic equivalent of the -// C/C++ volatile specifier. I.e., volatility restricts compile-time -// memory access reordering in a way similar to what we want to occur -// at runtime. +// This interface is based on the JSR-133 Cookbook for Compiler Writers. // // In the following, the terms 'previous', 'subsequent', 'before', // 'after', 'preceding' and 'succeeding' refer to program order. The @@ -41,7 +37,6 @@ // relative to program order, while 'up' and 'above' refer to backward // motion. // -// // We define four primitive memory barrier operations. // // LoadLoad: Load1(s); LoadLoad; Load2 @@ -69,86 +64,88 @@ // operations. Stores before Store1 may *not* float below Load2 and any // subsequent load operations. // +// We define two further barriers: acquire and release. // -// We define two further operations, 'release' and 'acquire'. They are -// mirror images of each other. +// Conceptually, acquire/release semantics form unidirectional and +// asynchronous barriers w.r.t. a synchronizing load(X) and store(X) pair. +// They should always be used in pairs to publish (release store) and +// access (load acquire) some implicitly understood shared data between +// threads in a relatively cheap fashion not requiring storeload. If not +// used in such a pair, it is advised to use a membar instead: +// acquire/release only make sense as pairs. +// +// T1: access_shared_data +// T1: ]release +// T1: (...) +// T1: store(X) +// +// T2: load(X) +// T2: (...) +// T2: acquire[ +// T2: access_shared_data +// +// It is guaranteed that if T2: load(X) synchronizes with (observes the +// value written by) T1: store(X), then the memory accesses before the T1: +// ]release happen before the memory accesses after the T2: acquire[. +// +// Total Store Order (TSO) machines can be seen as machines issuing a +// release store for each store and a load acquire for each load. Therefore +// there is an inherent resemblence between TSO and acquire/release +// semantics. TSO can be seen as an abstract machine where loads are +// executed immediately when encountered (hence loadload reordering not +// happening) but enqueues stores in a FIFO queue +// for asynchronous serialization (neither storestore or loadstore +// reordering happening). The only reordering happening is storeload due to +// the queue asynchronously serializing stores (yet in order). +// +// Acquire/release semantics essentially exploits this asynchronicity: when +// the load(X) acquire[ observes the store of ]release store(X), the +// accesses before the release must have happened before the accesses after +// acquire. +// +// The API offers both stand-alone acquire() and release() as well as bound +// load_acquire() and release_store(). It is guaranteed that these are +// semantically equivalent w.r.t. the defined model. However, since +// stand-alone acquire()/release() does not know which previous +// load/subsequent store is considered the synchronizing load/store, they +// may be more conservative in implementations. We advise using the bound +// variants whenever possible. +// +// Finally, we define a "fence" operation, as a bidirectional barrier. +// It guarantees that any memory access preceding the fence is not +// reordered w.r.t. any memory accesses subsequent to the fence in program +// order. This may be used to prevent sequences of loads from floating up +// above sequences of stores. +// +// The following table shows the implementations on some architectures: +// +// Constraint x86 sparc TSO ppc +// --------------------------------------------------------------------------- +// fence LoadStore | lock membar #StoreLoad sync +// StoreStore | addl 0,(sp) +// LoadLoad | +// StoreLoad +// +// release LoadStore | lwsync +// StoreStore +// +// acquire LoadLoad | lwsync +// LoadStore +// +// release_store lwsync +// +// +// release_store_fence xchg lwsync +// membar #StoreLoad +// sync // -// Execution by a processor of release makes the effect of all memory -// accesses issued by it previous to the release visible to all -// processors *before* the release completes. The effect of subsequent -// memory accesses issued by it *may* be made visible *before* the -// release. I.e., subsequent memory accesses may float above the -// release, but prior ones may not float below it. -// -// Execution by a processor of acquire makes the effect of all memory -// accesses issued by it subsequent to the acquire visible to all -// processors *after* the acquire completes. The effect of prior memory -// accesses issued by it *may* be made visible *after* the acquire. -// I.e., prior memory accesses may float below the acquire, but -// subsequent ones may not float above it. -// -// Finally, we define a 'fence' operation, which conceptually is a -// release combined with an acquire. In the real world these operations -// require one or more machine instructions which can float above and -// below the release or acquire, so we usually can't just issue the -// release-acquire back-to-back. All machines we know of implement some -// sort of memory fence instruction. -// -// -// The standalone implementations of release and acquire need an associated -// dummy volatile store or load respectively. To avoid redundant operations, -// we can define the composite operators: 'release_store', 'store_fence' and -// 'load_acquire'. Here's a summary of the machine instructions corresponding -// to each operation. -// -// sparc RMO ia64 x86 -// --------------------------------------------------------------------- -// fence membar #LoadStore | mf lock addl 0,(sp) -// #StoreStore | -// #LoadLoad | -// #StoreLoad -// -// release membar #LoadStore | st.rel [sp]=r0 movl $0, -// #StoreStore -// st %g0,[] -// -// acquire ld [%sp],%g0 ld.acq =[sp] movl (sp), -// membar #LoadLoad | -// #LoadStore -// -// release_store membar #LoadStore | st.rel -// #StoreStore -// st -// -// store_fence st st lock xchg -// fence mf -// -// load_acquire ld ld.acq -// membar #LoadLoad | -// #LoadStore -// -// Using only release_store and load_acquire, we can implement the -// following ordered sequences. -// -// 1. load, load == load_acquire, load -// or load_acquire, load_acquire -// 2. load, store == load, release_store -// or load_acquire, store -// or load_acquire, release_store -// 3. store, store == store, release_store -// or release_store, release_store // -// These require no membar instructions for sparc-TSO and no extra -// instructions for ia64. +// load_acquire +// lwsync // -// Ordering a load relative to preceding stores requires a store_fence, +// Ordering a load relative to preceding stores requires a StoreLoad, // which implies a membar #StoreLoad between the store and load under -// sparc-TSO. A fence is required by ia64. On x86, we use locked xchg. -// -// 4. store, load == store_fence, load -// -// Use store_fence to make sure all stores done in an 'interesting' -// region are made visible prior to both subsequent loads and stores. +// sparc-TSO. On x86, we use explicitly locked add. // // Conventional usage is to issue a load_acquire for ordered loads. Use // release_store for ordered stores when you care only that prior stores @@ -157,27 +154,19 @@ // release_store_fence to update values like the thread state, where we // don't want the current thread to continue until all our prior memory // accesses (including the new thread state) are visible to other threads. +// This is equivalent to the volatile semantics of the Java Memory Model. // +// C++ Volatile Semantics // -// C++ Volatility -// -// C++ guarantees ordering at operations termed 'sequence points' (defined -// to be volatile accesses and calls to library I/O functions). 'Side -// effects' (defined as volatile accesses, calls to library I/O functions -// and object modification) previous to a sequence point must be visible -// at that sequence point. See the C++ standard, section 1.9, titled -// "Program Execution". This means that all barrier implementations, -// including standalone loadload, storestore, loadstore, storeload, acquire -// and release must include a sequence point, usually via a volatile memory -// access. Other ways to guarantee a sequence point are, e.g., use of -// indirect calls and linux's __asm__ volatile. -// Note: as of 6973570, we have replaced the originally static "dummy" field -// (see above) by a volatile store to the stack. All of the versions of the -// compilers that we currently use (SunStudio, gcc and VC++) respect the -// semantics of volatile here. If you build HotSpot using other -// compilers, you may need to verify that no compiler reordering occurs -// across the sequence point represented by the volatile access. -// +// C++ volatile semantics prevent compiler re-ordering between +// volatile memory accesses. However, reordering between non-volatile +// and volatile memory accesses is in general undefined. For compiler +// reordering constraints taking non-volatile memory accesses into +// consideration, a compiler barrier has to be used instead. Some +// compiler implementations may choose to enforce additional +// constraints beyond those required by the language. Note also that +// both volatile semantics and compiler barrier do not prevent +// hardware reordering. // // os::is_MP Considered Redundant // @@ -240,8 +229,32 @@ // order. If their implementations change such that these assumptions // are violated, a whole lot of code will break. +enum ScopedFenceType { + X_ACQUIRE + , RELEASE_X + , RELEASE_X_FENCE +}; + +template +class ScopedFenceGeneral: public StackObj { + public: + void prefix() {} + void postfix() {} +}; + +template +class ScopedFence : public ScopedFenceGeneral { + void *const _field; + public: + ScopedFence(void *const field) : _field(field) { prefix(); } + ~ScopedFence() { postfix(); } + void prefix() { ScopedFenceGeneral::prefix(); } + void postfix() { ScopedFenceGeneral::postfix(); } +}; + class OrderAccess : AllStatic { public: + // barriers static void loadload(); static void storestore(); static void loadstore(); @@ -280,20 +293,6 @@ static void release_store_ptr(volatile intptr_t* p, intptr_t v); static void release_store_ptr(volatile void* p, void* v); - static void store_fence(jbyte* p, jbyte v); - static void store_fence(jshort* p, jshort v); - static void store_fence(jint* p, jint v); - static void store_fence(jlong* p, jlong v); - static void store_fence(jubyte* p, jubyte v); - static void store_fence(jushort* p, jushort v); - static void store_fence(juint* p, juint v); - static void store_fence(julong* p, julong v); - static void store_fence(jfloat* p, jfloat v); - static void store_fence(jdouble* p, jdouble v); - - static void store_ptr_fence(intptr_t* p, intptr_t v); - static void store_ptr_fence(void** p, void* v); - static void release_store_fence(volatile jbyte* p, jbyte v); static void release_store_fence(volatile jshort* p, jshort v); static void release_store_fence(volatile jint* p, jint v); @@ -313,6 +312,47 @@ // routine if it exists, It should only be used by platforms that // don't have another way to do the inline assembly. static void StubRoutines_fence(); + + // Give platforms a variation point to specialize. + template static T specialized_load_acquire (volatile T* p ); + template static void specialized_release_store (volatile T* p, T v); + template static void specialized_release_store_fence(volatile T* p, T v); + + template + static void ordered_store(volatile FieldType* p, FieldType v); + + template + static FieldType ordered_load(volatile FieldType* p); + + static void store(volatile jbyte* p, jbyte v); + static void store(volatile jshort* p, jshort v); + static void store(volatile jint* p, jint v); + static void store(volatile jlong* p, jlong v); + static void store(volatile jdouble* p, jdouble v); + static void store(volatile jfloat* p, jfloat v); + + static jbyte load (volatile jbyte* p); + static jshort load (volatile jshort* p); + static jint load (volatile jint* p); + static jlong load (volatile jlong* p); + static jdouble load (volatile jdouble* p); + static jfloat load (volatile jfloat* p); + + // The following store_fence methods are deprecated and will be removed + // when all repos conform to the new generalized OrderAccess. + static void store_fence(jbyte* p, jbyte v); + static void store_fence(jshort* p, jshort v); + static void store_fence(jint* p, jint v); + static void store_fence(jlong* p, jlong v); + static void store_fence(jubyte* p, jubyte v); + static void store_fence(jushort* p, jushort v); + static void store_fence(juint* p, juint v); + static void store_fence(julong* p, julong v); + static void store_fence(jfloat* p, jfloat v); + static void store_fence(jdouble* p, jdouble v); + + static void store_ptr_fence(intptr_t* p, intptr_t v); + static void store_ptr_fence(void** p, void* v); }; #endif // SHARE_VM_RUNTIME_ORDERACCESS_HPP