--- old/src/cpu/x86/vm/assembler_x86.hpp	2014-08-29 17:54:26.421179150 +0400
+++ new/src/cpu/x86/vm/assembler_x86.hpp	2014-08-29 17:54:26.313176026 +0400
@@ -26,6 +26,7 @@
 #define CPU_X86_VM_ASSEMBLER_X86_HPP
 
 #include "asm/register.hpp"
+#include "vm_version_x86.hpp"
 
 class BiasedLockingCounters;
 
@@ -1283,14 +1284,34 @@
       if (order_constraint & StoreLoad) {
         // All usable chips support "locked" instructions which suffice
         // as barriers, and are much faster than the alternative of
-        // using cpuid instruction. We use here a locked add [esp],0.
+        // using cpuid instruction. We use here a locked add [esp-C],0.
         // This is conveniently otherwise a no-op except for blowing
-        // flags.
+        // flags, and introducing a false dependency on target memory
+        // location. We can't do anything with flags, but we can avoid
+        // memory dependencies in the current method by locked-adding
+        // somewhere else on the stack. Doing [esp+C] will collide with
+        // something on stack in current method, hence we go for [esp-C].
+        // It is convenient since it is almost always in data cache, for
+        // any small C. We need to step back from SP to avoid data
+        // dependencies with other things below SP (callee-saves, for
+        // example). Without a clear way to figure out the minimal safe
+        // distance from SP, it makes sense to step back the complete
+        // cache line, as this will also avoid possible second-order effects
+        // with locked ops against the cache line. Our choice of offset
+        // is bounded by x86 operand encoding, which should stay within
+        // [-128; +127] to have the 8-bit displacement encoding.
+        //
         // Any change to this code may need to revisit other places in
         // the code where this idiom is used, in particular the
         // orderAccess code.
+
+        int offset = -VM_Version::L1_line_size();
+        if (offset < -128) {
+          offset = -128;
+        }
+
         lock();
-        addl(Address(rsp, 0), 0);// Assert the lock# signal here
+        addl(Address(rsp, offset), 0);// Assert the lock# signal here
       }
     }
   }
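
For illustration only (not part of the patch): a minimal standalone C++ sketch of the
clamping logic in the hunk above, showing why the offset is capped at -128. The
l1_line_size parameter is a hypothetical stand-in for VM_Version::L1_line_size();
any line size above 128 bytes would not fit the signed 8-bit (disp8) displacement
of the [rsp+disp8] addressing mode, so the offset falls back to -128.

// Standalone sketch mirroring the offset computation from the patch.
// "l1_line_size" is a hypothetical stand-in for VM_Version::L1_line_size().
#include <cstdio>

static int storeload_offset(int l1_line_size) {
  int offset = -l1_line_size;
  if (offset < -128) {
    offset = -128;  // stay within the 8-bit (disp8) displacement encoding
  }
  return offset;
}

int main() {
  const int line_sizes[] = { 32, 64, 128, 256 };
  for (int line : line_sizes) {
    // e.g. a 64-byte L1 line yields "lock addl $0, -64(%rsp)"
    printf("L1 line %3d bytes -> lock addl $0, %d(%%rsp)\n",
           line, storeload_offset(line));
  }
  return 0;
}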