--- /dev/null 2015-08-19 12:23:38.462857388 -0400 +++ new/src/cpu/sparc/vm/memset_with_concurrent_readers_sparc.cpp 2015-08-19 16:27:34.595434050 -0400 @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" + +#include "gc/shared/memset_with_concurrent_readers.hpp" +#include "runtime/prefetch.inline.hpp" +#include "utilities/debug.hpp" +#include "utilities/globalDefinitions.hpp" +#include "utilities/macros.hpp" + +#if INCLUDE_ALL_GCS + +// An implementation of memset, for use when there may be concurrent +// readers of the region being stored into. +// +// We can't use the standard library memset if it is implemented using +// block initializing stores. Doing so can result in concurrent readers +// seeing spurious zeros. +// +// We can't use the obvious C/C++ for-loop, because the compiler may +// recognize the idiomatic loop and optimize it into a call to the +// standard library memset; we've seen exactly this happen with, for +// example, Solaris Studio 12.3. Hence the use of inline assembly +// code, hiding loops from the compiler's optimizer. +// +// We don't attempt to use the standard library memset when it is safe +// to do so. We could conservatively do so by detecting the presence +// of block initializing stores (VM_Version::has_blk_init()), but the +// implementation provided here should be sufficient. + +static void fill_subword(void* start, void* end, int value); + +void memset_with_concurrent_readers(void* to, int value, size_t size) { + Prefetch::write(to, 0); + void* end = static_cast(to) + size; + if (size >= BytesPerWord) { + // Fill any partial word prefix. + uintx* aligned_to = static_cast(align_ptr_up(to, BytesPerWord)); + fill_subword(to, aligned_to, value); + + // Compute fill word. + STATIC_ASSERT(BitsPerByte == 8); + STATIC_ASSERT(BitsPerWord == 64); + uintx xvalue = value & 0xff; + xvalue |= (xvalue << 8); + xvalue |= (xvalue << 16); + xvalue |= (xvalue << 32); + + uintx* aligned_end = static_cast(align_ptr_down(end, BytesPerWord)); + assert(aligned_to <= aligned_end, "invariant"); + + // for ( ; aligned_to < aligned_end; ++aligned_to) { + // *aligned_to = xvalue; + // } + uintptr_t temp; + __asm__ volatile( + // Unroll loop x8. + " sub %[aend], %[ato], %[temp]\n\t" + " cmp %[temp], 56\n\t" // cc := (aligned_end - aligned_to) > 7 words + " ba %xcc, 2f\n\t" // goto TEST always + " sub %[aend], 56, %[temp]\n\t" // limit := aligned_end - 7 words + // LOOP: + "1:\n\t" // unrolled x8 store loop top + " cmp %[temp], %[ato]\n\t" // cc := limit > (next) aligned_to + " stx %[xvalue], [%[ato]-64]\n\t" // store 8 words, aligned_to pre-incremented + " stx %[xvalue], [%[ato]-56]\n\t" + " stx %[xvalue], [%[ato]-48]\n\t" + " stx %[xvalue], [%[ato]-40]\n\t" + " stx %[xvalue], [%[ato]-32]\n\t" + " stx %[xvalue], [%[ato]-24]\n\t" + " stx %[xvalue], [%[ato]-16]\n\t" + " stx %[xvalue], [%[ato]-8]\n\t" + // TEST: + "2:\n\t" + " bgu,a %xcc, 1b\n\t" // goto LOOP if more than 7 words remaining + " add %[ato], 64, %[ato]\n\t" // aligned_to += 8, for next iteration + // Fill remaining < 8 full words. + // Dispatch on (aligned_end - aligned_to). + // offset := (7 - (aligned_end - aligned_to)) + 3 + // 3 instructions from rdpc to DISPATCH + " sub %[ato], %[aend], %[ato]\n\t" // offset := aligned_to - aligned_end + " srax %[ato], 1, %[ato]\n\t" // scale offset for instruction size of 4 + " add %[ato], 40, %[ato]\n\t" // offset += 10 * instruction size + " rd %pc, %[temp]\n\t" // dispatch on scaled offset + " jmpl %[temp]+%[ato], %g0\n\t" + " nop\n\t" + // DISPATCH: no direct reference, but without it the store block may be elided. + "3:\n\t" + " stx %[xvalue], [%[aend]-56]\n\t" // aligned_end[-7] = xvalue + " stx %[xvalue], [%[aend]-48]\n\t" + " stx %[xvalue], [%[aend]-40]\n\t" + " stx %[xvalue], [%[aend]-32]\n\t" + " stx %[xvalue], [%[aend]-24]\n\t" + " stx %[xvalue], [%[aend]-16]\n\t" + " stx %[xvalue], [%[aend]-8]\n\t" // aligned_end[-1] = xvalue + : /* no outputs */ + : [ato] "&+r" (aligned_to), + [aend] "r" (aligned_end), + [xvalue] "r" (xvalue), + [temp] "&=r" (temp) + : "cc", "memory"); + to = aligned_end; // setup for suffix + } + // Fill any partial word suffix. Also the prefix if size < BytesPerWord. + fill_subword(to, end, value); +} + +static void fill_subword(void* start, void* end, int value) { + STATIC_ASSERT(BytesPerWord == 8); + assert(pointer_delta(end, start, 1) < BytesPerWord, "precondition"); + // Dispatch on (end - start). + void* pc; + __asm__ volatile( + // offset := (7 - (end - start)) + 3 + // 3 instructions from rdpc to DISPATCH + " sub %[offset], %[end], %[offset]\n\t" // offset := start - end + " sllx %[offset], 2, %[offset]\n\t" // scale offset for instruction size of 4 + " add %[offset], 40, %[offset]\n\t" // offset += 10 * instruction size + " rd %pc, %[pc]\n\t" // dispatch on scaled offset + " jmpl %[pc]+%[offset], %g0\n\t" + " nop\n\t" + // DISPATCH: no direct reference, but without it the store block may be elided. + "1:\n\t" + " stb %[value], [%[end]-7]\n\t" // end[-7] = value + " stb %[value], [%[end]-6]\n\t" + " stb %[value], [%[end]-5]\n\t" + " stb %[value], [%[end]-4]\n\t" + " stb %[value], [%[end]-3]\n\t" + " stb %[value], [%[end]-2]\n\t" + " stb %[value], [%[end]-1]\n\t" // end[-1] = value + : /* no outputs */ + : [offset] "&+r" (start), + [end] "r" (end), + [value] "r" (value), + [pc] "&=r" (pc) + : "memory"); +} + +#endif // INCLUDE_ALL_GCS