New src/share/vm/gc/shared/memset_with_concurrent_readers

   1 /*
   2  * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifdef SPARC
  26 
  27 #include "precompiled.hpp"
  28 
  29 #include "gc/shared/memset_with_concurrent_readers.hpp"
  30 #include "runtime/prefetch.inline.hpp"
  31 #include "utilities/align.hpp"
  32 #include "utilities/debug.hpp"
  33 #include "utilities/globalDefinitions.hpp"
  34 #include "utilities/macros.hpp"
  35 
  36 #if INCLUDE_ALL_GCS
  37 
  38 // An implementation of memset, for use when there may be concurrent
  39 // readers of the region being stored into.
  40 //
  41 // We can't use the standard library memset if it is implemented using
  42 // block initializing stores.  Doing so can result in concurrent readers
  43 // seeing spurious zeros.
  44 //
  45 // We can't use the obvious C/C++ for-loop, because the compiler may
  46 // recognize the idiomatic loop and optimize it into a call to the
  47 // standard library memset; we've seen exactly this happen with, for
  48 // example, Solaris Studio 12.3.  Hence the use of inline assembly
  49 // code, hiding loops from the compiler's optimizer.
  50 //
  51 // We don't attempt to use the standard library memset when it is safe
  52 // to do so.  We could conservatively do so by detecting the presence
  53 // of block initializing stores (VM_Version::has_blk_init()), but the
  54 // implementation provided here should be sufficient.
  55 
  56 inline void fill_subword(void* start, void* end, int value) {
       // Stores the low-order byte of value into every byte of [start, end).
       // The range must be shorter than one 8-byte word (see the assert below),
       // so at most seven bytes are written.
       //
       // There is no loop here.  The asm ends in a ladder of seven byte stores
       // addressed backwards from end (end[-7] .. end[-1]); a computed jump
       // skips the first (7 - (end - start)) of them, so exactly (end - start)
       // stores execute and they cover start[0] .. end[-1].  Only single-byte
       // stb instructions touch the destination.
  57   STATIC_ASSERT(BytesPerWord == 8);
  58   assert(pointer_delta(end, start, 1) < (size_t)BytesPerWord, "precondition");
  59   // Dispatch on (end - start).
  60   void* pc;
  61   __asm__ volatile(
         // %[offset] is bound to start as a read-write operand, so the register
         // holding start is reused as scratch for the jump offset; start no
         // longer holds the original pointer once the asm has run.
         //
         // The offset below is counted in instructions and then scaled to bytes:
         // (end - start) == 7 lands on the first stb, (end - start) == 0 lands
         // just past the last one.
  62     // offset := (7 - (end - start)) + 3
  63     //   3 instructions from rdpc to DISPATCH
  64     " sub %[offset], %[end], %[offset]\n\t" // offset := start - end
  65     " sllx %[offset], 2, %[offset]\n\t" // scale offset for instruction size of 4
  66     " add %[offset], 40, %[offset]\n\t" // offset += 10 * instruction size
  67     " rd %%pc, %[pc]\n\t"               // dispatch on scaled offset
  68     " jmpl %[pc]+%[offset], %%g0\n\t"   // jump to pc + offset; %g0 discards the link value
  69     "  nop\n\t"                         // delay slot of the jmpl
  70     // DISPATCH: no direct reference, but without it the store block may be elided.
  71     "1:\n\t"
         // Store ladder: entering k instructions below this point runs the
         // remaining (7 - k) byte stores.
  72     " stb %[value], [%[end]-7]\n\t" // end[-7] = value
  73     " stb %[value], [%[end]-6]\n\t"
  74     " stb %[value], [%[end]-5]\n\t"
  75     " stb %[value], [%[end]-4]\n\t"
  76     " stb %[value], [%[end]-3]\n\t"
  77     " stb %[value], [%[end]-2]\n\t"
  78     " stb %[value], [%[end]-1]\n\t" // end[-1] = value
  79     : /* only temporaries/overwritten outputs */
  80       [pc] "=&r" (pc),               // temp
  81       [offset] "+&r" (start)         // in: start pointer; out: scaled jump offset
  82     : [end] "r" (end),
  83       [value] "r" (value)
  84     : "memory");                     // the stores write memory not named by any operand
  85 }
  86 
  87 void memset_with_concurrent_readers(void* to, int value, size_t size) {
       // Sets the size bytes starting at to to the low-order byte of value,
       // using only explicit byte (stb) and 64-bit word (stx) stores issued
       // from inline assembly.
       //
       // When size is at least one word the region is handled in three parts:
       //   1. a sub-word prefix up to the first word boundary (fill_subword),
       //   2. the whole words in between (unrolled stx loop plus an stx ladder
       //      for the last 0..7 words),
       //   3. a sub-word suffix after the last word boundary (fill_subword).
       // When size is less than one word only the final fill_subword call runs
       // and it covers the entire region.
       //
       // NOTE(review): Prefetch::write is declared elsewhere; it is presumably
       // a write-prefetch hint for the start of the region.
  88   Prefetch::write(to, 0);
  89   void* end = static_cast<char*>(to) + size;
  90   if (size >= (size_t)BytesPerWord) {
  91     // Fill any partial word prefix.
         // NOTE(review): align_up / align_down are declared elsewhere; they are
         // expected to round to the next / previous BytesPerWord boundary, which
         // is what fill_subword's "less than one word" precondition relies on.
  92     uintx* aligned_to = static_cast<uintx*>(align_up(to, BytesPerWord));
  93     fill_subword(to, aligned_to, value);
  94 
  95     // Compute fill word.
         // The low byte of value is replicated into all eight bytes of xvalue by
         // doubling the populated width: 1 -> 2 -> 4 -> 8 bytes.
  96     STATIC_ASSERT(BitsPerByte == 8);
  97     STATIC_ASSERT(BitsPerWord == 64);
  98     uintx xvalue = value & 0xff;
  99     xvalue |= (xvalue << 8);
 100     xvalue |= (xvalue << 16);
 101     xvalue |= (xvalue << 32);
 102 
 103     uintx* aligned_end = static_cast<uintx*>(align_down(end, BytesPerWord));
 104     assert(aligned_to <= aligned_end, "invariant");
 105 
         // The asm below is the hand-written equivalent of this loop:
 106     // for ( ; aligned_to < aligned_end; ++aligned_to) {
 107     //   *aligned_to = xvalue;
 108     // }
 109     uintptr_t temp;
 110     __asm__ volatile(
           // %[ato] is bound to aligned_to as a read-write operand: it is the
           // running store pointer in the loop and is then reused as the jump
           // offset for the tail, so aligned_to is not a usable pointer afterwards.
           // %[temp] first holds the remaining byte count, then the loop limit,
           // and finally the pc value used for the tail dispatch.
 111       // Unroll loop x8.
 112       " sub %[aend], %[ato], %[temp]\n\t" // temp := remaining bytes
 113       " cmp %[temp], 56\n\t"           // cc := (aligned_end - aligned_to) > 7 words
 114       " ba %%xcc, 2f\n\t"              // goto TEST always
 115       "  sub %[aend], 56, %[temp]\n\t" // limit := aligned_end - 7 words
 116       // LOOP:
           // On entry %[ato] has already been advanced by 64 in the delay slot
           // of the branch at TEST, hence the negative store displacements.  The
           // cmp at the top sets the condition codes consumed by TEST; the
           // stx instructions in between leave them unchanged.
 117       "1:\n\t"                         // unrolled x8 store loop top
 118       " cmp %[temp], %[ato]\n\t"       // cc := limit > (next) aligned_to
 119       " stx %[xvalue], [%[ato]-64]\n\t" // store 8 words, aligned_to pre-incremented
 120       " stx %[xvalue], [%[ato]-56]\n\t"
 121       " stx %[xvalue], [%[ato]-48]\n\t"
 122       " stx %[xvalue], [%[ato]-40]\n\t"
 123       " stx %[xvalue], [%[ato]-32]\n\t"
 124       " stx %[xvalue], [%[ato]-24]\n\t"
 125       " stx %[xvalue], [%[ato]-16]\n\t"
 126       " stx %[xvalue], [%[ato]-8]\n\t"
 127       // TEST:
           // The branch is annulling (",a"): the add in its delay slot executes
           // only when the branch is taken, so on fall-through %[ato] still
           // points at the first word that has not been stored yet.
 128       "2:\n\t"
 129       " bgu,a %%xcc, 1b\n\t"           // goto LOOP if more than 7 words remaining
 130       "  add %[ato], 64, %[ato]\n\t"   // aligned_to += 8, for next iteration
 131       // Fill remaining < 8 full words.
 132       // Dispatch on (aligned_end - aligned_to).
 133       // offset := (7 - (aligned_end - aligned_to)) + 3
 134       //   3 instructions from rdpc to DISPATCH
           // The difference is in bytes (8 per word) while each stx in the ladder
           // is one 4-byte instruction, so the byte difference is halved; srax
           // is an arithmetic shift, which keeps the (non-positive) sign.
 135       " sub %[ato], %[aend], %[ato]\n\t" // offset := aligned_to - aligned_end
 136       " srax %[ato], 1, %[ato]\n\t"      // scale offset for instruction size of 4
 137       " add %[ato], 40, %[ato]\n\t"      // offset += 10 * instruction size
 138       " rd %%pc, %[temp]\n\t"            // dispatch on scaled offset
 139       " jmpl %[temp]+%[ato], %%g0\n\t"   // jump to pc + offset; %g0 discards the link value
 140       "  nop\n\t"                        // delay slot of the jmpl
 141       // DISPATCH: no direct reference, but without it the store block may be elided.
 142       "3:\n\t"
           // Store ladder: with w whole words remaining (0..7) the jump skips the
           // first (7 - w) stores, so aligned_end[-w] .. aligned_end[-1] are written.
 143       " stx %[xvalue], [%[aend]-56]\n\t" // aligned_end[-7] = xvalue
 144       " stx %[xvalue], [%[aend]-48]\n\t"
 145       " stx %[xvalue], [%[aend]-40]\n\t"
 146       " stx %[xvalue], [%[aend]-32]\n\t"
 147       " stx %[xvalue], [%[aend]-24]\n\t"
 148       " stx %[xvalue], [%[aend]-16]\n\t"
 149       " stx %[xvalue], [%[aend]-8]\n\t"  // aligned_end[-1] = xvalue
 150       : /* only temporaries/overwritten outputs */
 151         [temp] "=&r" (temp),
 152         [ato] "+&r" (aligned_to)     // in: first aligned word; out: scaled jump offset
 153       : [aend] "r" (aligned_end),
 154         [xvalue] "r" (xvalue)
 155       : "cc", "memory");             // cmp sets the condition codes; the stores write memory
         // aligned_to was overwritten by the asm, so the suffix start is taken
         // from aligned_end instead.
 156     to = aligned_end;           // setup for suffix
 157   }
 158   // Fill any partial word suffix.  Also the prefix if size < BytesPerWord.
 159   fill_subword(to, end, value);
 160 }
 161 
 162 #endif // INCLUDE_ALL_GCS
 163 
 164 #endif // SPARC