/*
 * Copyright (c) 2016, Red Hat, Inc. and/or its affiliates.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_VM_GC_SHENANDOAH_SHENANDOAH_TASKQUEUE_HPP
#define SHARE_VM_GC_SHENANDOAH_SHENANDOAH_TASKQUEUE_HPP

#include "memory/padded.hpp"
#include "utilities/taskqueue.hpp"
#include "runtime/mutex.hpp"

class Thread;

template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  BufferedOverflowTaskQueue() : _buf_empty(true) {}

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t onto:
  //   - first, the single-element buffer;
  //   - then, the regular task queue;
  //   - then, the overflow stack.
  // Always returns true: the overflow stack accepts whatever the queue cannot.
  inline bool push(E t);

  // Attempt to pop from the buffer; return true if anything was popped.
  inline bool pop_buffer(E &t);

  inline void clear_buffer()  { _buf_empty = true; }
  inline bool buffer_empty()  const { return _buf_empty; }
  inline bool is_empty()      const {
    return taskqueue_t::is_empty() && buffer_empty();
  }

private:
  bool _buf_empty;
  E _elem;
};
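
// A minimal usage sketch, for illustration only. pop_local() and pop_overflow()
// are inherited from OverflowTaskQueue; "SomeTask" is a placeholder element type:
//
//   BufferedOverflowTaskQueue<SomeTask, mtGC> q;
//   q.push(task);                  // never fails: buffer, then queue, then overflow stack
//
//   SomeTask t;
//   bool got = q.pop_buffer(t)     // cheapest: the one-element buffer
//           || q.pop_local(t)      // then the regular task queue
//           || q.pop_overflow(t);  // finally the overflow stack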

// ObjArrayChunkedTask
//
// Encodes both regular oops and array oops plus chunking data for parallel array processing.
// The design goal is to make regular oop operations very fast, because that is the prevailing
// case. At the same time, it should not prevent parallel array processing from efficiently
// dividing the array work.
//
// The idea is to steal bits from the 64-bit oop to encode array data when needed. For
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
// that the block has a size of 2^pow. This requires only 5 bits for pow (2^32) to encode
// all possible arrays.
//
//    |---------oop---------|-pow-|--chunk---|
//    0                    49     54        64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
//    This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
//    with zero padding. Testing for "chunkedness" is testing for zero with the chunk mask.
//
// b) Splitting tasks for divide-and-conquer is possible. Suppose we have a chunk <C, P> that covers
//    the interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
//      <2*C - 1, P-1>, that covers the interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
//      <2*C, P-1>,     that covers the interval [ (2*C - 1)*2^(P-1);       2*C*2^(P-1) )
//
//    Observe that the union of these two intervals is:
//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
//    ...which is the original interval:
//      [ (C-1)*2^P; C*2^P )
//
//    (A worked numeric example follows below.)
//
// c) The divide-and-conquer strategy can even start with chunk <1, round-log2-len(arr)> and split
//    down in the parallel threads, which alleviates the upfront (serial) splitting costs.
//
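//    A worked example of the split in (b), with numbers chosen purely for illustration:
//    chunk <3, 4> covers [ (3-1)*2^4; 3*2^4 ) = [ 32; 48 ), and splits into
//      <5, 3>, covering [ 4*2^3; 5*2^3 ) = [ 32; 40 )
//      <6, 3>, covering [ 5*2^3; 6*2^3 ) = [ 40; 48 )
//    whose union is again [ 32; 48 ).
//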
// Encoding limitations caused by the current bit scales:
//    10 bits for chunk: max 1024 blocks per array
//     5 bits for power: max 2^32 array
//    49 bits for   oop: max 512 TB of addressable space
//
// Stealing bits from the oop trims down the addressable space. Stealing too few bits for the chunk ID
// limits potential parallelism. Stealing too few bits for pow limits the maximum array size that can be
// handled. In the future, these might be rebalanced to favor one degree of freedom over another. For example,
// if/when Arrays 2.0 brings 2^64-sized arrays, we might need to steal another bit for power. We could regain
// some bits back if chunks are counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields, for when we don't have enough space to steal the
// bits from the native pointer. It is also useful for debugging the _LP64 version.
//
#ifdef _LP64
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits   = 10,
    pow_bits     = 5,
    oop_bits     = sizeof(uintptr_t)*8 - chunk_bits - pow_bits,
  };
  enum {
    chunk_size   = nth_bit(chunk_bits),
    pow_size     = nth_bit(pow_bits),
    oop_size     = nth_bit(oop_bits),
  };
  enum {
    oop_shift    = 0,
    pow_shift    = oop_shift + oop_bits,
    chunk_shift  = pow_shift + pow_bits,
  };
  enum {
    oop_mask     = right_n_bits(oop_bits),
    pow_mask     = right_n_bits(pow_bits),
    chunk_mask   = right_n_bits(chunk_bits),
    chunk_mask_unshift = ~right_n_bits(oop_bits + pow_bits),
  };

public:
  ObjArrayChunkedTask(oop o = NULL) {
    _obj = ((uintptr_t)(void*) o) << oop_shift;
  }
  ObjArrayChunkedTask(oop o, int chunk, int mult) {
    assert(0 <= chunk && chunk < chunk_size, err_msg("chunk is sane: %d", chunk));
    assert(0 <= mult && mult < pow_size, err_msg("pow is sane: %d", mult));
    uintptr_t t_b = ((uintptr_t) chunk) << chunk_shift;
    uintptr_t t_m = ((uintptr_t) mult) << pow_shift;
    uintptr_t obj = (uintptr_t)(void*)o;
    assert(obj < oop_size, err_msg("obj ref is sane: " PTR_FORMAT, obj));
    uintptr_t t_o = obj << oop_shift;
    _obj = t_o | t_m | t_b;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<uintptr_t&>(_obj = t._obj);
    return *this;
  }

  inline oop obj()   const { return (oop) reinterpret_cast<void*>((_obj >> oop_shift) & oop_mask); }
  inline int chunk() const { return (int) ((_obj >> chunk_shift) & chunk_mask); }
  inline int pow()   const { return (int) ((_obj >> pow_shift) & pow_mask); }
  inline bool is_not_chunked() const { return (_obj & chunk_mask_unshift) == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

private:
  uintptr_t _obj;
};
#else
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits  = 10,
    pow_bits    = 5,
  };
  enum {
    chunk_size  = nth_bit(chunk_bits),
    pow_size    = nth_bit(pow_bits),
  };
public:
  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
    assert(0 <= chunk && chunk < chunk_size, err_msg("chunk is sane: %d", chunk));
    assert(0 <= pow && pow < pow_size, err_msg("pow is sane: %d", pow));
    _chunk = chunk;
    _pow = pow;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj()   const { return _obj; }
  inline int chunk() const { return _chunk; }
  inline int pow()   const { return _pow; }

  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

private:
  oop _obj;
  int _chunk;
  int _pow;
};
#endif

typedef ObjArrayChunkedTask SCMTask;
typedef BufferedOverflowTaskQueue<SCMTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> SCMObjToScanQueue;
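
// Illustrative sketch (not part of this interface) of how a marking loop might
// consume chunked tasks, using only the encoding rules documented above. The
// scan_* helpers and the MIN_SPLIT_POW threshold are hypothetical:
//
//   void do_task(SCMObjToScanQueue* q, SCMTask t) {
//     if (t.is_not_chunked()) {
//       scan_object(t.obj());                             // plain oop task
//     } else if (t.pow() > MIN_SPLIT_POW) {
//       // Split <C, P> into <2C-1, P-1> and <2C, P-1>: keep one half, queue the other.
//       q->push(SCMTask(t.obj(), 2 * t.chunk() - 1, t.pow() - 1));
//       do_task(q, SCMTask(t.obj(), 2 * t.chunk(), t.pow() - 1));
//     } else {
//       scan_array_chunk(t.obj(), t.chunk(), t.pow());    // process [ (C-1)*2^P; C*2^P )
//     }
//   }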

template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  volatile jint     _claimed_index;
  debug_only(uint   _reserved;  )

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n) {
    debug_only(_reserved = 0; )
  }

  void clear_claimed() { _claimed_index = 0; }
  T*   claim_next();

  // Reserve the first n queues so that they are excluded from parallel claiming.
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};


template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  if (_claimed_index >= size) {
    return NULL;
  }

  // Atomic::add returns the post-increment value, so the queue slot this thread
  // owns is (index - 1). Threads that race past the end simply get NULL.
  jint index = Atomic::add(1, &_claimed_index);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return NULL;
  }
}
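
// Typical claiming pattern for a worker thread (illustrative only):
//
//   queues->clear_claimed();             // once, by the coordinator, before the parallel phase
//   ...
//   while (T* q = queues->claim_next()) {
//     // drain q
//   }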

class SCMObjToScanQueueSet: public ParallelClaimableQueueSet<SCMObjToScanQueue, mtGC> {

public:
  SCMObjToScanQueueSet(int n) : ParallelClaimableQueueSet<SCMObjToScanQueue, mtGC>(n) {
  }

  bool is_empty();

  void clear();
};


/*
 * This is an enhanced implementation of Google's work stealing
 * protocol, described in the paper:
 * Understanding and improving JVM GC work stealing at the data center scale
 * (http://dl.acm.org/citation.cfm?id=2926706)
 *
 * Instead of relying on a dedicated spin-master, our implementation lets the
 * spin-master relinquish the role before it goes to sleep/wait, so that a newly
 * arrived thread can compete for the role. The intention of this enhancement is
 * to reduce the spin-master's latency in detecting new tasks available for
 * stealing, and in detecting the termination condition.
 */

class ShenandoahTaskTerminator: public ParallelTaskTerminator {
private:
  Monitor*    _blocker;
  Thread*     _spin_master;

public:
  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set) :
    ParallelTaskTerminator(n_threads, queue_set), _spin_master(NULL) {
    _blocker = new Monitor(Mutex::leaf, "ShenandoahTaskTerminator", false);
  }

  bool offer_termination(TerminatorTerminator* terminator);

private:
  size_t tasks_in_queue_set() { return _queue_set->tasks(); }

  /*
   * Perform the spin-master task.
   * Returns true if the termination condition is detected,
   * otherwise returns false.
   */
  bool do_spin_master_work(TerminatorTerminator* terminator);
};

class ShenandoahCancelledTerminatorTerminator : public TerminatorTerminator {
  virtual bool should_exit_termination() {
    return false;
  }
  virtual bool should_force_termination() {
    return true;
  }
};

#endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAH_TASKQUEUE_HPP