/*
 * Copyright (c) 2016, 2018, Red Hat, Inc. All rights reserved.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#define SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP

#include "gc/shared/taskqueue.hpp"
#include "gc/shared/taskqueue.inline.hpp"
#include "runtime/mutex.hpp"
#include "runtime/thread.hpp"

template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  BufferedOverflowTaskQueue() : _buf_empty(true) {};

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t into the queue. Returns true on success.
  inline bool push(E t);

  // Attempt to pop from the queue. Returns true on success.
  inline bool pop(E &t);

  inline void clear()  {
    _buf_empty = true;
    taskqueue_t::set_empty();
    taskqueue_t::overflow_stack()->clear();
  }

  inline bool is_empty()        const {
    return _buf_empty && taskqueue_t::is_empty();
  }

private:
  bool _buf_empty;
  E _elem;
};
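
// Illustrative usage sketch only (not part of this header): the single-element
// buffer (_elem/_buf_empty) fronts the underlying OverflowTaskQueue, so pushes
// and pops go through the buffer first. The element type below is hypothetical.
//
//   BufferedOverflowTaskQueue<oop, mtGC> q;
//   q.push(obj);              // held in the buffer, handed to the queue on a later push
//   oop t;
//   while (q.pop(t)) {
//     ...                     // process t
//   }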

// ObjArrayChunkedTask
//
// Encodes both regular oops and array oops with chunking data for parallel array processing.
// The design goal is to make handling of regular oops very fast, because that is the prevailing
// case. At the same time, it should not prevent parallel array processing from efficiently
// dividing the array work.
//
// The idea is to steal bits from the 64-bit oop to encode the array data when needed. For
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out that the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where the block
// is assumed to have size 2^pow. This requires only 5 bits for pow (2^32) to encode
// all possible arrays.
//
//    |---------oop---------|-pow-|--chunk---|
//    0                    49     54        64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
//    This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
//    with zero padding. Testing for "chunkedness" is testing for zero with chunk mask.
//
// b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers
//    interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
//      <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
//      <2*C, P-1>,     that covers interval [ (2*C - 1)*2^(P-1);       2*C*2^(P-1) )
//
//    Observe that the union of these two intervals is:
//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
//    ...which is the original interval:
//      [ (C-1)*2^P; C*2^P )
//
// c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split
//    down in the parallel threads, which alleviates the upfront (serial) splitting costs.
//
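// A worked example of (b), added for illustration: chunk <3, 4> covers the interval
// [ 2*2^4; 3*2^4 ) = [ 32; 48 ). Splitting it yields <5, 3>, covering [ 32; 40 ),
// and <6, 3>, covering [ 40; 48 ), whose union is exactly the original interval.
//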
// The encoding limits imposed by the current bit layout are:
//    10 bits for chunk: max 1024 blocks per array
//     5 bits for power: max 2^32 array
//    49 bits for   oop: max 512 TB of addressable space
//
// Stealing bits from the oop trims down the addressable space. Stealing too few bits for chunk ID limits
// potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled.
// In the future, these might be rebalanced to favor one degree of freedom over another. For example,
// if/when Arrays 2.0 brings 2^64-sized arrays, we might need to steal another bit for power. We could regain
// some bits if chunks are counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields when we do not have enough space to steal the
// bits from the native pointer. It is useful for debugging the _LP64 version.
//

#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning( disable:4522 )
#endif

#ifdef _LP64
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits   = 10,
    pow_bits     = 5,
    oop_bits     = sizeof(uintptr_t)*8 - chunk_bits - pow_bits,
  };
  enum {
    oop_shift    = 0,
    pow_shift    = oop_shift + oop_bits,
    chunk_shift  = pow_shift + pow_bits,
  };

public:
  ObjArrayChunkedTask(oop o = NULL) {
    _obj = ((uintptr_t)(void*) o) << oop_shift;
  }
  ObjArrayChunkedTask(oop o, int chunk, int mult) {
    assert(0 <= chunk && chunk < nth_bit(chunk_bits), "chunk is sane: %d", chunk);
    assert(0 <= mult && mult < nth_bit(pow_bits), "pow is sane: %d", mult);
    uintptr_t t_b = ((uintptr_t) chunk) << chunk_shift;
    uintptr_t t_m = ((uintptr_t) mult) << pow_shift;
    uintptr_t obj = (uintptr_t)(void*)o;
    assert(obj < nth_bit(oop_bits), "obj ref is sane: " PTR_FORMAT, obj);
    intptr_t t_o = obj << oop_shift;
    _obj = t_o | t_m | t_b;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<uintptr_t&>(_obj = t._obj);
    return *this;
  }

  inline oop obj()   const { return (oop) reinterpret_cast<void*>((_obj >> oop_shift) & right_n_bits(oop_bits)); }
  inline int chunk() const { return (int) (_obj >> chunk_shift) & right_n_bits(chunk_bits); }
  inline int pow()   const { return (int) ((_obj >> pow_shift) & right_n_bits(pow_bits)); }
  inline bool is_not_chunked() const { return (_obj & ~right_n_bits(oop_bits + pow_bits)) == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  static size_t max_addressable() {
    return nth_bit(oop_bits);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  uintptr_t _obj;
};
#else
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits  = 10,
    pow_bits    = 5,
  };
public:
  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
    assert(0 <= chunk && chunk < nth_bit(chunk_bits), "chunk is sane: %d", chunk);
    assert(0 <= pow && pow < nth_bit(pow_bits), "pow is sane: %d", pow);
    _chunk = chunk;
    _pow = pow;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj()   const { return _obj; }
  inline int chunk() const { return _chunk; }
  inline int pow()   const { return _pow; }

  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  static size_t max_addressable() {
    return sizeof(oop);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  oop _obj;
  int _chunk;
  int _pow;
};
#endif

#ifdef _MSC_VER
#pragma warning(pop)
#endif

typedef ObjArrayChunkedTask ShenandoahMarkTask;
typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;
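
// Illustrative sketch only (not part of this header): how a worker might split a
// chunked task into the two halves described in the encoding comment above before
// pushing them back onto its queue. The helper name is hypothetical.
//
//   void example_split(const ShenandoahMarkTask& t, ShenandoahObjToScanQueue* q) {
//     assert(!t.is_not_chunked(), "only chunked tasks are split");
//     int chunk = t.chunk();
//     int pow   = t.pow();
//     if (pow > 0) {
//       q->push(ShenandoahMarkTask(t.obj(), 2 * chunk - 1, pow - 1)); // left half
//       q->push(ShenandoahMarkTask(t.obj(), 2 * chunk,     pow - 1)); // right half
//     }
//   }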

template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile jint));
  volatile jint     _claimed_index;
  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, 0);

  debug_only(uint   _reserved;  )

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0; )
  }

  void clear_claimed() { _claimed_index = 0; }
  T*   claim_next();

  // Reserve queues that are not subject to parallel claiming.
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};

template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  if (_claimed_index >= size) {
    return NULL;
  }

  jint index = Atomic::add(1, &_claimed_index);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return NULL;
  }
}
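
// Illustrative sketch only (not part of this header): a typical claiming loop, where
// each worker repeatedly claims a queue until the set is exhausted. The surrounding
// names (queue_set, do_work) are hypothetical.
//
//   queue_set->clear_claimed();          // once, before the parallel phase
//   ...
//   while (ShenandoahObjToScanQueue* q = queue_set->claim_next()) {
//     do_work(q);                        // process the claimed queue
//   }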

class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}

  bool is_empty();
  void clear();

#if TASKQUEUE_STATS
  static void print_taskqueue_stats_hdr(outputStream* const st);
  void print_taskqueue_stats() const;
  void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};

class ShenandoahTerminatorTerminator : public TerminatorTerminator {
public:
  // Returning true terminates the protocol immediately, even if there is remaining work left.
  virtual bool should_force_termination() { return false; }
};

/*
 * This is an enhanced implementation of Google's work stealing
 * protocol, which is described in the paper:
 * Understanding and improving JVM GC work stealing at the data center scale
 * (http://dl.acm.org/citation.cfm?id=2926706)
 *
 * Instead of using a dedicated spin-master, our implementation lets the spin-master
 * relinquish the role before it goes to sleep/wait, allowing a newly arrived thread
 * to compete for the role. The intent of this enhancement is to reduce the spin-master's
 * latency in detecting new tasks to steal and in detecting the termination condition.
 */
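
// Illustrative sketch only (not part of this header): the usual shape of a worker's
// termination handshake with the terminator declared below. The loop body and the
// my_checks object (a ShenandoahTerminatorTerminator subclass) are hypothetical.
//
//   while (true) {
//     ...                                             // drain local queue, try to steal
//     if (terminator.offer_termination(&my_checks)) { // true: all workers agree no work is left
//       break;
//     }
//   }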

class ShenandoahTaskTerminator: public ParallelTaskTerminator {
private:
  Monitor*    _blocker;
  Thread*     _spin_master;

public:
  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set) :
    ParallelTaskTerminator(n_threads, queue_set), _spin_master(NULL) {
    _blocker = new Monitor(Mutex::leaf, "ShenandoahTaskTerminator", false, Monitor::_safepoint_check_never);
  }

  ~ShenandoahTaskTerminator() {
    assert(_blocker != NULL, "Can not be NULL");
    delete _blocker;
  }

  bool offer_termination(ShenandoahTerminatorTerminator* terminator);
  bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); }

private:
  bool offer_termination(TerminatorTerminator* terminator) {
    ShouldNotReachHere();
    return false;
  }

private:
  size_t tasks_in_queue_set() { return _queue_set->tasks(); }

  /*
   * Perform the spin-master task.
   * Returns true if the termination condition is detected,
   * otherwise returns false.
   */
  bool do_spin_master_work(ShenandoahTerminatorTerminator* terminator);
};

class ShenandoahCancelledTerminatorTerminator : public ShenandoahTerminatorTerminator {
  virtual bool should_exit_termination() {
    return false;
  }
  virtual bool should_force_termination() {
    return true;
  }
};

#endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP