/*
 * Copyright (c) 2016, 2019, Red Hat, Inc. All rights reserved.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#define SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP

#include "gc/shared/owstTaskTerminator.hpp"
#include "gc/shared/taskqueue.hpp"
#include "memory/allocation.hpp"
#include "runtime/atomic.hpp"
#include "runtime/mutex.hpp"
#include "runtime/thread.hpp"

template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  BufferedOverflowTaskQueue() : _buf_empty(true) {}

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t into the queue. Returns true on success.
  inline bool push(E t);

  // Attempt to pop from the queue. Returns true on success.
  inline bool pop(E &t);

  // Clear both the one-element buffer and the underlying queue.
  inline void clear();

  inline bool is_empty()        const {
    return _buf_empty && taskqueue_t::is_empty();
  }

private:
  // One-element buffer in front of the queue, and its empty status.
  bool _buf_empty;
  E _elem;
};
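
// Illustrative usage sketch (not part of this header's contract): the queue
// behaves like a regular OverflowTaskQueue, with a one-element buffer in
// front of it that pop() drains first.
//
//   BufferedOverflowTaskQueue<Task, mtGC> q;
//   q.initialize();              // from the underlying task queue
//   q.push(task);                // may land in _elem instead of the queue
//   Task t;
//   while (q.pop(t)) { /* process t */ }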

// ObjArrayChunkedTask
//
// Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
// The design goal is to make the regular oop ops very fast, because that would be the prevailing
// case. On the other hand, it should not block parallel array processing from efficiently dividing
// the array work.
//
// The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
// that the block has the size of 2^pow. With this encoding, pow needs only 5 bits (2^32) to cover
// all possible arrays.
//
//    |---------oop---------|-pow-|--chunk---|
//    0                    49     54        64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
//    This helps the most common path. The initialization amounts to putting the oop into the word
//    with zero padding. Testing for "chunkedness" amounts to testing the chunk bits for zero.
//
// b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers
//    interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
//      <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
//      <2*C, P-1>,     that covers interval [ (2*C - 1)*2^(P-1);       2*C*2^(P-1) )
//
//    Observe that the union of these two intervals is:
//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
//    ...which is the original interval:
//      [ (C-1)*2^P; C*2^P )
//
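//    A worked example (illustrative): chunk <3, 4> covers [ 2*2^4; 3*2^4 ) = [ 32; 48 ).
//    Splitting it gives:
//      <5, 3>, covering [ 4*2^3; 5*2^3 ) = [ 32; 40 )
//      <6, 3>, covering [ 5*2^3; 6*2^3 ) = [ 40; 48 )
//    ...whose union is again [ 32; 48 ).
//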
// c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split
//    down in the parallel threads, which alleviates the upfront (serial) splitting costs.
//
// The current bit layout implies these encoding limits:
//    10 bits for chunk: max 1024 blocks per array
//     5 bits for power: max 2^32-element arrays
//    49 bits for   oop: max 512 TB of addressable space
//
// Stealing bits from the oop trims down the addressable space. Stealing too few bits for chunk ID
// limits potential parallelism. Stealing too few bits for pow limits the maximum array size that
// can be handled. In the future, these might be rebalanced to favor one degree of freedom against
// another. For example, if/when Arrays 2.0 brings 2^64-sized arrays, we might need to steal another
// bit for power. We could regain some bits back if chunks are counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields for platforms where we do not have enough
// space to steal the bits from the native pointer. It is also useful for debugging the optimized
// version.
//
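// A worked encoding example (illustrative, using the optimized 64-bit layout below, where
// oop_bits = 49, pow_shift = 49, chunk_shift = 54). Encoding an array oop at address 0x200000
// with chunk = 3 and pow = 10:
//
//   _obj  = 0x200000 | (10ULL << 49) | (3ULL << 54) = 0x00D4000000200000
//
// Decoding it back:
//
//   chunk = (_obj >> 54) & right_n_bits(10) = 3
//   pow   = (_obj >> 49) & right_n_bits(5)  = 10
//   oop   = (_obj >>  0) & right_n_bits(49) = 0x200000
//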

#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning(disable: 4522)
#endif

#ifdef _LP64
#define SHENANDOAH_OPTIMIZED_OBJTASK 1
#else
#define SHENANDOAH_OPTIMIZED_OBJTASK 0
#endif

#if SHENANDOAH_OPTIMIZED_OBJTASK
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits   = 10,
    pow_bits     = 5,
    oop_bits     = sizeof(uintptr_t)*8 - chunk_bits - pow_bits
  };
  enum {
    oop_shift    = 0,
    pow_shift    = oop_shift + oop_bits,
    chunk_shift  = pow_shift + pow_bits
  };

public:
  ObjArrayChunkedTask(oop o = NULL) {
    assert(decode_oop(encode_oop(o)) == o, "oop can be encoded: " PTR_FORMAT, p2i(o));
    _obj = encode_oop(o);
  }
  ObjArrayChunkedTask(oop o, int chunk, int pow) {
    assert(decode_oop(encode_oop(o)) == o, "oop can be encoded: " PTR_FORMAT, p2i(o));
    assert(decode_chunk(encode_chunk(chunk)) == chunk, "chunk can be encoded: %d", chunk);
    assert(decode_pow(encode_pow(pow)) == pow, "pow can be encoded: %d", pow);
    _obj = encode_oop(o) | encode_chunk(chunk) | encode_pow(pow);
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<uintptr_t&>(_obj = t._obj);
    return *this;
  }

  inline oop decode_oop(uintptr_t val) const {
    return (oop) reinterpret_cast<void*>((val >> oop_shift) & right_n_bits(oop_bits));
  }

  inline int decode_chunk(uintptr_t val) const {
    return (int) ((val >> chunk_shift) & right_n_bits(chunk_bits));
  }

  inline int decode_pow(uintptr_t val) const {
    return (int) ((val >> pow_shift) & right_n_bits(pow_bits));
  }

  inline uintptr_t encode_oop(oop obj) const {
    return ((uintptr_t)(void*) obj) << oop_shift;
  }

  inline uintptr_t encode_chunk(int chunk) const {
    return ((uintptr_t) chunk) << chunk_shift;
  }

  inline uintptr_t encode_pow(int pow) const {
    return ((uintptr_t) pow) << pow_shift;
  }

  inline oop obj()   const { return decode_oop(_obj);   }
  inline int chunk() const { return decode_chunk(_obj); }
  inline int pow()   const { return decode_pow(_obj);   }
  inline bool is_not_chunked() const { return (_obj & ~right_n_bits(oop_bits + pow_bits)) == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  static uintptr_t max_addressable() {
    return nth_bit(oop_bits);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  uintptr_t _obj;
};
#else
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits  = 10,
    pow_bits    = 5,
  };
public:
  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
    assert(0 <= chunk && chunk < nth_bit(chunk_bits), "chunk is sane: %d", chunk);
    assert(0 <= pow && pow < nth_bit(pow_bits), "pow is sane: %d", pow);
    _chunk = chunk;
    _pow = pow;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj()   const { return _obj; }
  inline int chunk() const { return _chunk; }
  inline int pow()   const { return _pow; }

  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  static size_t max_addressable() {
    return sizeof(oop);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  oop _obj;
  int _chunk;
  int _pow;
};
#endif // SHENANDOAH_OPTIMIZED_OBJTASK

#ifdef _MSC_VER
#pragma warning(pop)
#endif

typedef ObjArrayChunkedTask ShenandoahMarkTask;
typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;
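
// Illustrative sketch (hypothetical call site): a marker would push both plain
// and chunked tasks onto the same queue:
//
//   ShenandoahObjToScanQueue* q = ...;
//   q->push(ShenandoahMarkTask(obj));               // regular oop
//   q->push(ShenandoahMarkTask(array, chunk, pow)); // array chunk <chunk, pow>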

template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile jint));
  volatile jint     _claimed_index;
  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, 0);

  debug_only(uint   _reserved;  )

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0; )
  }

  void clear_claimed() { _claimed_index = 0; }
  T*   claim_next();

  // Reserve the first n queues; they are excluded from parallel claiming.
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};

template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  if (_claimed_index >= size) {
    return NULL;
  }

  // Atomic::add returns the updated value; this thread's claimed queue is at index - 1.
  jint index = Atomic::add(&_claimed_index, 1);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return NULL;
  }
}
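
// Illustrative draining pattern (hypothetical call site): workers call
// claim_next() until it returns NULL; reserved queues are skipped because
// _claimed_index starts at the reserved count.
//
//   ShenandoahObjToScanQueue* q;
//   while ((q = queue_set->claim_next()) != NULL) {
//     // ... drain q ...
//   }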

class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}

  bool is_empty();
  void clear();

#if TASKQUEUE_STATS
  static void print_taskqueue_stats_hdr(outputStream* const st);
  void print_taskqueue_stats() const;
  void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};

class ShenandoahTerminatorTerminator : public TerminatorTerminator {
private:
  ShenandoahHeap* _heap;
public:
  ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
  // Returning true terminates the protocol immediately, even if there is work left.
  virtual bool should_exit_termination() { return _heap->cancelled_gc(); }
};

class ShenandoahTaskTerminator : public StackObj {
private:
  OWSTTaskTerminator* const   _terminator;
public:
  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set);
  ~ShenandoahTaskTerminator();

  bool offer_termination(ShenandoahTerminatorTerminator* terminator) {
    return _terminator->offer_termination(terminator);
  }

  void reset_for_reuse() { _terminator->reset_for_reuse(); }
  bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); }
};
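
// Illustrative termination pattern (hypothetical worker loop): each worker
// alternates between draining tasks and offering termination; offer_termination()
// returns true once all workers are out of work, or earlier if the
// ShenandoahTerminatorTerminator requests an exit (e.g. on GC cancellation).
//
//   ShenandoahTerminatorTerminator tt(heap);
//   while (true) {
//     // ... process local and stolen tasks ...
//     if (terminator.offer_termination(&tt)) break;
//   }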

#endif // SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP