/*
 * Copyright (c) 2016, 2018, Red Hat, Inc. All rights reserved.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#define SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP

#include "memory/padded.hpp"
#include "runtime/mutex.hpp"
#include "utilities/taskqueue.hpp"

class Thread;

template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  BufferedOverflowTaskQueue() : _buf_empty(true) {}

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t into the queue. Returns true on success.
  inline bool push(E t);

  // Attempt to pop a task from the queue. Returns true on success.
  inline bool pop(E &t);

  inline void clear() {
    _buf_empty = true;
    taskqueue_t::set_empty();
    taskqueue_t::overflow_stack()->clear();
  }

  inline bool is_empty() const {
    return _buf_empty && taskqueue_t::is_empty();
  }

private:
  bool _buf_empty;
  E _elem;
};
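
// A minimal usage sketch (illustrative only, not part of this header). It shows the
// intended push/pop discipline: push() buffers one element in front of the underlying
// OverflowTaskQueue, and pop() drains that buffer first. ShenandoahMarkTask is the
// element type actually used below; the drain loop shape here is an assumption.
//
//   BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> q;
//   q.initialize();                  // inherited from GenericTaskQueue
//   q.push(ShenandoahMarkTask(obj)); // given some oop obj
//
//   ShenandoahMarkTask task;
//   while (q.pop(task)) {
//     // ... process task.obj(), possibly pushing more work ...
//   }
//   assert(q.is_empty(), "fully drained");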

// ObjArrayChunkedTask
//
// Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
// The design goal is to make the regular oop ops very fast, because that would be the prevailing
// case. On the other hand, it should not block parallel array processing from efficiently dividing
// the array work.
//
// The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
// that the block has the size of 2^pow. This requires pow to have only 5 bits to encode
// all possible arrays (up to 2^32 elements).
//
//    |---------oop---------|-pow-|--chunk---|
//    0                    49     54        64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
// This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
// with zero padding. Testing for "chunkedness" amounts to masking with the chunk bits and comparing
// against zero.
//
// b) Splitting tasks for divide-and-conquer is possible (a worked example follows this comment).
// Suppose we have chunk <C, P> that covers the interval [ (C-1)*2^P; C*2^P ). We can then split it
// into two chunks:
//      <2*C - 1, P-1>, covering the interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
//      <2*C,     P-1>, covering the interval [ (2*C - 1)*2^(P-1); 2*C*2^(P-1) )
//
// Observe that the union of these two intervals is:
//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
// ...which is the original interval:
//      [ (C-1)*2^P; C*2^P )
//
// c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split
// down in the parallel threads, which alleviates the upfront (serial) splitting costs.
//
// Encoding limitations caused by the current bit scales mean:
//    10 bits for chunk: max 1024 blocks per array
//     5 bits for power: max 2^32 array
//    49 bits for oop:   max 512 TB of addressable space
//
// Stealing bits from the oop trims down the addressable space. Stealing too few bits for chunk ID
// limits potential parallelism. Stealing too few bits for pow limits the maximum array size that
// can be handled. In the future, these might be rebalanced to favor one degree of freedom over
// another. For example, if/when Arrays 2.0 brings 2^64-sized arrays, we might need to steal another
// bit for power. We could regain some bits if chunks are counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields, for when we don't have enough space to
// steal the bits from the native pointer. It is useful for debugging the _LP64 version.
//
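// A worked instance of the splitting rule in (b), with illustrative numbers: chunk <3, 6>
// covers the interval [ 2*2^6; 3*2^6 ) = [ 128; 192 ). Splitting it yields:
//      <5, 5>, covering [ 4*2^5; 5*2^5 ) = [ 128; 160 )
//      <6, 5>, covering [ 5*2^5; 6*2^5 ) = [ 160; 192 )
// The union of the two halves is exactly the original [ 128; 192 ), so no array elements
// are dropped or visited twice by the parallel workers.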

#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning(disable:4522)
#endif

#ifdef _LP64
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits  = 10,
    pow_bits    = 5,
    oop_bits    = sizeof(uintptr_t)*8 - chunk_bits - pow_bits,
  };
  enum {
    oop_shift   = 0,
    pow_shift   = oop_shift + oop_bits,
    chunk_shift = pow_shift + pow_bits,
  };

public:
  ObjArrayChunkedTask(oop o = NULL) {
    _obj = ((uintptr_t)(void*) o) << oop_shift;
  }
  ObjArrayChunkedTask(oop o, int chunk, int mult) {
    assert(0 <= chunk && chunk < nth_bit(chunk_bits), err_msg("chunk is sane: %d", chunk));
    assert(0 <= mult && mult < nth_bit(pow_bits), err_msg("pow is sane: %d", mult));
    uintptr_t t_b = ((uintptr_t) chunk) << chunk_shift;
    uintptr_t t_m = ((uintptr_t) mult) << pow_shift;
    uintptr_t obj = (uintptr_t)(void*)o;
    assert(obj < nth_bit(oop_bits), err_msg("obj ref is sane: " PTR_FORMAT, obj));
    uintptr_t t_o = obj << oop_shift;
    _obj = t_o | t_m | t_b;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<uintptr_t&>(_obj = t._obj);
    return *this;
  }

  inline oop obj()   const { return (oop) reinterpret_cast<void*>((_obj >> oop_shift) & right_n_bits(oop_bits)); }
  inline int chunk() const { return (int) ((_obj >> chunk_shift) & right_n_bits(chunk_bits)); }
  inline int pow()   const { return (int) ((_obj >> pow_shift) & right_n_bits(pow_bits)); }
  inline bool is_not_chunked() const { return (_obj & ~right_n_bits(oop_bits + pow_bits)) == 0; }

  DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.

  static size_t max_addressable() {
    return nth_bit(oop_bits);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  uintptr_t _obj;
};
#else
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits = 10,
    pow_bits   = 5,
  };
public:
  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
    assert(0 <= chunk && chunk < nth_bit(chunk_bits), err_msg("chunk is sane: %d", chunk));
    assert(0 <= pow && pow < nth_bit(pow_bits), err_msg("pow is sane: %d", pow));
    _chunk = chunk;
    _pow = pow;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj()   const { return _obj; }
  inline int chunk() const { return _chunk; }
  inline int pow()   const { return _pow; }

  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.

  static size_t max_addressable() {
    return sizeof(oop);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  oop _obj;
  int _chunk;
  int _pow;
};
#endif // _LP64

#ifdef _MSC_VER
#pragma warning(pop)
#endif

typedef ObjArrayChunkedTask ShenandoahMarkTask;
typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;

template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  char          _pad0[DEFAULT_CACHE_LINE_SIZE];
  volatile jint _claimed_index;
  char          _pad1[DEFAULT_CACHE_LINE_SIZE];

  debug_only(uint _reserved;)

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0;)
  }

  void clear_claimed() { _claimed_index = 0; }
  T* claim_next();

  // Reserve the first n queues; they are excluded from parallel claiming.
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};

template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  if (_claimed_index >= size) {
    return NULL;
  }

  jint index = Atomic::add(1, &_claimed_index);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return NULL;
  }
}
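
// A minimal sketch of a worker draining claimable queues (illustrative only; this
// drain loop is an assumption, not defined in this file). claim_next() hands out each
// queue to exactly one claimer per clear_claimed() cycle, so workers can split the
// queue set without further synchronization:
//
//   template <class T, MEMFLAGS F>
//   void drain_claimed(ParallelClaimableQueueSet<T, F>* set) {
//     T* q;
//     while ((q = set->claim_next()) != NULL) {
//       // ... process every task in *q ...
//     }
//   }
//
// Note that reserve(n) sets _claimed_index to n, so queues 0..n-1 are skipped by
// claim_next() and stay available for dedicated (non-claiming) users.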

class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}

  bool is_empty();
  void clear();

#if TASKQUEUE_STATS
  static void print_taskqueue_stats_hdr(outputStream* const st);
  void print_taskqueue_stats();
  void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};

class ShenandoahTerminatorTerminator : public TerminatorTerminator {
public:
  // Returning true terminates immediately, even if there is remaining work left.
  virtual bool should_force_termination() { return false; }
};

/*
 * This is an enhanced implementation of Google's work-stealing
 * protocol, which is described in the paper:
 * Understanding and improving JVM GC work stealing at the data center scale
 * (http://dl.acm.org/citation.cfm?id=2926706)
 *
 * Instead of using a dedicated spin-master, our implementation lets the spin-master
 * relinquish the role before it goes to sleep/wait, which allows newly arrived threads
 * to compete for the role. The intention of this enhancement is to reduce the
 * spin-master's latency in detecting new tasks for stealing and the termination condition.
 */

class ShenandoahTaskTerminator: public ParallelTaskTerminator {
private:
  Monitor* _blocker;
  Thread*  _spin_master;

public:
  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set) :
    ParallelTaskTerminator(n_threads, queue_set), _spin_master(NULL) {
    _blocker = new Monitor(Mutex::leaf, "ShenandoahTaskTerminator", false);
  }

  ~ShenandoahTaskTerminator() {
    assert(_blocker != NULL, "Cannot be NULL");
    delete _blocker;
  }

  bool offer_termination(ShenandoahTerminatorTerminator* terminator);
  bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); }

private:
  // Hide the base class version: callers must use the ShenandoahTerminatorTerminator overload.
  bool offer_termination(TerminatorTerminator* terminator) {
    ShouldNotReachHere();
    return false;
  }

  size_t tasks_in_queue_set() { return _queue_set->tasks(); }

  /*
   * Perform the spin-master task.
   * Returns true if the termination condition is detected, otherwise false.
   */
  bool do_spin_master_work(ShenandoahTerminatorTerminator* terminator);
};

class ShenandoahCancelledTerminatorTerminator : public ShenandoahTerminatorTerminator {
  virtual bool should_exit_termination() {
    return false;
  }
  virtual bool should_force_termination() {
    return true;
  }
};

#endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP