/*
 * Copyright (c) 2016, Red Hat, Inc. and/or its affiliates.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_VM_GC_SHENANDOAH_SHENANDOAH_TASKQUEUE_HPP
#define SHARE_VM_GC_SHENANDOAH_SHENANDOAH_TASKQUEUE_HPP

#include "memory/padded.hpp"
#include "utilities/taskqueue.hpp"
#include "runtime/mutex.hpp"

class Thread;

template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  BufferedOverflowTaskQueue() : _buf_empty(true) {}

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t onto:
  //   - the one-element buffer first;
  //   - then the queue;
  //   - then the overflow stack.
  // Always returns true.
  inline bool push(E t);

  // Attempt to pop from the buffer; return true if anything was popped.
  inline bool pop_buffer(E &t);

  inline void clear_buffer()  { _buf_empty = true; }
  inline bool buffer_empty()  const { return _buf_empty; }
  inline bool is_empty()      const {
    return taskqueue_t::is_empty() && buffer_empty();
  }

private:
  bool _buf_empty;
  E _elem;
};

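// A minimal usage sketch (illustrative only, not part of this header's API):
// a single consumer drains the buffer before falling back to the inherited
// queue and overflow storage. SCMTask is the task typedef declared further
// below; process() is a hypothetical placeholder for the caller's work.
//
//   BufferedOverflowTaskQueue<SCMTask, mtGC> q;
//   SCMTask task;
//   while (!q.is_empty()) {
//     if (q.pop_buffer(task)  ||
//         q.pop_local(task)   ||
//         q.pop_overflow(task)) {
//       process(task);
//     }
//   }
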
// ObjArrayChunkedTask
//
// Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
// The design goal is to make the regular oop tasks very fast, because that is the prevailing
// case. On the other hand, it should not block parallel array processing from efficiently dividing
// the array work.
//
// The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
// that the block has the size of 2^pow. This requires only 5 bits for pow (covering sizes up
// to 2^32) to encode all possible arrays.
//
//    |---------oop---------|-pow-|--chunk---|
//    0                    49    54         64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
// This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
// with zero padding. Testing for "chunkedness" is testing for zero with the chunk mask.
//
// b) Splitting tasks for divide-and-conquer is possible. Suppose we have a chunk <C, P> that covers
// the interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
//      <2*C - 1, P-1>, covering the interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
//      <2*C, P-1>,     covering the interval [ (2*C - 1)*2^(P-1); 2*C*2^(P-1) )
//
// Observe that the union of these two intervals is:
//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
// ...which is the original interval:
//      [ (C-1)*2^P; C*2^P )
//
// c) The divide-and-conquer strategy can even start with the chunk <1, round-log2-len(arr)>, and split
// down in the parallel threads, which alleviates the upfront (serial) splitting costs.
//
// Encoding limits implied by the current bit layout:
//    10 bits for chunk: max 1024 blocks per array
//     5 bits for power: arrays of up to 2^32 elements
//    49 bits for oop:   max 512 TB of addressable space
//
// Stealing bits from the oop trims down the addressable space. Stealing too few bits for the chunk ID
// limits potential parallelism. Stealing too few bits for pow limits the maximum array size that can
// be handled. In the future, these might be rebalanced to favor one degree of freedom over another.
// For example, if/when Arrays 2.0 brings 2^64-sized arrays, we might need to steal another bit for
// pow. We could regain some bits if chunks are counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields, for when we don't have enough space to
// steal the bits from the native pointer. It is also useful for debugging the _LP64 version.
//
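// For concreteness, a worked instance of the split in (b): chunk <4, 10>
// covers [ 3*2^10; 4*2^10 ) = [ 3072; 4096 ). Splitting yields <7, 9>,
// covering [ 6*2^9; 7*2^9 ) = [ 3072; 3584 ), and <8, 9>, covering
// [ 7*2^9; 8*2^9 ) = [ 3584; 4096 ). Their union is exactly the original
// interval, so no array elements are lost or visited twice.
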
#ifdef _LP64
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits  = 10,
    pow_bits    = 5,
    oop_bits    = sizeof(uintptr_t)*8 - chunk_bits - pow_bits,
  };
  enum {
    chunk_size  = nth_bit(chunk_bits),
    pow_size    = nth_bit(pow_bits),
    oop_size    = nth_bit(oop_bits),
  };
  enum {
    oop_shift   = 0,
    pow_shift   = oop_shift + oop_bits,
    chunk_shift = pow_shift + pow_bits,
  };
  enum {
    oop_mask    = right_n_bits(oop_bits),
    pow_mask    = right_n_bits(pow_bits),
    chunk_mask  = right_n_bits(chunk_bits),
    chunk_mask_unshift = ~right_n_bits(oop_bits + pow_bits),
  };

public:
  ObjArrayChunkedTask(oop o = NULL) {
    _obj = ((uintptr_t)(void*) o) << oop_shift;
  }
  ObjArrayChunkedTask(oop o, int chunk, int mult) {
    assert(0 <= chunk && chunk < chunk_size, err_msg("chunk is sane: %d", chunk));
    assert(0 <= mult && mult < pow_size, err_msg("pow is sane: %d", mult));
    uintptr_t t_b = ((uintptr_t) chunk) << chunk_shift;
    uintptr_t t_m = ((uintptr_t) mult) << pow_shift;
    uintptr_t obj = (uintptr_t)(void*)o;
    assert(obj < oop_size, err_msg("obj ref is sane: " PTR_FORMAT, obj));
    uintptr_t t_o = obj << oop_shift;
    _obj = t_o | t_m | t_b;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<uintptr_t&>(_obj = t._obj);
    return *this;
  }

  inline oop obj()   const { return (oop) reinterpret_cast<void*>((_obj >> oop_shift) & oop_mask); }
  inline int chunk() const { return (int) ((_obj >> chunk_shift) & chunk_mask); }
  inline int pow()   const { return (int) ((_obj >> pow_shift) & pow_mask); }
  inline bool is_not_chunked() const { return (_obj & chunk_mask_unshift) == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

private:
  uintptr_t _obj;
};
#else
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits = 10,
    pow_bits   = 5,
  };
  enum {
    chunk_size = nth_bit(chunk_bits),
    pow_size   = nth_bit(pow_bits),
  };
public:
  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
    assert(0 <= chunk && chunk < chunk_size, err_msg("chunk is sane: %d", chunk));
    assert(0 <= pow && pow < pow_size, err_msg("pow is sane: %d", pow));
    _chunk = chunk;
    _pow = pow;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj()   const { return _obj; }
  inline int chunk() const { return _chunk; }
  inline int pow()   const { return _pow; }

  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

private:
  oop _obj;
  int _chunk;
  int _pow;
};
#endif

typedef ObjArrayChunkedTask SCMTask;
typedef BufferedOverflowTaskQueue<SCMTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> SCMObjToScanQueue;

template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  volatile jint _claimed_index;
  debug_only(uint _reserved; )

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0; )
  }

  void clear_claimed() { _claimed_index = 0; }
  T* claim_next();

  // Reserve queues that are not subject to parallel claiming.
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};


template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  if (_claimed_index >= size) {
    return NULL;
  }

  jint index = Atomic::add(1, &_claimed_index);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return NULL;
  }
}

class SCMObjToScanQueueSet: public ParallelClaimableQueueSet<SCMObjToScanQueue, mtGC> {

public:
  SCMObjToScanQueueSet(int n) : ParallelClaimableQueueSet<SCMObjToScanQueue, mtGC>(n) {
  }

  bool is_empty();

  void clear();
};

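// A minimal claiming sketch (illustrative only; scan_queue() stands in for
// whatever per-queue work the caller performs). Each worker loops on
// claim_next() until the set is exhausted; the atomic increment hands every
// non-reserved queue to exactly one claimer:
//
//   SCMObjToScanQueueSet* queues = ...; // shared by all workers
//   queues->clear_claimed();            // once, before the workers start
//   ...
//   // in each worker:
//   while (SCMObjToScanQueue* q = queues->claim_next()) {
//     scan_queue(q);
//   }
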
/*
 * This is an enhanced implementation of Google's work stealing
 * protocol, which is described in the paper:
 * Understanding and improving JVM GC work stealing at the data center scale
 * (http://dl.acm.org/citation.cfm?id=2926706)
 *
 * Instead of using a dedicated spin-master, our implementation lets the spin-master
 * relinquish the role before it goes to sleep/wait, allowing a newly arrived thread
 * to compete for the role. The intent of this enhancement is to reduce the
 * spin-master's latency in detecting new tasks for stealing and in detecting the
 * termination condition.
 */

class ShenandoahTaskTerminator: public ParallelTaskTerminator {
private:
  Monitor* _blocker;
  Thread*  _spin_master;


public:
  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set) :
    ParallelTaskTerminator(n_threads, queue_set), _spin_master(NULL) {
    _blocker = new Monitor(Mutex::leaf, "ShenandoahTaskTerminator", false);
  }

  bool offer_termination(TerminatorTerminator* terminator);

private:
  size_t tasks_in_queue_set() { return _queue_set->tasks(); }


  /*
   * Perform the spin-master task.
   * Returns true if the termination condition is detected,
   * otherwise returns false.
   */
  bool do_spin_master_work(TerminatorTerminator* terminator);
};

class ShenandoahCancelledTerminatorTerminator : public TerminatorTerminator {
  virtual bool should_exit_termination() {
    return false;
  }
  virtual bool should_force_termination() {
    return true;
  }
};

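// A minimal termination sketch (illustrative only): once a worker's own queue
// runs dry, it alternates between stealing and offering termination, and exits
// when every peer has offered as well. steal_work() and process() are
// hypothetical placeholders; passing NULL means no external terminator can
// cancel or force the decision.
//
//   ShenandoahTaskTerminator terminator(n_workers, queues);
//   ...
//   // in each worker, after its own queue is drained:
//   SCMTask task;
//   do {
//     while (steal_work(task)) {
//       process(task);
//     }
//   } while (!terminator.offer_termination(NULL));

#endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAH_TASKQUEUE_HPP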