1 /* 2 * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifndef SHARE_GC_SHARED_TASKQUEUE_HPP 26 #define SHARE_GC_SHARED_TASKQUEUE_HPP 27 28 #include "memory/allocation.hpp" 29 #include "memory/padded.hpp" 30 #include "oops/oopsHierarchy.hpp" 31 #include "runtime/atomic.hpp" 32 #include "utilities/debug.hpp" 33 #include "utilities/globalDefinitions.hpp" 34 #include "utilities/ostream.hpp" 35 #include "utilities/stack.hpp" 36 37 // Simple TaskQueue stats that are collected by default in debug builds. 38 39 #if !defined(TASKQUEUE_STATS) && defined(ASSERT) 40 #define TASKQUEUE_STATS 1 41 #elif !defined(TASKQUEUE_STATS) 42 #define TASKQUEUE_STATS 0 43 #endif 44 45 #if TASKQUEUE_STATS 46 #define TASKQUEUE_STATS_ONLY(code) code 47 #else 48 #define TASKQUEUE_STATS_ONLY(code) 49 #endif // TASKQUEUE_STATS 50 51 #if TASKQUEUE_STATS 52 class TaskQueueStats { 53 public: 54 enum StatId { 55 push, // number of taskqueue pushes 56 pop, // number of taskqueue pops 57 pop_slow, // subset of taskqueue pops that were done slow-path 58 steal_attempt, // number of taskqueue steal attempts 59 steal, // number of taskqueue steals 60 overflow, // number of overflow pushes 61 overflow_max_len, // max length of overflow stack 62 last_stat_id 63 }; 64 65 public: 66 inline TaskQueueStats() { reset(); } 67 68 inline void record_push() { ++_stats[push]; } 69 inline void record_pop() { ++_stats[pop]; } 70 inline void record_pop_slow() { record_pop(); ++_stats[pop_slow]; } 71 inline void record_steal_attempt() { ++_stats[steal_attempt]; } 72 inline void record_steal() { ++_stats[steal]; } 73 inline void record_overflow(size_t new_length); 74 75 TaskQueueStats & operator +=(const TaskQueueStats & addend); 76 77 inline size_t get(StatId id) const { return _stats[id]; } 78 inline const size_t* get() const { return _stats; } 79 80 inline void reset(); 81 82 // Print the specified line of the header (does not include a line separator). 83 static void print_header(unsigned int line, outputStream* const stream = tty, 84 unsigned int width = 10); 85 // Print the statistics (does not include a line separator). 86 void print(outputStream* const stream = tty, unsigned int width = 10) const; 87 88 DEBUG_ONLY(void verify() const;) 89 90 private: 91 size_t _stats[last_stat_id]; 92 static const char * const _names[last_stat_id]; 93 }; 94 95 void TaskQueueStats::record_overflow(size_t new_len) { 96 ++_stats[overflow]; 97 if (new_len > _stats[overflow_max_len]) _stats[overflow_max_len] = new_len; 98 } 99 100 void TaskQueueStats::reset() { 101 memset(_stats, 0, sizeof(_stats)); 102 } 103 #endif // TASKQUEUE_STATS 104 105 // TaskQueueSuper collects functionality common to all GenericTaskQueue instances. 106 107 template <unsigned int N, MEMFLAGS F> 108 class TaskQueueSuper: public CHeapObj<F> { 109 protected: 110 // Internal type for indexing the queue; also used for the tag. 111 typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t; 112 STATIC_ASSERT(N == idx_t(N)); // Ensure N fits in an idx_t. 113 114 // N must be a power of 2 for computing modulo via masking. 115 // N must be >= 2 for the algorithm to work at all, though larger is better. 116 // C++11: is_power_of_2 is not (yet) constexpr. 117 STATIC_ASSERT((N >= 2) && ((N & (N - 1)) == 0)); 118 static const uint MOD_N_MASK = N - 1; 119 120 class Age { 121 friend class TaskQueueSuper; 122 123 public: 124 explicit Age(size_t data = 0) : _data(data) {} 125 Age(idx_t top, idx_t tag) { _fields._top = top; _fields._tag = tag; } 126 127 idx_t top() const { return _fields._top; } 128 idx_t tag() const { return _fields._tag; } 129 130 bool operator ==(const Age& other) const { return _data == other._data; } 131 132 private: 133 struct fields { 134 idx_t _top; 135 idx_t _tag; 136 }; 137 union { 138 size_t _data; 139 fields _fields; 140 }; 141 STATIC_ASSERT(sizeof(size_t) >= sizeof(fields)); 142 }; 143 144 uint bottom_relaxed() const { 145 return Atomic::load(&_bottom); 146 } 147 148 uint bottom_acquire() const { 149 return Atomic::load_acquire(&_bottom); 150 } 151 152 void set_bottom_relaxed(uint new_bottom) { 153 Atomic::store(&_bottom, new_bottom); 154 } 155 156 void release_set_bottom(uint new_bottom) { 157 Atomic::release_store(&_bottom, new_bottom); 158 } 159 160 Age age_relaxed() const { 161 return Age(Atomic::load(&_age._data)); 162 } 163 164 void set_age_relaxed(Age new_age) { 165 Atomic::store(&_age._data, new_age._data); 166 } 167 168 Age cmpxchg_age(Age old_age, Age new_age) { 169 return Age(Atomic::cmpxchg(&_age._data, old_age._data, new_age._data)); 170 } 171 172 idx_t age_top_relaxed() const { 173 // Atomically accessing a subfield of an "atomic" member. 174 return Atomic::load(&_age._fields._top); 175 } 176 177 // These both operate mod N. 178 static uint increment_index(uint ind) { 179 return (ind + 1) & MOD_N_MASK; 180 } 181 static uint decrement_index(uint ind) { 182 return (ind - 1) & MOD_N_MASK; 183 } 184 185 // Returns a number in the range [0..N). If the result is "N-1", it should be 186 // interpreted as 0. 187 uint dirty_size(uint bot, uint top) const { 188 return (bot - top) & MOD_N_MASK; 189 } 190 191 // Returns the size corresponding to the given "bot" and "top". 192 uint clean_size(uint bot, uint top) const { 193 uint sz = dirty_size(bot, top); 194 // Has the queue "wrapped", so that bottom is less than top? There's a 195 // complicated special case here. A pair of threads could perform pop_local 196 // and pop_global operations concurrently, starting from a state in which 197 // _bottom == _top+1. The pop_local could succeed in decrementing _bottom, 198 // and the pop_global in incrementing _top (in which case the pop_global 199 // will be awarded the contested queue element.) The resulting state must 200 // be interpreted as an empty queue. (We only need to worry about one such 201 // event: only the queue owner performs pop_local's, and several concurrent 202 // threads attempting to perform the pop_global will all perform the same 203 // CAS, and only one can succeed.) Any stealing thread that reads after 204 // either the increment or decrement will see an empty queue, and will not 205 // join the competitors. The "sz == -1" / "sz == N-1" state will not be 206 // modified by concurrent threads, so the owner thread can reset the state 207 // to _bottom == top so subsequent pushes will be performed normally. 208 return (sz == N - 1) ? 0 : sz; 209 } 210 211 // Assert that we're not in the underflow state where bottom has 212 // been decremented past top, so that _bottom+1 mod N == top. See 213 // the discussion in clean_size. 214 215 void assert_not_underflow(uint bot, uint top) const { 216 assert_not_underflow(dirty_size(bot, top)); 217 } 218 219 void assert_not_underflow(uint dirty_size) const { 220 assert(dirty_size != N - 1, "invariant"); 221 } 222 223 private: 224 DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0); 225 226 // Index of the first free element after the last one pushed (mod N). 227 volatile uint _bottom; 228 DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(uint)); 229 230 // top() is the index of the oldest pushed element (mod N), and tag() 231 // is the associated epoch, to distinguish different modifications of 232 // the age. There is no available element if top() == _bottom or 233 // (_bottom - top()) mod N == N-1; the latter indicates underflow 234 // during concurrent pop_local/pop_global. 235 volatile Age _age; 236 DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(Age)); 237 238 NONCOPYABLE(TaskQueueSuper); 239 240 public: 241 TaskQueueSuper() : _bottom(0), _age() {} 242 243 // Assert the queue is empty. 244 // Unreliable if there are concurrent pushes or pops. 245 void assert_empty() const { 246 assert(bottom_relaxed() == age_top_relaxed(), "not empty"); 247 } 248 249 bool is_empty() const { 250 return size() == 0; 251 } 252 253 // Return an estimate of the number of elements in the queue. 254 // Treats pop_local/pop_global race that underflows as empty. 255 uint size() const { 256 return clean_size(bottom_relaxed(), age_top_relaxed()); 257 } 258 259 // Discard the contents of the queue. 260 void set_empty() { 261 set_bottom_relaxed(0); 262 set_age_relaxed(Age()); 263 } 264 265 // Maximum number of elements allowed in the queue. This is two less 266 // than the actual queue size, so that a full queue can be distinguished 267 // from underflow involving pop_local and concurrent pop_global operations 268 // in GenericTaskQueue. 269 uint max_elems() const { return N - 2; } 270 271 TASKQUEUE_STATS_ONLY(TaskQueueStats stats;) 272 }; 273 274 // 275 // GenericTaskQueue implements an ABP, Aurora-Blumofe-Plaxton, double- 276 // ended-queue (deque), intended for use in work stealing. Queue operations 277 // are non-blocking. 278 // 279 // A queue owner thread performs push() and pop_local() operations on one end 280 // of the queue, while other threads may steal work using the pop_global() 281 // method. 282 // 283 // The main difference to the original algorithm is that this 284 // implementation allows wrap-around at the end of its allocated 285 // storage, which is an array. 286 // 287 // The original paper is: 288 // 289 // Arora, N. S., Blumofe, R. D., and Plaxton, C. G. 290 // Thread scheduling for multiprogrammed multiprocessors. 291 // Theory of Computing Systems 34, 2 (2001), 115-144. 292 // 293 // The following paper provides an correctness proof and an 294 // implementation for weakly ordered memory models including (pseudo-) 295 // code containing memory barriers for a Chase-Lev deque. Chase-Lev is 296 // similar to ABP, with the main difference that it allows resizing of the 297 // underlying storage: 298 // 299 // Le, N. M., Pop, A., Cohen A., and Nardell, F. Z. 300 // Correct and efficient work-stealing for weak memory models 301 // Proceedings of the 18th ACM SIGPLAN symposium on Principles and 302 // practice of parallel programming (PPoPP 2013), 69-80 303 // 304 305 template <class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE> 306 class GenericTaskQueue: public TaskQueueSuper<N, F> { 307 protected: 308 typedef typename TaskQueueSuper<N, F>::Age Age; 309 typedef typename TaskQueueSuper<N, F>::idx_t idx_t; 310 311 using TaskQueueSuper<N, F>::MOD_N_MASK; 312 313 using TaskQueueSuper<N, F>::bottom_relaxed; 314 using TaskQueueSuper<N, F>::bottom_acquire; 315 316 using TaskQueueSuper<N, F>::set_bottom_relaxed; 317 using TaskQueueSuper<N, F>::release_set_bottom; 318 319 using TaskQueueSuper<N, F>::age_relaxed; 320 using TaskQueueSuper<N, F>::set_age_relaxed; 321 using TaskQueueSuper<N, F>::cmpxchg_age; 322 using TaskQueueSuper<N, F>::age_top_relaxed; 323 324 using TaskQueueSuper<N, F>::increment_index; 325 using TaskQueueSuper<N, F>::decrement_index; 326 using TaskQueueSuper<N, F>::dirty_size; 327 using TaskQueueSuper<N, F>::clean_size; 328 using TaskQueueSuper<N, F>::assert_not_underflow; 329 330 public: 331 using TaskQueueSuper<N, F>::max_elems; 332 using TaskQueueSuper<N, F>::size; 333 334 #if TASKQUEUE_STATS 335 using TaskQueueSuper<N, F>::stats; 336 #endif 337 338 private: 339 // Slow path for pop_local, dealing with possible conflict with pop_global. 340 bool pop_local_slow(uint localBot, Age oldAge); 341 342 public: 343 typedef E element_type; 344 345 // Initializes the queue to empty. 346 GenericTaskQueue(); 347 348 void initialize(); 349 350 // Push the task "t" on the queue. Returns "false" iff the queue is full. 351 inline bool push(E t); 352 353 // Attempts to claim a task from the "local" end of the queue (the most 354 // recently pushed) as long as the number of entries exceeds the threshold. 355 // If successfully claims a task, returns true and sets t to the task; 356 // otherwise, returns false and t is unspecified. May fail and return 357 // false because of a successful steal by pop_global. 358 inline bool pop_local(E& t, uint threshold = 0); 359 360 // Like pop_local(), but uses the "global" end of the queue (the least 361 // recently pushed). 362 bool pop_global(E& t); 363 364 // Delete any resource associated with the queue. 365 ~GenericTaskQueue(); 366 367 // Apply fn to each element in the task queue. The queue must not 368 // be modified while iterating. 369 template<typename Fn> void iterate(Fn fn); 370 371 private: 372 // Base class has trailing padding. 373 374 // Element array. 375 E* _elems; 376 377 DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(E*)); 378 // Queue owner local variables. Not to be accessed by other threads. 379 380 static const uint InvalidQueueId = uint(-1); 381 uint _last_stolen_queue_id; // The id of the queue we last stole from 382 383 int _seed; // Current random seed used for selecting a random queue during stealing. 384 385 DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(uint) + sizeof(int)); 386 public: 387 int next_random_queue_id(); 388 389 void set_last_stolen_queue_id(uint id) { _last_stolen_queue_id = id; } 390 uint last_stolen_queue_id() const { return _last_stolen_queue_id; } 391 bool is_last_stolen_queue_id_valid() const { return _last_stolen_queue_id != InvalidQueueId; } 392 void invalidate_last_stolen_queue_id() { _last_stolen_queue_id = InvalidQueueId; } 393 }; 394 395 template<class E, MEMFLAGS F, unsigned int N> 396 GenericTaskQueue<E, F, N>::GenericTaskQueue() : _last_stolen_queue_id(InvalidQueueId), _seed(17 /* random number */) { 397 assert(sizeof(Age) == sizeof(size_t), "Depends on this."); 398 } 399 400 // OverflowTaskQueue is a TaskQueue that also includes an overflow stack for 401 // elements that do not fit in the TaskQueue. 402 // 403 // This class hides two methods from super classes: 404 // 405 // push() - push onto the task queue or, if that fails, onto the overflow stack 406 // is_empty() - return true if both the TaskQueue and overflow stack are empty 407 // 408 // Note that size() is not hidden--it returns the number of elements in the 409 // TaskQueue, and does not include the size of the overflow stack. This 410 // simplifies replacement of GenericTaskQueues with OverflowTaskQueues. 411 template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE> 412 class OverflowTaskQueue: public GenericTaskQueue<E, F, N> 413 { 414 public: 415 typedef Stack<E, F> overflow_t; 416 typedef GenericTaskQueue<E, F, N> taskqueue_t; 417 418 TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;) 419 420 // Push task t onto the queue or onto the overflow stack. Return true. 421 inline bool push(E t); 422 // Try to push task t onto the queue only. Returns true if successful, false otherwise. 423 inline bool try_push_to_taskqueue(E t); 424 425 // Attempt to pop from the overflow stack; return true if anything was popped. 426 inline bool pop_overflow(E& t); 427 428 inline overflow_t* overflow_stack() { return &_overflow_stack; } 429 430 inline bool taskqueue_empty() const { return taskqueue_t::is_empty(); } 431 inline bool overflow_empty() const { return _overflow_stack.is_empty(); } 432 inline bool is_empty() const { 433 return taskqueue_empty() && overflow_empty(); 434 } 435 436 private: 437 overflow_t _overflow_stack; 438 }; 439 440 class TaskQueueSetSuper { 441 public: 442 // Assert all queues in the set are empty. 443 NOT_DEBUG(void assert_empty() const {}) 444 DEBUG_ONLY(virtual void assert_empty() const = 0;) 445 446 // Tasks in queue 447 virtual uint tasks() const = 0; 448 }; 449 450 template <MEMFLAGS F> class TaskQueueSetSuperImpl: public CHeapObj<F>, public TaskQueueSetSuper { 451 }; 452 453 template<class T, MEMFLAGS F> 454 class GenericTaskQueueSet: public TaskQueueSetSuperImpl<F> { 455 public: 456 typedef typename T::element_type E; 457 458 private: 459 uint _n; 460 T** _queues; 461 462 bool steal_best_of_2(uint queue_num, E& t); 463 464 public: 465 GenericTaskQueueSet(uint n); 466 ~GenericTaskQueueSet(); 467 468 void register_queue(uint i, T* q); 469 470 T* queue(uint n); 471 472 // Try to steal a task from some other queue than queue_num. It may perform several attempts at doing so. 473 // Returns if stealing succeeds, and sets "t" to the stolen task. 474 bool steal(uint queue_num, E& t); 475 476 DEBUG_ONLY(virtual void assert_empty() const;) 477 478 virtual uint tasks() const; 479 480 uint size() const { return _n; } 481 }; 482 483 template<class T, MEMFLAGS F> void 484 GenericTaskQueueSet<T, F>::register_queue(uint i, T* q) { 485 assert(i < _n, "index out of range."); 486 _queues[i] = q; 487 } 488 489 template<class T, MEMFLAGS F> T* 490 GenericTaskQueueSet<T, F>::queue(uint i) { 491 return _queues[i]; 492 } 493 494 #ifdef ASSERT 495 template<class T, MEMFLAGS F> 496 void GenericTaskQueueSet<T, F>::assert_empty() const { 497 for (uint j = 0; j < _n; j++) { 498 _queues[j]->assert_empty(); 499 } 500 } 501 #endif // ASSERT 502 503 template<class T, MEMFLAGS F> 504 uint GenericTaskQueueSet<T, F>::tasks() const { 505 uint n = 0; 506 for (uint j = 0; j < _n; j++) { 507 n += _queues[j]->size(); 508 } 509 return n; 510 } 511 512 // When to terminate from the termination protocol. 513 class TerminatorTerminator: public CHeapObj<mtInternal> { 514 public: 515 virtual bool should_exit_termination() = 0; 516 }; 517 518 // This is a container class for either an oop* or a narrowOop*. 519 // Both are pushed onto a task queue and the consumer will test is_narrow() 520 // to determine which should be processed. 521 class StarTask { 522 void* _holder; // either union oop* or narrowOop* 523 524 enum { COMPRESSED_OOP_MASK = 1 }; 525 526 public: 527 StarTask(narrowOop* p) { 528 assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!"); 529 _holder = (void *)((uintptr_t)p | COMPRESSED_OOP_MASK); 530 } 531 StarTask(oop* p) { 532 assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!"); 533 _holder = (void*)p; 534 } 535 StarTask() { _holder = NULL; } 536 // Trivially copyable, for use in GenericTaskQueue. 537 538 operator oop*() { return (oop*)_holder; } 539 operator narrowOop*() { 540 return (narrowOop*)((uintptr_t)_holder & ~COMPRESSED_OOP_MASK); 541 } 542 543 bool is_narrow() const { 544 return (((uintptr_t)_holder & COMPRESSED_OOP_MASK) != 0); 545 } 546 }; 547 548 class ObjArrayTask 549 { 550 public: 551 ObjArrayTask(oop o = NULL, int idx = 0): _obj(o), _index(idx) { } 552 ObjArrayTask(oop o, size_t idx): _obj(o), _index(int(idx)) { 553 assert(idx <= size_t(max_jint), "too big"); 554 } 555 // Trivially copyable, for use in GenericTaskQueue. 556 557 inline oop obj() const { return _obj; } 558 inline int index() const { return _index; } 559 560 DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid. 561 562 private: 563 oop _obj; 564 int _index; 565 }; 566 567 #endif // SHARE_GC_SHARED_TASKQUEUE_HPP