1 /* 2 * Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifndef SHARE_GC_SHARED_TASKQUEUE_HPP 26 #define SHARE_GC_SHARED_TASKQUEUE_HPP 27 28 #include "memory/allocation.hpp" 29 #include "memory/padded.hpp" 30 #include "oops/oopsHierarchy.hpp" 31 #include "runtime/atomic.hpp" 32 #include "utilities/debug.hpp" 33 #include "utilities/globalDefinitions.hpp" 34 #include "utilities/ostream.hpp" 35 #include "utilities/stack.hpp" 36 37 // Simple TaskQueue stats that are collected by default in debug builds. 
#if !defined(TASKQUEUE_STATS) && defined(ASSERT)
#define TASKQUEUE_STATS 1
#elif !defined(TASKQUEUE_STATS)
#define TASKQUEUE_STATS 0
#endif

#if TASKQUEUE_STATS
#define TASKQUEUE_STATS_ONLY(code) code
#else
#define TASKQUEUE_STATS_ONLY(code)
#endif // TASKQUEUE_STATS

#if TASKQUEUE_STATS
// Per-queue operation counters (pushes, pops, steals, overflow activity).
// Not thread-safe; each queue owner updates its own instance.
class TaskQueueStats {
public:
  enum StatId {
    push,             // number of taskqueue pushes
    pop,              // number of taskqueue pops
    pop_slow,         // subset of taskqueue pops that were done slow-path
    steal_attempt,    // number of taskqueue steal attempts
    steal,            // number of taskqueue steals
    overflow,         // number of overflow pushes
    overflow_max_len, // max length of overflow stack
    last_stat_id
  };

public:
  inline TaskQueueStats() { reset(); }

  inline void record_push()          { ++_stats[push]; }
  inline void record_pop()           { ++_stats[pop]; }
  inline void record_pop_slow()      { record_pop(); ++_stats[pop_slow]; }
  inline void record_steal_attempt() { ++_stats[steal_attempt]; }
  inline void record_steal()         { ++_stats[steal]; }
  inline void record_overflow(size_t new_length);

  TaskQueueStats & operator +=(const TaskQueueStats & addend);

  inline size_t get(StatId id) const { return _stats[id]; }
  inline const size_t* get() const   { return _stats; }

  inline void reset();

  // Print the specified line of the header (does not include a line separator).
  static void print_header(unsigned int line, outputStream* const stream = tty,
                           unsigned int width = 10);
  // Print the statistics (does not include a line separator).
  void print(outputStream* const stream = tty, unsigned int width = 10) const;

  DEBUG_ONLY(void verify() const;)

private:
  size_t                    _stats[last_stat_id];
  static const char * const _names[last_stat_id];
};

// Counts an overflow push and tracks the high-water mark of the overflow stack.
void TaskQueueStats::record_overflow(size_t new_len) {
  ++_stats[overflow];
  if (new_len > _stats[overflow_max_len]) _stats[overflow_max_len] = new_len;
}

void TaskQueueStats::reset() {
  memset(_stats, 0, sizeof(_stats));
}
#endif // TASKQUEUE_STATS

// TaskQueueSuper collects functionality common to all GenericTaskQueue instances.

template <unsigned int N, MEMFLAGS F>
class TaskQueueSuper: public CHeapObj<F> {
protected:
  // Internal type for indexing the queue; also used for the tag.
  typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t;
  STATIC_ASSERT(N == idx_t(N)); // Ensure N fits in an idx_t.

  // N must be a power of 2 for computing modulo via masking.
  // N must be >= 2 for the algorithm to work at all, though larger is better.
  // C++11: is_power_of_2 is not (yet) constexpr.
  STATIC_ASSERT((N >= 2) && ((N & (N - 1)) == 0));
  static const uint MOD_N_MASK = N - 1;

  // Age packs the top index and its tag (an ABA-avoidance epoch) into a
  // single word (_data) so the pair can be loaded, stored and CAS'd
  // atomically as one unit.
  class Age {
    friend class TaskQueueSuper;

  public:
    explicit Age(size_t data = 0) : _data(data) {}
    Age(idx_t top, idx_t tag) { _fields._top = top; _fields._tag = tag; }

    idx_t top() const { return _fields._top; }
    idx_t tag() const { return _fields._tag; }

    bool operator ==(const Age& other) const { return _data == other._data; }

  private:
    struct fields {
      idx_t _top;
      idx_t _tag;
    };
    union {
      size_t _data;
      fields _fields;
    };
    // Both idx_t fields must fit in the single word used for atomic access.
    STATIC_ASSERT(sizeof(size_t) >= sizeof(fields));
  };

  uint bottom_relaxed() const {
    return Atomic::load(&_bottom);
  }

  uint bottom_acquire() const {
    return Atomic::load_acquire(&_bottom);
  }

  void set_bottom_relaxed(uint new_bottom) {
    Atomic::store(&_bottom, new_bottom);
  }

  void release_set_bottom(uint new_bottom) {
    Atomic::release_store(&_bottom, new_bottom);
  }

  Age age_relaxed() const {
    return Age(Atomic::load(&_age._data));
  }

  void set_age_relaxed(Age new_age) {
    Atomic::store(&_age._data, new_age._data);
  }

  Age cmpxchg_age(Age old_age, Age new_age) {
    return Age(Atomic::cmpxchg(&_age._data, old_age._data, new_age._data));
  }

  idx_t age_top_relaxed() const {
    // Atomically accessing a subfield of an "atomic" member.
    return Atomic::load(&_age._fields._top);
  }

  // These both operate mod N.
  static uint increment_index(uint ind) {
    return (ind + 1) & MOD_N_MASK;
  }
  static uint decrement_index(uint ind) {
    return (ind - 1) & MOD_N_MASK;
  }

  // Returns a number in the range [0..N).  If the result is "N-1", it should be
  // interpreted as 0.
  uint dirty_size(uint bot, uint top) const {
    return (bot - top) & MOD_N_MASK;
  }

  // Returns the size corresponding to the given "bot" and "top".
  uint clean_size(uint bot, uint top) const {
    uint sz = dirty_size(bot, top);
    // Has the queue "wrapped", so that bottom is less than top?  There's a
    // complicated special case here.  A pair of threads could perform pop_local
    // and pop_global operations concurrently, starting from a state in which
    // _bottom == _top+1.  The pop_local could succeed in decrementing _bottom,
    // and the pop_global in incrementing _top (in which case the pop_global
    // will be awarded the contested queue element.)  The resulting state must
    // be interpreted as an empty queue.  (We only need to worry about one such
    // event: only the queue owner performs pop_local's, and several concurrent
    // threads attempting to perform the pop_global will all perform the same
    // CAS, and only one can succeed.)  Any stealing thread that reads after
    // either the increment or decrement will see an empty queue, and will not
    // join the competitors.  The "sz == -1 || sz == N-1" state will not be
    // modified by concurrent queues, so the owner thread can reset the state to
    // _bottom == top so subsequent pushes will be performed normally.
    return (sz == N - 1) ? 0 : sz;
  }

private:
  // Padding before/between/after _bottom and _age keeps the owner-written
  // index and the stealer-CAS'd age on separate cache lines.
  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);

  // Index of the first free element after the last one pushed (mod N).
  volatile uint _bottom;
  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(uint));

  // top() is the index of the oldest pushed element (mod N), and tag()
  // is the associated epoch, to distinguish different modifications of
  // the age.  There is no available element if top() == _bottom or
  // (_bottom - top()) mod N == N-1; the latter indicates underflow
  // during concurrent pop_local/pop_global.
  volatile Age _age;
  DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(Age));

  NONCOPYABLE(TaskQueueSuper);

public:
  TaskQueueSuper() : _bottom(0), _age() {}

  // Assert the queue is empty.
  // Unreliable if there are concurrent pushes or pops.
  void assert_empty() const {
    assert(bottom_relaxed() == age_top_relaxed(), "not empty");
  }

  bool is_empty() const {
    return size() == 0;
  }

  // Return an estimate of the number of elements in the queue.
  // Treats pop_local/pop_global race that underflows as empty.
  uint size() const {
    return clean_size(bottom_relaxed(), age_top_relaxed());
  }

  // Discard the contents of the queue.
  void set_empty() {
    set_bottom_relaxed(0);
    set_age_relaxed(Age());
  }

  // Maximum number of elements allowed in the queue.  This is two less
  // than the actual queue size, so that a full queue can be distinguished
  // from underflow involving pop_local and concurrent pop_global operations
  // in GenericTaskQueue.
  uint max_elems() const { return N - 2; }

  TASKQUEUE_STATS_ONLY(TaskQueueStats stats;)
};

//
// GenericTaskQueue implements an ABP, Arora-Blumofe-Plaxton, double-
// ended-queue (deque), intended for use in work stealing. Queue operations
// are non-blocking.
//
// A queue owner thread performs push() and pop_local() operations on one end
// of the queue, while other threads may steal work using the pop_global()
// method.
//
// The main difference to the original algorithm is that this
// implementation allows wrap-around at the end of its allocated
// storage, which is an array.
//
// The original paper is:
//
// Arora, N. S., Blumofe, R. D., and Plaxton, C. G.
// Thread scheduling for multiprogrammed multiprocessors.
// Theory of Computing Systems 34, 2 (2001), 115-144.
//
// The following paper provides a correctness proof and an
// implementation for weakly ordered memory models including (pseudo-)
// code containing memory barriers for a Chase-Lev deque. Chase-Lev is
// similar to ABP, with the main difference that it allows resizing of the
// underlying storage:
//
// Le, N. M., Pop, A., Cohen A., and Nardelli, F. Z.
// Correct and efficient work-stealing for weak memory models
// Proceedings of the 18th ACM SIGPLAN symposium on Principles and
// practice of parallel programming (PPoPP 2013), 69-80
//

template <class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class GenericTaskQueue: public TaskQueueSuper<N, F> {
protected:
  typedef typename TaskQueueSuper<N, F>::Age Age;
  typedef typename TaskQueueSuper<N, F>::idx_t idx_t;

  using TaskQueueSuper<N, F>::MOD_N_MASK;

  using TaskQueueSuper<N, F>::bottom_relaxed;
  using TaskQueueSuper<N, F>::bottom_acquire;

  using TaskQueueSuper<N, F>::set_bottom_relaxed;
  using TaskQueueSuper<N, F>::release_set_bottom;

  using TaskQueueSuper<N, F>::age_relaxed;
  using TaskQueueSuper<N, F>::set_age_relaxed;
  using TaskQueueSuper<N, F>::cmpxchg_age;
  using TaskQueueSuper<N, F>::age_top_relaxed;

  using TaskQueueSuper<N, F>::increment_index;
  using TaskQueueSuper<N, F>::decrement_index;
  using TaskQueueSuper<N, F>::dirty_size;
  using TaskQueueSuper<N, F>::clean_size;

public:
  using TaskQueueSuper<N, F>::max_elems;
  using TaskQueueSuper<N, F>::size;

#if TASKQUEUE_STATS
  using TaskQueueSuper<N, F>::stats;
#endif

private:
  // Slow path for pop_local, dealing with possible conflict with pop_global.
  bool pop_local_slow(uint localBot, Age oldAge);

public:
  typedef E element_type;

  // Initializes the queue to empty.
  GenericTaskQueue();

  void initialize();

  // Push the task "t" on the queue.  Returns "false" iff the queue is full.
  inline bool push(E t);

  // Attempts to claim a task from the "local" end of the queue (the most
  // recently pushed) as long as the number of entries exceeds the threshold.
  // If successfully claims a task, returns true and sets t to the task;
  // otherwise, returns false and t is unspecified.  May fail and return
  // false because of a successful steal by pop_global.
  inline bool pop_local(E& t, uint threshold = 0);

  // Like pop_local(), but uses the "global" end of the queue (the least
  // recently pushed).
  bool pop_global(E& t);

  // Delete any resource associated with the queue.
  ~GenericTaskQueue();

  // Apply fn to each element in the task queue.  The queue must not
  // be modified while iterating.
  template<typename Fn> void iterate(Fn fn);

private:
  // Base class has trailing padding.

  // Element array.
  E* _elems;

  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(E*));
  // Queue owner local variables. Not to be accessed by other threads.

  static const uint InvalidQueueId = uint(-1);
  uint _last_stolen_queue_id; // The id of the queue we last stole from

  int _seed; // Current random seed used for selecting a random queue during stealing.

  DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(uint) + sizeof(int));
public:
  int next_random_queue_id();

  void set_last_stolen_queue_id(uint id)     { _last_stolen_queue_id = id; }
  uint last_stolen_queue_id() const          { return _last_stolen_queue_id; }
  bool is_last_stolen_queue_id_valid() const { return _last_stolen_queue_id != InvalidQueueId; }
  void invalidate_last_stolen_queue_id()     { _last_stolen_queue_id = InvalidQueueId; }
};

template<class E, MEMFLAGS F, unsigned int N>
GenericTaskQueue<E, F, N>::GenericTaskQueue() : _last_stolen_queue_id(InvalidQueueId), _seed(17 /* random number */) {
  assert(sizeof(Age) == sizeof(size_t), "Depends on this.");
}

// OverflowTaskQueue is a TaskQueue that also includes an overflow stack for
// elements that do not fit in the TaskQueue.
//
// This class hides two methods from super classes:
//
// push() - push onto the task queue or, if that fails, onto the overflow stack
// is_empty() - return true if both the TaskQueue and overflow stack are empty
//
// Note that size() is not hidden--it returns the number of elements in the
// TaskQueue, and does not include the size of the overflow stack.  This
// simplifies replacement of GenericTaskQueues with OverflowTaskQueues.
template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class OverflowTaskQueue: public GenericTaskQueue<E, F, N>
{
public:
  typedef Stack<E, F>               overflow_t;
  typedef GenericTaskQueue<E, F, N> taskqueue_t;

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t onto the queue or onto the overflow stack.  Return true.
  inline bool push(E t);
  // Try to push task t onto the queue only. Returns true if successful, false otherwise.
  inline bool try_push_to_taskqueue(E t);

  // Attempt to pop from the overflow stack; return true if anything was popped.
  inline bool pop_overflow(E& t);

  inline overflow_t* overflow_stack() { return &_overflow_stack; }

  inline bool taskqueue_empty() const { return taskqueue_t::is_empty(); }
  inline bool overflow_empty()  const { return _overflow_stack.is_empty(); }
  inline bool is_empty()        const {
    return taskqueue_empty() && overflow_empty();
  }

private:
  overflow_t _overflow_stack;
};

// Base interface for a set of task queues, independent of memory type.
class TaskQueueSetSuper {
public:
  // Assert all queues in the set are empty.
  NOT_DEBUG(void assert_empty() const {})
  DEBUG_ONLY(virtual void assert_empty() const = 0;)

  // Tasks in queue
  virtual uint tasks() const = 0;
};

// Adds CHeapObj allocation (parameterized by memory type F) to TaskQueueSetSuper.
template <MEMFLAGS F> class TaskQueueSetSuperImpl: public CHeapObj<F>, public TaskQueueSetSuper {
};

// A set of task queues of type T, supporting work stealing between the
// queues in the set.
template<class T, MEMFLAGS F>
class GenericTaskQueueSet: public TaskQueueSetSuperImpl<F> {
public:
  typedef typename T::element_type E;

private:
  uint _n;
  T** _queues;

  bool steal_best_of_2(uint queue_num, E& t);

public:
  GenericTaskQueueSet(uint n);
  ~GenericTaskQueueSet();

  void register_queue(uint i, T* q);

  T* queue(uint n);

  // Try to steal a task from some other queue than queue_num. It may perform several attempts at doing so.
  // Returns true if stealing succeeds, and sets "t" to the stolen task.
  bool steal(uint queue_num, E& t);

  DEBUG_ONLY(virtual void assert_empty() const;)

  virtual uint tasks() const;

  uint size() const { return _n; }
};

template<class T, MEMFLAGS F> void
GenericTaskQueueSet<T, F>::register_queue(uint i, T* q) {
  assert(i < _n, "index out of range.");
  _queues[i] = q;
}

template<class T, MEMFLAGS F> T*
GenericTaskQueueSet<T, F>::queue(uint i) {
  return _queues[i];
}

#ifdef ASSERT
template<class T, MEMFLAGS F>
void GenericTaskQueueSet<T, F>::assert_empty() const {
  for (uint j = 0; j < _n; j++) {
    _queues[j]->assert_empty();
  }
}
#endif // ASSERT

// Returns the sum of the (estimated) sizes of all queues in the set.
template<class T, MEMFLAGS F>
uint GenericTaskQueueSet<T, F>::tasks() const {
  uint n = 0;
  for (uint j = 0; j < _n; j++) {
    n += _queues[j]->size();
  }
  return n;
}

// When to terminate from the termination protocol.
class TerminatorTerminator: public CHeapObj<mtInternal> {
public:
  virtual bool should_exit_termination() = 0;
};

// This is a container class for either an oop* or a narrowOop*.
// Both are pushed onto a task queue and the consumer will test is_narrow()
// to determine which should be processed.
class StarTask {
  // Either an oop* or a narrowOop*; the low COMPRESSED_OOP_MASK bit tags
  // which one (set => narrowOop*).
  void*  _holder;        // either union oop* or narrowOop*

  enum { COMPRESSED_OOP_MASK = 1 };

public:
  StarTask(narrowOop* p) {
    assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!");
    _holder = (void *)((uintptr_t)p | COMPRESSED_OOP_MASK);
  }
  StarTask(oop* p) {
    assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!");
    _holder = (void*)p;
  }
  StarTask() { _holder = NULL; }
  // Trivially copyable, for use in GenericTaskQueue.

  operator oop*() { return (oop*)_holder; }
  operator narrowOop*() {
    return (narrowOop*)((uintptr_t)_holder & ~COMPRESSED_OOP_MASK);
  }

  bool is_narrow() const {
    return (((uintptr_t)_holder & COMPRESSED_OOP_MASK) != 0);
  }
};

// Pairs an oop with an int index; the index is range-checked against max_jint
// when constructed from a size_t.  Presumably the index identifies a position
// within the (array) object for chunked processing -- determined by the users
// of this class, not visible here.
class ObjArrayTask
{
public:
  ObjArrayTask(oop o = NULL, int idx = 0): _obj(o), _index(idx) { }
  ObjArrayTask(oop o, size_t idx): _obj(o), _index(int(idx)) {
    assert(idx <= size_t(max_jint), "too big");
  }
  // Trivially copyable, for use in GenericTaskQueue.

  inline oop obj() const { return _obj; }
  inline int index() const { return _index; }

  DEBUG_ONLY(bool is_valid() const);  // Tasks to be pushed/popped must be valid.

private:
  oop _obj;
  int _index;
};

#endif // SHARE_GC_SHARED_TASKQUEUE_HPP