src/share/vm/utilities/taskqueue.hpp

rev 3897 : Fix memory ordering in taskqueue for platforms with weak memory ordering (PPC)

Accesses to the fields _bottom and _age were not properly ordered on
platforms with weak memory ordering. Declaring the fields volatile is
not sufficient to enforce this, because the ordering guarantees
attached to volatile accesses depend on the compiler.

Pull the getter/setter routines up into TaskQueueSuper and use
OrderAccess methods to access the fields of _age. Change the code to
always go through these getter/setter methods.
Relax the constraints for accesses to the locals oldAge and newAge.

The OrderAccess routines used compile to plain loads/stores on x86_64,
so the change adds no overhead on strongly ordered platforms.
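For illustration only (not HotSpot code): a minimal sketch of the accessor
discipline the patch introduces, using C++11 std::atomic as a stand-in for
OrderAccess; the class and member names here are invented.

// Illustrative sketch only -- C++11 std::atomic stands in for
// OrderAccess; "BottomHolder" is an invented name.
#include <atomic>
#include <cstdint>

class BottomHolder {
  std::atomic<uint32_t> _bottom;        // was: volatile uint _bottom;
public:
  BottomHolder() : _bottom(0) {}
  uint32_t get_bottom() const {
    // ~ OrderAccess::load_acquire: a plain load on x86_64, a load
    // plus the required barrier on PPC.
    return _bottom.load(std::memory_order_acquire);
  }
  void set_bottom(uint32_t v) {
    // ~ OrderAccess::release_store
    _bottom.store(v, std::memory_order_release);
  }
};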


Old version:

 121   ++_stats[steal_attempt];
 122   if (success) ++_stats[steal];
 123 }
 124 
 125 void TaskQueueStats::record_overflow(size_t new_len) {
 126   ++_stats[overflow];
 127   if (new_len > _stats[overflow_max_len]) _stats[overflow_max_len] = new_len;
 128 }
 129 
 130 void TaskQueueStats::reset() {
 131   memset(_stats, 0, sizeof(_stats));
 132 }
 133 #endif // TASKQUEUE_STATS
 134 
 135 template <unsigned int N, MEMFLAGS F>
 136 class TaskQueueSuper: public CHeapObj<F> {
 137 protected:
 138   // Internal type for indexing the queue; also used for the tag.
 139   typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t;
 140 
 141   // The first free element after the last one pushed (mod N).
 142   volatile uint _bottom;
 143 
 144   enum { MOD_N_MASK = N - 1 };
 145 
 146   class Age {
 147   public:
 148     Age(size_t data = 0)         { _data = data; }
 149     Age(const Age& age)          { _data = age._data; }
 150     Age(idx_t top, idx_t tag)    { _fields._top = top; _fields._tag = tag; }
 151 
 152     Age   get()        const volatile { return _data; }
 153     void  set(Age age) volatile       { _data = age._data; }
 154 
 155     idx_t top()        const volatile { return _fields._top; }
 156     idx_t tag()        const volatile { return _fields._tag; }
 157 
 158     // Increment top; if it wraps, increment tag also.
 159     void increment() {
 160       _fields._top = increment_index(_fields._top);
 161       if (_fields._top == 0) ++_fields._tag;
 162     }
 163 
 164     Age cmpxchg(const Age new_age, const Age old_age) volatile {
 165       return (size_t) Atomic::cmpxchg_ptr((intptr_t)new_age._data,
 166                                           (volatile intptr_t *)&_data,
 167                                           (intptr_t)old_age._data);
 168     }
 169 
 170     bool operator ==(const Age& other) const { return _data == other._data; }
 171 
 172   private:
 173     struct fields {
 174       idx_t _top;
 175       idx_t _tag;
 176     };
 177     union {
 178       size_t _data;
 179       fields _fields;
 180     };
 181   };
 182 
 183   volatile Age _age;
 184 
 185   // These both operate mod N.
 186   static uint increment_index(uint ind) {
 187     return (ind + 1) & MOD_N_MASK;
 188   }
 189   static uint decrement_index(uint ind) {
 190     return (ind - 1) & MOD_N_MASK;
 191   }
 192 
 193   // Returns a number in the range [0..N).  If the result is "N-1", it should be
 194   // interpreted as 0.
 195   uint dirty_size(uint bot, uint top) const {
 196     return (bot - top) & MOD_N_MASK;
 197   }
 198 
 199   // Returns the size corresponding to the given "bot" and "top".
 200   uint size(uint bot, uint top) const {
 201     uint sz = dirty_size(bot, top);
 202     // Has the queue "wrapped", so that bottom is less than top?  There's a
 203     // complicated special case here.  A pair of threads could perform pop_local
 204     // and pop_global operations concurrently, starting from a state in which
 205     // _bottom == _top+1.  The pop_local could succeed in decrementing _bottom,
 206     // and the pop_global in incrementing _top (in which case the pop_global
 207     // will be awarded the contested queue element.)  The resulting state must
 208     // be interpreted as an empty queue.  (We only need to worry about one such
 209     // event: only the queue owner performs pop_local's, and several concurrent
 210     // threads attempting to perform the pop_global will all perform the same
 211     // CAS, and only one can succeed.)  Any stealing thread that reads after
 212     // either the increment or decrement will see an empty queue, and will not
 213     // join the competitors.  The "sz == -1 || sz == N-1" state will not be
 214     // modified by concurrent queues, so the owner thread can reset the state to
 215     // _bottom == top so subsequent pushes will be performed normally.
 216     return (sz == N - 1) ? 0 : sz;
 217   }
 218 
 219 public:
 220   TaskQueueSuper() : _bottom(0), _age() {}
 221 
 222   // Return true if the TaskQueue contains/does not contain any tasks.
 223   bool peek()     const { return _bottom != _age.top(); }
 224   bool is_empty() const { return size() == 0; }
 225 
 226   // Return an estimate of the number of elements in the queue.
 227   // The "careful" version admits the possibility of pop_local/pop_global
 228   // races.
 229   uint size() const {
 230     return size(_bottom, _age.top());
 231   }
 232 
 233   uint dirty_size() const {
 234     return dirty_size(_bottom, _age.top());
 235   }
 236 
 237   void set_empty() {
 238     _bottom = 0;
 239     _age.set(0);
 240   }
 241 
 242   // Maximum number of elements allowed in the queue.  This is two less
 243   // than the actual queue size, for somewhat complicated reasons.
 244   uint max_elems() const { return N - 2; }
 245 
 246   // Total size of queue.
 247   static const uint total_size() { return N; }
 248 
 249   TASKQUEUE_STATS_ONLY(TaskQueueStats stats;)
 250 };
 251 
 252 
 253 
 254 template <class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
 255 class GenericTaskQueue: public TaskQueueSuper<N, F> {
 256 protected:
 257   typedef typename TaskQueueSuper<N, F>::Age Age;
 258   typedef typename TaskQueueSuper<N, F>::idx_t idx_t;
 259 
 260   using TaskQueueSuper<N, F>::_bottom;
 261   using TaskQueueSuper<N, F>::_age;
 262   using TaskQueueSuper<N, F>::increment_index;
 263   using TaskQueueSuper<N, F>::decrement_index;
 264   using TaskQueueSuper<N, F>::dirty_size;
 265 
 266 public:
 267   using TaskQueueSuper<N, F>::max_elems;
 268   using TaskQueueSuper<N, F>::size;
 269 
 270 #if  TASKQUEUE_STATS
 271   using TaskQueueSuper<N, F>::stats;
 272 #endif
 273 
 274 private:
 275   // Slow paths for push, pop_local.  (pop_global has no fast path.)
 276   bool push_slow(E t, uint dirty_n_elems);
 277   bool pop_local_slow(uint localBot, Age oldAge);
 278 
 279 public:
 280   typedef E element_type;
 281 
 282   // Initializes the queue to empty.
 283   GenericTaskQueue();
 284 
 304 
 305 private:
 306   // Element array.
 307   volatile E* _elems;
 308 };
 309 
 310 template<class E, MEMFLAGS F, unsigned int N>
 311 GenericTaskQueue<E, F, N>::GenericTaskQueue() {
 312   assert(sizeof(Age) == sizeof(size_t), "Depends on this.");
 313 }
 314 
 315 template<class E, MEMFLAGS F, unsigned int N>
 316 void GenericTaskQueue<E, F, N>::initialize() {
 317   _elems = NEW_C_HEAP_ARRAY(E, N, F);
 318 }
 319 
 320 template<class E, MEMFLAGS F, unsigned int N>
 321 void GenericTaskQueue<E, F, N>::oops_do(OopClosure* f) {
 322   // tty->print_cr("START OopTaskQueue::oops_do");
 323   uint iters = size();
 324   uint index = _bottom;
 325   for (uint i = 0; i < iters; ++i) {
 326     index = decrement_index(index);
 327     // tty->print_cr("  doing entry %d," INTPTR_T " -> " INTPTR_T,
 328     //            index, &_elems[index], _elems[index]);
 329     E* t = (E*)&_elems[index];      // cast away volatility
 330     oop* p = (oop*)t;
 331     assert((*t)->is_oop_or_null(), "Not an oop or null");
 332     f->do_oop(p);
 333   }
 334   // tty->print_cr("END OopTaskQueue::oops_do");
 335 }
 336 
 337 template<class E, MEMFLAGS F, unsigned int N>
 338 bool GenericTaskQueue<E, F, N>::push_slow(E t, uint dirty_n_elems) {
 339   if (dirty_n_elems == N - 1) {
 340     // Actually means 0, so do the push.
 341     uint localBot = _bottom;
 342     // g++ complains if the volatile result of the assignment is unused.
 343     const_cast<E&>(_elems[localBot] = t);
 344     OrderAccess::release_store(&_bottom, increment_index(localBot));
 345     TASKQUEUE_STATS_ONLY(stats.record_push());
 346     return true;
 347   }
 348   return false;
 349 }
 350 
 351 // pop_local_slow() is done by the owning thread and is trying to
 352 // get the last task in the queue.  It will compete with pop_global()
 353 // that will be used by other threads.  The tag age is incremented
 354 // whenever the queue goes empty which it will do here if this thread
 355 // gets the last task or in pop_global() if the queue wraps (top == 0
 356 // and pop_global() succeeds, see pop_global()).
 357 template<class E, MEMFLAGS F, unsigned int N>
 358 bool GenericTaskQueue<E, F, N>::pop_local_slow(uint localBot, Age oldAge) {
 359   // This queue was observed to contain exactly one element; either this
 360   // thread will claim it, or a competing "pop_global".  In either case,
 361   // the queue will be logically empty afterwards.  Create a new Age value
 362   // that represents the empty queue for the given value of "_bottom".  (We
 363   // must also increment "tag" because of the case where "bottom == 1",
 364   // "top == 0".  A pop_global could read the queue element in that case,
 365   // then have the owner thread do a pop followed by another push.  Without
 366   // the incrementing of "tag", the pop_global's CAS could succeed,
 367   // allowing it to believe it has claimed the stale element.)
 368   Age newAge((idx_t)localBot, oldAge.tag() + 1);
 369   // Perhaps a competing pop_global has already incremented "top", in which
 370   // case it wins the element.
 371   if (localBot == oldAge.top()) {
 372     // No competing pop_global has yet incremented "top"; we'll try to
 373     // install new_age, thus claiming the element.
 374     Age tempAge = _age.cmpxchg(newAge, oldAge);
 375     if (tempAge == oldAge) {
 376       // We win.
 377       assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
 378       TASKQUEUE_STATS_ONLY(stats.record_pop_slow());
 379       return true;
 380     }
 381   }
 382   // We lose; a competing pop_global gets the element.  But the queue is empty
 383   // and top is greater than bottom.  Fix this representation of the empty queue
 384   // to become the canonical one.
 385   _age.set(newAge);
 386   assert(dirty_size(localBot, _age.top()) != N - 1, "sanity");
 387   return false;
 388 }
 389 
 390 template<class E, MEMFLAGS F, unsigned int N>
 391 bool GenericTaskQueue<E, F, N>::pop_global(E& t) {
 392   Age oldAge = _age.get();
 393   uint localBot = _bottom;
 394   uint n_elems = size(localBot, oldAge.top());
 395   if (n_elems == 0) {
 396     return false;
 397   }
 398 
 399   const_cast<E&>(t = _elems[oldAge.top()]);
 400   Age newAge(oldAge);
 401   newAge.increment();
 402   Age resAge = _age.cmpxchg(newAge, oldAge);
 403 
 404   // Note that using "_bottom" here might fail, since a pop_local might
 405   // have decremented it.
 406   assert(dirty_size(localBot, newAge.top()) != N - 1, "sanity");
 407   return resAge == oldAge;
 408 }
 409 
 410 template<class E, MEMFLAGS F, unsigned int N>
 411 GenericTaskQueue<E, F, N>::~GenericTaskQueue() {
 412   FREE_C_HEAP_ARRAY(E, _elems, F);
 413 }
 414 
 415 // OverflowTaskQueue is a TaskQueue that also includes an overflow stack for
 416 // elements that do not fit in the TaskQueue.
 417 //
 418 // This class hides two methods from super classes:
 419 //
 420 // push() - push onto the task queue or, if that fails, onto the overflow stack
 421 // is_empty() - return true if both the TaskQueue and overflow stack are empty
 422 //
 616 
 617   // Reset the terminator, so that it may be reused again.
 618   // The caller is responsible for ensuring that this is done
 619   // in an MT-safe manner, once the previous round of use of
 620   // the terminator is finished.
 621   void reset_for_reuse();
 622   // Same as above but the number of parallel threads is set to the
 623   // given number.
 624   void reset_for_reuse(int n_threads);
 625 
 626 #ifdef TRACESPINNING
 627   static uint total_yields() { return _total_yields; }
 628   static uint total_spins() { return _total_spins; }
 629   static uint total_peeks() { return _total_peeks; }
 630   static void print_termination_counts();
 631 #endif
 632 };
 633 
 634 template<class E, MEMFLAGS F, unsigned int N> inline bool
 635 GenericTaskQueue<E, F, N>::push(E t) {
 636   uint localBot = _bottom;
 637   assert((localBot >= 0) && (localBot < N), "_bottom out of range.");
 638   idx_t top = _age.top();
 639   uint dirty_n_elems = dirty_size(localBot, top);
 640   assert(dirty_n_elems < N, "n_elems out of range.");
 641   if (dirty_n_elems < max_elems()) {
 642     // g++ complains if the volatile result of the assignment is unused.
 643     const_cast<E&>(_elems[localBot] = t);
 644     OrderAccess::release_store(&_bottom, increment_index(localBot));
 645     TASKQUEUE_STATS_ONLY(stats.record_push());
 646     return true;
 647   } else {
 648     return push_slow(t, dirty_n_elems);
 649   }
 650 }
 651 
 652 template<class E, MEMFLAGS F, unsigned int N> inline bool
 653 GenericTaskQueue<E, F, N>::pop_local(E& t) {
 654   uint localBot = _bottom;
 655   // This value cannot be N-1.  That can only occur as a result of
 656   // the assignment to bottom in this method.  If it does, this method
 657   // resets the size to 0 before the next call (which is sequential,
 658   // since this is pop_local.)
 659   uint dirty_n_elems = dirty_size(localBot, _age.top());
 660   assert(dirty_n_elems != N - 1, "Shouldn't be possible...");
 661   if (dirty_n_elems == 0) return false;
 662   localBot = decrement_index(localBot);
 663   _bottom = localBot;
 664   // This is necessary to prevent any read below from being reordered
 665   // before the store just above.
 666   OrderAccess::fence();
 667   const_cast<E&>(t = _elems[localBot]);
 668   // This is a second read of "age"; the "size()" above is the first.
 669   // If there's still at least one element in the queue, based on the
 670   // "_bottom" and "age" we've read, then there can be no interference with
 671   // a "pop_global" operation, and we're done.
 672   idx_t tp = _age.top();    // XXX
 673   if (size(localBot, tp) > 0) {
 674     assert(dirty_size(localBot, tp) != N - 1, "sanity");
 675     TASKQUEUE_STATS_ONLY(stats.record_pop());
 676     return true;
 677   } else {
 678     // Otherwise, the queue contained exactly one element; we take the slow
 679     // path.
 680     return pop_local_slow(localBot, _age.get());
 681   }
 682 }
 683 
 684 typedef GenericTaskQueue<oop, mtGC>             OopTaskQueue;
 685 typedef GenericTaskQueueSet<OopTaskQueue, mtGC> OopTaskQueueSet;
 686 
 687 #ifdef _MSC_VER
 688 #pragma warning(push)
 689 // warning C4522: multiple assignment operators specified
 690 #pragma warning(disable:4522)
 691 #endif
 692 
 693 // This is a container class for either an oop* or a narrowOop*.
 694 // Both are pushed onto a task queue and the consumer will test is_narrow()
 695 // to determine which should be processed.
 696 class StarTask {
 697   void*  _holder;        // either union oop* or narrowOop*
 698 
 699   enum { COMPRESSED_OOP_MASK = 1 };
 700 

New version:

 121   ++_stats[steal_attempt];
 122   if (success) ++_stats[steal];
 123 }
 124 
 125 void TaskQueueStats::record_overflow(size_t new_len) {
 126   ++_stats[overflow];
 127   if (new_len > _stats[overflow_max_len]) _stats[overflow_max_len] = new_len;
 128 }
 129 
 130 void TaskQueueStats::reset() {
 131   memset(_stats, 0, sizeof(_stats));
 132 }
 133 #endif // TASKQUEUE_STATS
 134 
 135 template <unsigned int N, MEMFLAGS F>
 136 class TaskQueueSuper: public CHeapObj<F> {
 137 protected:
 138   // Internal type for indexing the queue; also used for the tag.
 139   typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t;
 140 
 141 private:
 142   // The first free element after the last one pushed (mod N).
 143   // We keep _bottom private and DO NOT USE IT DIRECTLY.
 144   volatile uint _bottom;
 145 
 146 protected:
 147   // Access to _bottom must be ordered. Use OrderAccess.
 148   inline uint get_bottom() const {
 149     return OrderAccess::load_acquire((volatile juint*)&_bottom);
 150   }
 151 
 152   inline void set_bottom(uint new_bottom) {
 153     OrderAccess::release_store(&_bottom, new_bottom);
 154   }
 155 
 156   enum { MOD_N_MASK = N - 1 };
 157 
 158   // Simple field access here, so no OrderAccess and no volatiles.
 159   class Age {
 160   public:
 161     Age(size_t data = 0)         { _data = data; }
 162     Age(const Age& age)          { _data = age._data; }
 163     Age(idx_t top, idx_t tag)    { _fields._top = top; _fields._tag = tag; }
 164 
 165     idx_t top() const { return _fields._top; }
 166     idx_t tag() const { return _fields._tag; }
 167 
 168     // Increment top; if it wraps, increment tag also.
 169     void increment() {
 170       _fields._top = increment_index(_fields._top);
 171       if (_fields._top == 0) ++_fields._tag;
 172     }
 173 
 174     bool operator ==(const Age& other) const { return _data == other._data; }
 175 
 176   public:
 177     struct fields {
 178       idx_t _top;
 179       idx_t _tag;
 180     };
 181     union {
 182       size_t _data;
 183       fields _fields;
 184     };
 185   };
 186 
 187 
 188 private:
 189   // Keep _age private and DO NOT USE IT DIRECTLY.
 190   volatile Age _age;
 191 
 192 protected:
 193   inline idx_t get_age_top() const {
 194     return OrderAccess::load_acquire((volatile idx_t*) &(_age._fields._top));
 195   }
 196 
 197   inline Age get_age() {
 198     size_t res = OrderAccess::load_ptr_acquire((volatile intptr_t*) &_age);
 199     return *(Age*)(&res);
 200   }
 201 
 202   inline void set_age(Age a) {
 203     OrderAccess::release_store_ptr((volatile intptr_t*) &_age, *(size_t*)(&a));
 204   }
 205 
 206   Age cmpxchg_age(const Age new_age, const Age old_age) volatile {
 207     return (Age)(size_t)Atomic::cmpxchg_ptr((intptr_t)new_age._data,
 208                                             (volatile intptr_t*) &(_age._data),
 209                                             (intptr_t)old_age._data);
 210   }
 211 
 212   // These both operate mod N.
 213   static uint increment_index(uint ind) {
 214     return (ind + 1) & MOD_N_MASK;
 215   }
 216   static uint decrement_index(uint ind) {
 217     return (ind - 1) & MOD_N_MASK;
 218   }
 219 
 220   // Returns a number in the range [0..N).  If the result is "N-1", it should be
 221   // interpreted as 0.
 222   uint dirty_size(uint bot, uint top) const {
 223     return (bot - top) & MOD_N_MASK;
 224   }
 225 
 226   // Returns the size corresponding to the given "bot" and "top".
 227   uint size(uint bot, uint top) const {
 228     uint sz = dirty_size(bot, top);
 229     // Has the queue "wrapped", so that bottom is less than top?  There's a
 230     // complicated special case here.  A pair of threads could perform pop_local
 231     // and pop_global operations concurrently, starting from a state in which
 232     // _bottom == _top+1.  The pop_local could succeed in decrementing _bottom,
 233     // and the pop_global in incrementing _top (in which case the pop_global
 234     // will be awarded the contested queue element.)  The resulting state must
 235     // be interpreted as an empty queue.  (We only need to worry about one such
 236     // event: only the queue owner performs pop_local's, and several concurrent
 237     // threads attempting to perform the pop_global will all perform the same
 238     // CAS, and only one can succeed.)  Any stealing thread that reads after
 239     // either the increment or decrement will see an empty queue, and will not
 240     // join the competitors.  The "sz == -1 || sz == N-1" state will not be
 241     // modified by concurrent queues, so the owner thread can reset the state to
 242     // _bottom == top so subsequent pushes will be performed normally.
 243     return (sz == N - 1) ? 0 : sz;
 244   }
 245 
 246 public:
 247   TaskQueueSuper() : _bottom(0), _age() {}
 248 
 249   // Return true if the TaskQueue contains/does not contain any tasks.
 250   // Use getters/setters with OrderAccess when accessing _bottom and _age.
 251   bool peek() {
 252     return get_bottom() != get_age_top();
 253   }
 254   bool is_empty() const { return size() == 0; }
 255 
 256   // Return an estimate of the number of elements in the queue.
 257   // The "careful" version admits the possibility of pop_local/pop_global
 258   // races.
 259   uint size() const {
 260     return size(get_bottom(), get_age_top());
 261   }
 262 
 263   uint dirty_size() const {
 264     return dirty_size(get_bottom(), get_age_top());
 265   }
 266 
 267   void set_empty() {
 268     set_bottom(0);
 269     set_age(0);
 270   }
 271 
 272   // Maximum number of elements allowed in the queue.  This is two less
 273   // than the actual queue size, for somewhat complicated reasons.
 274   uint max_elems() const { return N - 2; }
 275 
 276   // Total size of queue.
 277   static const uint total_size() { return N; }
 278 
 279   TASKQUEUE_STATS_ONLY(TaskQueueStats stats;)
 280 };
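To make the wrap-around comment in size() concrete, here is a self-contained
sketch of the same arithmetic; N = 16 is assumed purely for illustration.

// Self-contained sketch of the mod-N size arithmetic above.
#include <cassert>

static const unsigned N = 16;               // assumption for illustration
static const unsigned MOD_N_MASK = N - 1;

static unsigned dirty_size(unsigned bot, unsigned top) {
  return (bot - top) & MOD_N_MASK;
}

static unsigned size(unsigned bot, unsigned top) {
  unsigned sz = dirty_size(bot, top);
  return (sz == N - 1) ? 0 : sz;            // N-1 is the post-race empty state
}

int main() {
  assert(size(3, 0) == 3);                  // three elements pushed
  // Race from the comment: start at _bottom == _top + 1 (bot = 5,
  // top = 4); pop_local decrements _bottom to 4 while a concurrent
  // pop_global increments _top to 5.
  assert(dirty_size(4, 5) == N - 1);        // raw size has "wrapped"
  assert(size(4, 5) == 0);                  // must be read as empty
  return 0;
}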
 281 
 282 
 283 
 284 template <class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
 285 class GenericTaskQueue: public TaskQueueSuper<N, F> {
 286 protected:
 287   typedef typename TaskQueueSuper<N, F>::Age Age;
 288   typedef typename TaskQueueSuper<N, F>::idx_t idx_t;
 289 
 290   using TaskQueueSuper<N, F>::increment_index;
 291   using TaskQueueSuper<N, F>::decrement_index;
 292   using TaskQueueSuper<N, F>::dirty_size;
 293   using TaskQueueSuper<N, F>::get_age_top;
 294   using TaskQueueSuper<N, F>::get_age;
 295 
 296 public:
 297   using TaskQueueSuper<N, F>::max_elems;
 298   using TaskQueueSuper<N, F>::size;
 299 
 300 #if  TASKQUEUE_STATS
 301   using TaskQueueSuper<N, F>::stats;
 302 #endif
 303 
 304 private:
 305   // Slow paths for push, pop_local.  (pop_global has no fast path.)
 306   bool push_slow(E t, uint dirty_n_elems);
 307   bool pop_local_slow(uint localBot, Age oldAge);
 308 
 309 public:
 310   typedef E element_type;
 311 
 312   // Initializes the queue to empty.
 313   GenericTaskQueue();
 314 
 334 
 335 private:
 336   // Element array.
 337   volatile E* _elems;
 338 };
 339 
 340 template<class E, MEMFLAGS F, unsigned int N>
 341 GenericTaskQueue<E, F, N>::GenericTaskQueue() {
 342   assert(sizeof(Age) == sizeof(size_t), "Depends on this.");
 343 }
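The constructor's runtime assert could also be expressed as a compile-time
check; a sketch assuming C++11 (HotSpot of this era used the runtime assert
above instead), with the LP64 idx_t case shown.

// Sketch: the same invariant as a compile-time check.
#include <cstddef>
#include <cstdint>

typedef uint32_t idx_t;                      // LP64 case: two 32-bit fields
struct AgeFields { idx_t _top; idx_t _tag; };
static_assert(sizeof(AgeFields) == sizeof(size_t),
              "Age must fit in one atomically CAS-able word");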
 344 
 345 template<class E, MEMFLAGS F, unsigned int N>
 346 void GenericTaskQueue<E, F, N>::initialize() {
 347   _elems = NEW_C_HEAP_ARRAY(E, N, F);
 348 }
 349 
 350 template<class E, MEMFLAGS F, unsigned int N>
 351 void GenericTaskQueue<E, F, N>::oops_do(OopClosure* f) {
 352   // tty->print_cr("START OopTaskQueue::oops_do");
 353   uint iters = size();
 354   uint index = this->get_bottom();
 355   for (uint i = 0; i < iters; ++i) {
 356     index = decrement_index(index);
 357     // tty->print_cr("  doing entry %d," INTPTR_T " -> " INTPTR_T,
 358     //            index, &_elems[index], _elems[index]);
 359     E* t = (E*)&_elems[index];      // cast away volatility
 360     oop* p = (oop*)t;
 361     assert((*t)->is_oop_or_null(), "Not an oop or null");
 362     f->do_oop(p);
 363   }
 364   // tty->print_cr("END OopTaskQueue::oops_do");
 365 }
 366 
 367 template<class E, MEMFLAGS F, unsigned int N>
 368 bool GenericTaskQueue<E, F, N>::push_slow(E t, uint dirty_n_elems) {
 369   if (dirty_n_elems == N - 1) {
 370     // Actually means 0, so do the push.
 371     uint localBot = this->get_bottom();
 372     // g++ complains if the volatile result of the assignment is unused.
 373     const_cast<E&>(_elems[localBot] = t);
 374     this->set_bottom(increment_index(localBot));
 375     TASKQUEUE_STATS_ONLY(stats.record_push());
 376     return true;
 377   }
 378   return false;
 379 }
 380 
 381 // pop_local_slow() is done by the owning thread and is trying to
 382 // get the last task in the queue.  It will compete with pop_global()
 383 // that will be used by other threads.  The tag age is incremented
 384 // whenever the queue goes empty which it will do here if this thread
 385 // gets the last task or in pop_global() if the queue wraps (top == 0
 386 // and pop_global() succeeds, see pop_global()).
 387 template<class E, MEMFLAGS F, unsigned int N>
 388 bool GenericTaskQueue<E, F, N>::pop_local_slow(uint localBot, Age oldAge) {
 389   // This queue was observed to contain exactly one element; either this
 390   // thread will claim it, or a competing "pop_global".  In either case,
 391   // the queue will be logically empty afterwards.  Create a new Age value
 392   // that represents the empty queue for the given value of "_bottom".  (We
 393   // must also increment "tag" because of the case where "bottom == 1",
 394   // "top == 0".  A pop_global could read the queue element in that case,
 395   // then have the owner thread do a pop followed by another push.  Without
 396   // the incrementing of "tag", the pop_global's CAS could succeed,
 397   // allowing it to believe it has claimed the stale element.)
 398   Age newAge((idx_t)localBot, oldAge.tag() + 1);
 399   // Perhaps a competing pop_global has already incremented "top", in which
 400   // case it wins the element.
 401   if (localBot == oldAge.top()) {
 402     // No competing pop_global has yet incremented "top"; we'll try to
 403     // install new_age, thus claiming the element.
 404     Age tempAge = this->cmpxchg_age(newAge, oldAge);
 405     if (tempAge == oldAge) {
 406       // We win.
 407       assert(dirty_size(localBot, get_age_top()) != N - 1, "sanity");
 408       TASKQUEUE_STATS_ONLY(stats.record_pop_slow());
 409       return true;
 410     }
 411   }
 412   // We lose; a competing pop_global gets the element.  But the queue is empty
 413   // and top is greater than bottom.  Fix this representation of the empty queue
 414   // to become the canonical one.
 415   this->set_age(newAge);
 416   assert(dirty_size(localBot, get_age_top()) != N - 1, "sanity");
 417   return false;
 418 }
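The tag-increment argument above can be seen in isolation in the following
sketch, which packs (top, tag) into one word and CASes the pair; C++11
std::atomic replaces Atomic::cmpxchg_ptr, and all names are invented.

// Illustrative sketch of the (top, tag) pair CAS; not HotSpot code.
#include <atomic>
#include <cstdint>

typedef uint32_t idx_t;

union AgeWord {                              // mirrors class Age above
  uint64_t data;
  struct { idx_t top; idx_t tag; } fields;
};

static std::atomic<uint64_t> age_word(0);

// The owner claims the last element by installing (bottom, tag + 1).
// Bumping the tag makes any Age value read earlier by a racing
// pop_global stale, so its CAS fails -- this is what blocks the ABA
// scenario described in the comment.
static bool claim_last(idx_t local_bot, AgeWord old_age) {
  AgeWord new_age;
  new_age.fields.top = local_bot;
  new_age.fields.tag = old_age.fields.tag + 1;
  uint64_t expected = old_age.data;
  return age_word.compare_exchange_strong(expected, new_age.data);
}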
 419 
 420 template<class E, MEMFLAGS F, unsigned int N>
 421 bool GenericTaskQueue<E, F, N>::pop_global(E& t) {
 422   Age oldAge = get_age();
 423   uint localBot = this->get_bottom();
 424   uint n_elems = size(localBot, oldAge.top());
 425   if (n_elems == 0) {
 426     return false;
 427   }
 428 
 429   const_cast<E&>(t = _elems[oldAge.top()]);
 430   Age newAge(oldAge);
 431   newAge.increment();
 432   Age resAge = this->cmpxchg_age(newAge, oldAge);
 433 
 434   // Note that using "_bottom" here might fail, since a pop_local might
 435   // have decremented it.
 436   assert(dirty_size(localBot, newAge.top()) != N - 1, "sanity");
 437   return resAge == oldAge;
 438 }
 439 
 440 template<class E, MEMFLAGS F, unsigned int N>
 441 GenericTaskQueue<E, F, N>::~GenericTaskQueue() {
 442   FREE_C_HEAP_ARRAY(E, _elems, F);
 443 }
 444 
 445 // OverflowTaskQueue is a TaskQueue that also includes an overflow stack for
 446 // elements that do not fit in the TaskQueue.
 447 //
 448 // This class hides two methods from super classes:
 449 //
 450 // push() - push onto the task queue or, if that fails, onto the overflow stack
 451 // is_empty() - return true if both the TaskQueue and overflow stack are empty
 452 //
 646 
 647   // Reset the terminator, so that it may be reused again.
 648   // The caller is responsible for ensuring that this is done
 649   // in an MT-safe manner, once the previous round of use of
 650   // the terminator is finished.
 651   void reset_for_reuse();
 652   // Same as above but the number of parallel threads is set to the
 653   // given number.
 654   void reset_for_reuse(int n_threads);
 655 
 656 #ifdef TRACESPINNING
 657   static uint total_yields() { return _total_yields; }
 658   static uint total_spins() { return _total_spins; }
 659   static uint total_peeks() { return _total_peeks; }
 660   static void print_termination_counts();
 661 #endif
 662 };
 663 
 664 template<class E, MEMFLAGS F, unsigned int N> inline bool
 665 GenericTaskQueue<E, F, N>::push(E t) {
 666   uint localBot = this->get_bottom();
 667   assert(localBot < N, "_bottom out of range.");
 668   idx_t top = get_age_top();
 669   uint dirty_n_elems = dirty_size(localBot, top);
 670   assert(dirty_n_elems < N, "n_elems out of range.");
 671   if (dirty_n_elems < max_elems()) {
 672     // g++ complains if the volatile result of the assignment is unused.
 673     const_cast<E&>(_elems[localBot] = t);
 674     this->set_bottom(increment_index(localBot));
 675     TASKQUEUE_STATS_ONLY(stats.record_push());
 676     return true;
 677   } else {
 678     return push_slow(t, dirty_n_elems);
 679   }
 680 }
 681 
 682 template<class E, MEMFLAGS F, unsigned int N> inline bool
 683 GenericTaskQueue<E, F, N>::pop_local(E& t) {
 684   uint localBot = this->get_bottom();
 685   // This value cannot be N-1.  That can only occur as a result of
 686   // the assignment to bottom in this method.  If it does, this method
 687   // resets the size to 0 before the next call (which is sequential,
 688   // since this is pop_local.)
 689   uint dirty_n_elems = dirty_size(localBot, get_age_top());
 690   assert(dirty_n_elems != N - 1, "Shouldn't be possible...");
 691   if (dirty_n_elems == 0) return false;
 692   localBot = decrement_index(localBot);
 693   this->set_bottom(localBot);
 694   // This is necessary to prevent any read below from being reordered
 695   // before the store just above.
 696   OrderAccess::fence();
 697   const_cast<E&>(t = _elems[localBot]);
 698   // This is a second read of "age"; the "size()" above is the first.
 699   // If there's still at least one element in the queue, based on the
 700   // "_bottom" and "age" we've read, then there can be no interference with
 701   // a "pop_global" operation, and we're done.
 702   idx_t tp = get_age_top();    // XXX
 703   if (size(localBot, tp) > 0) {
 704     assert(dirty_size(localBot, tp) != N - 1, "sanity");
 705     TASKQUEUE_STATS_ONLY(stats.record_pop());
 706     return true;
 707   } else {
 708     // Otherwise, the queue contained exactly one element; we take the slow
 709     // path.
 710     return pop_local_slow(localBot, get_age());
 711   }
 712 }
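The OrderAccess::fence() in pop_local guards against store-load reordering,
which acquire/release alone do not forbid. A minimal sketch, with C++11
std::atomic standing in for the HotSpot primitives; N = 16 and all names
are assumptions.

// Illustrative sketch of pop_local's store-load ordering; not HotSpot code.
#include <atomic>

static std::atomic<unsigned> bottom_(1);
static std::atomic<unsigned> top_(0);

static bool owner_sees_nonempty() {
  unsigned local_bot = bottom_.load(std::memory_order_relaxed) - 1;
  bottom_.store(local_bot, std::memory_order_relaxed);
  // Without a full fence, the load of top_ below may be ordered before
  // the store above (store-load reordering is legal even on x86), and
  // owner and stealer could both take the same last element.
  std::atomic_thread_fence(std::memory_order_seq_cst);  // ~ OrderAccess::fence()
  unsigned top = top_.load(std::memory_order_relaxed);
  return ((local_bot - top) & 15u) > 0;                 // size > 0, MOD_N_MASK = 15
}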
 713 
 714 typedef GenericTaskQueue<oop, mtGC>             OopTaskQueue;
 715 typedef GenericTaskQueueSet<OopTaskQueue, mtGC> OopTaskQueueSet;
 716 
 717 #ifdef _MSC_VER
 718 #pragma warning(push)
 719 // warning C4522: multiple assignment operators specified
 720 #pragma warning(disable:4522)
 721 #endif
 722 
 723 // This is a container class for either an oop* or a narrowOop*.
 724 // Both are pushed onto a task queue and the consumer will test is_narrow()
 725 // to determine which should be processed.
 726 class StarTask {
 727   void*  _holder;        // either union oop* or narrowOop*
 728 
 729   enum { COMPRESSED_OOP_MASK = 1 };
 730