--- /dev/null	2018-11-30 10:10:44.238550338 +0100
+++ new/src/hotspot/share/gc/shenandoah/shenandoahTaskqueue.hpp	2018-11-30 10:23:58.881664496 +0100
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2016, 2018, Red Hat, Inc. All rights reserved.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
+#define SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
+
+#include "gc/shared/taskqueue.hpp"
+#include "gc/shared/taskqueue.inline.hpp"
+#include "runtime/mutex.hpp"
+#include "runtime/thread.hpp"
+
+template <class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
+class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
+{
+public:
+  typedef OverflowTaskQueue<E, F, N> taskqueue_t;
+
+  BufferedOverflowTaskQueue() : _buf_empty(true) {};
+
+  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)
+
+  // Push task t into the queue. Returns true on success.
+  inline bool push(E t);
+
+  // Attempt to pop from the queue. Returns true on success.
+  inline bool pop(E &t);
+
+  inline void clear() {
+    _buf_empty = true;
+    taskqueue_t::set_empty();
+    taskqueue_t::overflow_stack()->clear();
+  }
+
+  inline bool is_empty() const {
+    return _buf_empty && taskqueue_t::is_empty();
+  }
+
+private:
+  bool _buf_empty;
+  E _elem;
+};
+
+// ObjArrayChunkedTask
+//
+// Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
+// The design goal is to make the regular oop ops very fast, because that would be the prevailing
+// case. On the other hand, it should not block parallel array processing from efficiently dividing
+// the array work.
+//
+// The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
+// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
+// most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
+// that the block has the size of 2^pow. This requires pow to have only 5 bits (2^32) to encode
+// all possible arrays.
+//
+//    |---------oop---------|-pow-|--chunk---|
+//    0                    49     54        64
+//
+// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
+//
+// This encoding gives a few interesting benefits:
+//
+// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
+//
+//    |---------oop---------|00000|0000000000| // no chunk data
+//
+// This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
+// with zero padding. Testing for "chunkedness" is testing for zero with chunk mask.
+//
+// b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers
+// interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
+//      <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
+//      <2*C, P-1>,     that covers interval [ (2*C - 1)*2^(P-1);       2*C*2^(P-1) )
+//
+// Observe that the union of these two intervals is:
+//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
+//
+// ...which is the original interval:
+//      [ (C-1)*2^P; C*2^P )
+//
+// (A standalone sketch of this arithmetic follows this comment block.)
+//
+// c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split
+// down in the parallel threads, which alleviates the upfront (serial) splitting costs.
+//
+// Encoding limitations caused by current bitscales mean:
+//    10 bits for chunk: max 1024 blocks per array
+//     5 bits for power: max 2^32 array
+//    49 bits for oop: max 512 TB of addressable space
+//
+// Stealing bits from oop trims down the addressable space. Stealing too few bits for chunk ID limits
+// potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled.
+// In future, these might be rebalanced to favor one degree of freedom against another. For example,
+// if/when Arrays 2.0 brings 2^64-sized arrays, we might need to steal another bit for power. We could regain
+// some bits back if chunks are counted in ObjArrayMarkingStride units.
+//
+// There is also a fallback version that uses plain fields, when we don't have enough space to steal the
+// bits from the native pointer. It is useful to debug the _LP64 version.
+//
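To make the interval arithmetic in point (b) concrete, here is a minimal standalone
sketch; the names Chunk, lo, and hi are hypothetical, chosen only for this illustration:

#include <cassert>
#include <cstdint>

struct Chunk { uint64_t c; int p; };                      // chunk <C, P>
static uint64_t lo(Chunk t) { return (t.c - 1) << t.p; }  // (C-1)*2^P
static uint64_t hi(Chunk t) { return t.c << t.p; }        // C*2^P

int main() {
  Chunk parent = { 3, 10 };                        // covers [ 2*2^10; 3*2^10 )
  Chunk left   = { 2*parent.c - 1, parent.p - 1 }; // <2*C - 1, P-1>
  Chunk right  = { 2*parent.c,     parent.p - 1 }; // <2*C, P-1>
  assert(lo(left)  == lo(parent)); // left half starts where the parent starts
  assert(hi(left)  == lo(right));  // the two halves are adjacent
  assert(hi(right) == hi(parent)); // right half ends where the parent ends
  return 0;
}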
+#ifdef _MSC_VER
+#pragma warning(push)
+// warning C4522: multiple assignment operators specified
+#pragma warning( disable:4522 )
+#endif
+
+#ifdef _LP64
+class ObjArrayChunkedTask
+{
+public:
+  enum {
+    chunk_bits  = 10,
+    pow_bits    = 5,
+    oop_bits    = sizeof(uintptr_t)*8 - chunk_bits - pow_bits,
+  };
+  enum {
+    oop_shift   = 0,
+    pow_shift   = oop_shift + oop_bits,
+    chunk_shift = pow_shift + pow_bits,
+  };
+
+public:
+  ObjArrayChunkedTask(oop o = NULL) {
+    _obj = ((uintptr_t)(void*) o) << oop_shift;
+  }
+  ObjArrayChunkedTask(oop o, int chunk, int mult) {
+    assert(0 <= chunk && chunk < nth_bit(chunk_bits), "chunk is sane: %d", chunk);
+    assert(0 <= mult && mult < nth_bit(pow_bits), "pow is sane: %d", mult);
+    uintptr_t t_b = ((uintptr_t) chunk) << chunk_shift;
+    uintptr_t t_m = ((uintptr_t) mult) << pow_shift;
+    uintptr_t obj = (uintptr_t)(void*)o;
+    assert(obj < nth_bit(oop_bits), "obj ref is sane: " PTR_FORMAT, obj);
+    intptr_t t_o = obj << oop_shift;
+    _obj = t_o | t_m | t_b;
+  }
+  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }
+
+  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
+    _obj = t._obj;
+    return *this;
+  }
+  volatile ObjArrayChunkedTask&
+  operator =(const volatile ObjArrayChunkedTask& t) volatile {
+    (void)const_cast<uintptr_t&>(_obj = t._obj);
+    return *this;
+  }
+
+  inline oop obj()   const { return (oop) reinterpret_cast<void*>((_obj >> oop_shift) & right_n_bits(oop_bits)); }
+  inline int chunk() const { return (int) ((_obj >> chunk_shift) & right_n_bits(chunk_bits)); }
+  inline int pow()   const { return (int) ((_obj >> pow_shift) & right_n_bits(pow_bits)); }
+  inline bool is_not_chunked() const { return (_obj & ~right_n_bits(oop_bits + pow_bits)) == 0; }
+
+  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.
+
+  static size_t max_addressable() {
+    return nth_bit(oop_bits);
+  }
+
+  static int chunk_size() {
+    return nth_bit(chunk_bits);
+  }
+
+private:
+  uintptr_t _obj;
+};
+#else
+class ObjArrayChunkedTask
+{
+public:
+  enum {
+    chunk_bits = 10,
+    pow_bits   = 5,
+  };
+public:
+  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
+    assert(0 <= chunk && chunk < nth_bit(chunk_bits), "chunk is sane: %d", chunk);
+    assert(0 <= pow && pow < nth_bit(pow_bits), "pow is sane: %d", pow);
+    _chunk = chunk;
+    _pow = pow;
+  }
+  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }
+
+  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
+    _obj = t._obj;
+    _chunk = t._chunk;
+    _pow = t._pow;
+    return *this;
+  }
+  volatile ObjArrayChunkedTask&
+  operator =(const volatile ObjArrayChunkedTask& t) volatile {
+    (void)const_cast<oop&>(_obj = t._obj);
+    _chunk = t._chunk;
+    _pow = t._pow;
+    return *this;
+  }
+
+  inline oop obj()   const { return _obj; }
+  inline int chunk() const { return _chunk; }
+  inline int pow()   const { return _pow; }
+
+  inline bool is_not_chunked() const { return _chunk == 0; }
+
+  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.
+
+  static size_t max_addressable() {
+    return sizeof(oop);
+  }
+
+  static int chunk_size() {
+    return nth_bit(chunk_bits);
+  }
+
+private:
+  oop _obj;
+  int _chunk;
+  int _pow;
+};
+#endif
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+typedef ObjArrayChunkedTask ShenandoahMarkTask;
+typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
+typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;
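As a sanity check on the bit layout, this freestanding sketch mirrors the _LP64
encoding with plain integers; the shift/mask constants are copied from the enums
above, and fake_oop is an arbitrary illustrative value, not a real oop:

#include <cassert>
#include <cstdint>

enum { chunk_bits = 10, pow_bits = 5,
       oop_bits   = sizeof(uintptr_t)*8 - chunk_bits - pow_bits };
enum { oop_shift = 0, pow_shift = oop_bits, chunk_shift = pow_shift + pow_bits };

static uintptr_t mask(int bits) { return (uintptr_t(1) << bits) - 1; }

int main() {
  uintptr_t fake_oop = 0x7f0000001000; // arbitrary value that fits in 49 bits
  int chunk = 42, pow = 13;
  uintptr_t word = (uintptr_t(chunk) << chunk_shift)
                 | (uintptr_t(pow)   << pow_shift)
                 | (fake_oop         << oop_shift);
  // Decoding recovers every field; a non-zero chunk marks the task as chunked.
  assert(((word >> oop_shift)   & mask(oop_bits))   == fake_oop);
  assert(((word >> pow_shift)   & mask(pow_bits))   == (uintptr_t)pow);
  assert(((word >> chunk_shift) & mask(chunk_bits)) == (uintptr_t)chunk);
  assert((word & ~mask(oop_bits + pow_bits)) != 0);
  return 0;
}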
+template <class T, MEMFLAGS F>
+class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
+private:
+  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile jint));
+  volatile jint     _claimed_index;
+  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, 0);
+
+  debug_only(uint   _reserved;  )
+
+public:
+  using GenericTaskQueueSet<T, F>::size;
+
+public:
+  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
+    debug_only(_reserved = 0; )
+  }
+
+  void clear_claimed() { _claimed_index = 0; }
+  T*   claim_next();
+
+  // Reserve queues that are not subject to parallel claiming.
+  void reserve(uint n) {
+    assert(n <= size(), "Sanity");
+    _claimed_index = (jint)n;
+    debug_only(_reserved = n;)
+  }
+
+  debug_only(uint get_reserved() const { return (uint)_reserved; })
+};
+
+template <class T, MEMFLAGS F>
+T* ParallelClaimableQueueSet<T, F>::claim_next() {
+  jint size = (jint)GenericTaskQueueSet<T, F>::size();
+
+  if (_claimed_index >= size) {
+    return NULL;
+  }
+
+  jint index = Atomic::add(1, &_claimed_index);
+
+  if (index <= size) {
+    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
+  } else {
+    return NULL;
+  }
+}
+
+class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
+public:
+  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}
+
+  bool is_empty();
+  void clear();
+
+#if TASKQUEUE_STATS
+  static void print_taskqueue_stats_hdr(outputStream* const st);
+  void print_taskqueue_stats() const;
+  void reset_taskqueue_stats();
+#endif // TASKQUEUE_STATS
+};
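A sketch of the intended claiming pattern, assuming the declarations above;
do_work_on is a hypothetical placeholder for whatever the worker does per queue:

void do_work_on(ShenandoahObjToScanQueue* q); // placeholder, not HotSpot API

// Each GC worker repeatedly claims a queue until claim_next() returns NULL,
// which means all non-reserved queues have been handed out to some worker.
void drain_claimable_queues(ShenandoahObjToScanQueueSet* queues) {
  while (ShenandoahObjToScanQueue* q = queues->claim_next()) {
    do_work_on(q);
  }
}
// Before the parallel phase starts, a single thread resets the claim cursor
// with queues->clear_claimed() (and optionally calls queues->reserve(n)).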
+class ShenandoahTerminatorTerminator : public TerminatorTerminator {
+public:
+  // Returning true terminates immediately, even if there is remaining work left.
+  virtual bool should_force_termination() { return false; }
+};
+
+/*
+ * This is an enhanced implementation of Google's work-stealing
+ * termination protocol, which is described in the paper:
+ * "Understanding and improving JVM GC work stealing at the data center scale"
+ * (http://dl.acm.org/citation.cfm?id=2926706)
+ *
+ * Instead of using a dedicated spin-master, our implementation lets the spin-master
+ * relinquish the role before it goes to sleep/wait, which allows newly arrived threads
+ * to compete for the role. The intention of this enhancement is to reduce the
+ * spin-master's latency in detecting new tasks for stealing and in detecting the
+ * termination condition.
+ */
+
+class ShenandoahTaskTerminator: public ParallelTaskTerminator {
+private:
+  Monitor* _blocker;
+  Thread*  _spin_master;
+
+public:
+  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set) :
+    ParallelTaskTerminator(n_threads, queue_set), _spin_master(NULL) {
+    _blocker = new Monitor(Mutex::leaf, "ShenandoahTaskTerminator", false, Monitor::_safepoint_check_never);
+  }
+
+  ~ShenandoahTaskTerminator() {
+    assert(_blocker != NULL, "Can not be NULL");
+    delete _blocker;
+  }
+
+  bool offer_termination(ShenandoahTerminatorTerminator* terminator);
+  bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); }
+
+private:
+  // Hide the base-class variant: callers must use the overloads above.
+  bool offer_termination(TerminatorTerminator* terminator) {
+    ShouldNotReachHere();
+    return false;
+  }
+
+private:
+  size_t tasks_in_queue_set() { return _queue_set->tasks(); }
+
+  /*
+   * Perform the spin-master task.
+   * Returns true if the termination condition is detected, otherwise returns false.
+   */
+  bool do_spin_master_work(ShenandoahTerminatorTerminator* terminator);
+};
+
+class ShenandoahCancelledTerminatorTerminator : public ShenandoahTerminatorTerminator {
+  virtual bool should_exit_termination() {
+    return false;
+  }
+  virtual bool should_force_termination() {
+    return true;
+  }
+};
+
+#endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
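For context, a schematic of how a worker loop would typically interact with a
terminator of this shape; steal_work and process are hypothetical placeholders,
not HotSpot functions:

bool steal_work(ShenandoahMarkTask& task);    // placeholder: steal from other queues
void process(const ShenandoahMarkTask& task); // placeholder: scan/mark the task

void concurrent_mark_loop(ShenandoahObjToScanQueue* q,
                          ShenandoahTaskTerminator* terminator) {
  ShenandoahMarkTask task;
  while (true) {
    // Drain local work first, then try stealing; processing may push new tasks.
    while (q->pop(task) || steal_work(task)) {
      process(task);
    }
    // No local or stealable work: offer to terminate. offer_termination()
    // returns true once all workers agree no work is left; false means new
    // work showed up and the loop should resume.
    if (terminator->offer_termination()) {
      return;
    }
  }
}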