--- /dev/null	2017-04-17 13:03:13.666114673 -0700
+++ new/src/share/vm/runtime/heapMonitoring.cpp	2017-04-18 10:46:13.818544511 -0700
@@ -0,0 +1,391 @@
+/*
+ * Copyright (c) 2017, Google and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "prims/forte.hpp"
+#include "runtime/heapMonitoring.hpp"
+
+const int kMaxStackDepth = 64;
+
+// The resulting data, as it appears to the client.
+// TODO(jcbeyler): should we make this into a JVMTI structure?
+struct StackTraceData {
+  ASGCT_CallTrace *trace;
+  intx byte_size;
+  jlong thread_id;
+
+  StackTraceData(ASGCT_CallTrace *t, intx size, jlong tid) :
+      trace(t), byte_size(size), thread_id(tid) {}
+};
+
+// RAII class that acquires / releases a lock.
+class MuxLocker {
+ private:
+  volatile intptr_t *_lock;
+  const char *_name;
+ public:
+  MuxLocker(volatile intptr_t *lock, const char *name) :
+      _lock(lock),
+      _name(name) {
+    Thread::muxAcquire(lock, name);
+  }
+  ~MuxLocker() {
+    Thread::muxRelease(_lock);
+  }
+};
+
+// Each object that we profile is stored as a trace with its thread_id.
+class StackTraceStorage {
+ public:
+  // Adds a trace to the list of traces we are maintaining. trace is the
+  // stacktrace, and thread is the thread that did the allocation.
+  void add_trace(ASGCT_CallTrace *trace, intx byte_size, Thread *thread);
+
+  // Called by the client to retrieve the list of stack traces.
+  // Passes (by reference) a pointer to a list of traces and a trace
+  // count; both are filled in by this function.
+  void get_all_stack_traces(jvmtiStackTraceData **traces, jint *num_traces);
+
+  ~StackTraceStorage();
+  StackTraceStorage();
+
+  // The global storage. Not a global static because
+  // StackTraceStorage isn't available at module-loading time.
+  static StackTraceStorage *storage() {
+    static StackTraceStorage storage;
+    return &storage;
+  }
+
+  // Protects the traces currently sampled (below).
+  volatile intptr_t _allocated_traces_lock[1];
+
+  // The currently allocated traces. A fixed-size ring buffer.
+  // This is a temporary fix until the GC handlers are in place. Then this
+  // becomes a growable array that is emptied as elements get garbage
+  // collected.
+  StackTraceData **_allocated_traces;
+
+  // Maximum number of traces the ring buffer can hold.
+  size_t _allocated_traces_size;
+
+  // TODO(jcbeyler): remove the extra code that is here only for statistics.
+  size_t _allocated_count;
+
+  // The current position in _allocated_traces (above).
+  // This is a temporary fix until the GC handlers are in place. Then this
+  // becomes a growable array that is emptied as elements get garbage
+  // collected.
+  int _allocated_traces_pos;
+
+ private:
+  // Support functions and classes for copying data to the external
+  // world.
+  class StackTraceDataCopier {
+   public:
+    virtual int size() const = 0;
+    virtual StackTraceData *get(int i) const = 0;
+  };
+
+  class LiveStackTraceDataCopier : public StackTraceDataCopier {
+   public:
+    LiveStackTraceDataCopier(StackTraceData **data, int size) :
+        _data(data), _size(size) {}
+    int size() const { return _size; }
+    StackTraceData *get(int i) const { return _data[i]; }
+
+   private:
+    StackTraceData **_data;
+    int _size;
+  };
+
+  // Performs a deep copy of the StackTraceData 'from' into the
+  // jvmtiStackTraceData 'to'.
+  bool deep_copy(jvmtiStackTraceData *to, StackTraceData *from);
+
+  // Creates a deep copy of the list of StackTraceData.
+  void copy_stack_traces(const StackTraceDataCopier &copier,
+                         jvmtiStackTraceData **traces,
+                         jint *num_traces);
+};
+
+// Statics for Sampler
+double HeapMonitoring::_log_table[1 << kFastlogNumBits];
+
+bool HeapMonitoring::_initialized = false;
+
+jint HeapMonitoring::_monitoring_period;
+
+// Cheap random number generator state.
+uint64_t HeapMonitoring::_rnd;
+
+StackTraceStorage::StackTraceStorage() :
+    _allocated_traces(new StackTraceData*[MaxHeapTraces]),
+    _allocated_traces_size(MaxHeapTraces),
+    _allocated_count(0),
+    _allocated_traces_pos(0) {
+  memset(_allocated_traces, 0, sizeof(*_allocated_traces) * MaxHeapTraces);
+  _allocated_traces_lock[0] = 0;
+}
+
+StackTraceStorage::~StackTraceStorage() {
+  delete[] _allocated_traces;
+}
+
+void StackTraceStorage::add_trace(ASGCT_CallTrace *trace,
+                                  intx byte_size,
+                                  Thread *thread) {
+  StackTraceData *new_data =
+      new StackTraceData(trace, byte_size, SharedRuntime::get_java_tid(thread));
+
+  MuxLocker mu(_allocated_traces_lock, "StackTraceStorage::add_trace");
+  StackTraceData *current_allocated_trace =
+      _allocated_traces[_allocated_traces_pos];
+  if (current_allocated_trace != NULL) {
+    delete current_allocated_trace;
+  }
+  _allocated_traces[_allocated_traces_pos] = new_data;
+  _allocated_traces_pos = (_allocated_traces_pos + 1) % _allocated_traces_size;
+  _allocated_count++;
+}
+
+bool StackTraceStorage::deep_copy(jvmtiStackTraceData *to,
+                                  StackTraceData *from) {
+  to->thread_id = from->thread_id;
+  to->size = from->byte_size;
+
+  ASGCT_CallTrace *dest = reinterpret_cast<ASGCT_CallTrace *>(
+      os::malloc(sizeof(ASGCT_CallTrace), mtInternal));
+  if (dest == NULL) {
+    return false;
+  }
+  to->trace = dest;
+
+  const ASGCT_CallTrace *src = from->trace;
+  *dest = *src;
+
+  dest->frames = reinterpret_cast<ASGCT_CallFrame *>(
+      os::malloc(sizeof(ASGCT_CallFrame) * kMaxStackDepth, mtInternal));
+  if (dest->frames == NULL) {
+    return false;
+  }
+  memcpy(dest->frames,
+         src->frames,
+         sizeof(ASGCT_CallFrame) * kMaxStackDepth);
+  return true;
+}
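+
+// Editor's note -- illustrative sketch only, not part of this change:
+// because the array returned by get_all_stack_traces() below is
+// null-terminated and every sub-allocation comes from os::malloc, a
+// caller could reclaim it along these lines. 'release_traces' is a
+// hypothetical helper name, and the sketch assumes every copied entry
+// was fully built by deep_copy().
+static void release_traces(jvmtiStackTraceData *traces) {
+  if (traces == NULL) {
+    return;
+  }
+  // traces[0] is the extra statistics entry (its trace is NULL); the
+  // real entries follow it, up to the zeroed terminator.
+  for (jvmtiStackTraceData *cur = traces + 1; cur->trace != NULL; cur++) {
+    os::free(cur->trace->frames);  // frames array allocated in deep_copy()
+    os::free(cur->trace);          // ASGCT_CallTrace allocated in deep_copy()
+  }
+  os::free(traces);                // the array itself
+}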
+
+// Called by the outside world; returns a copy of the stack traces
+// (because we could be replacing them as the user handles them).
+// The array is secretly null-terminated (to make it easier to reclaim).
+void StackTraceStorage::get_all_stack_traces(jvmtiStackTraceData **traces,
+                                             jint *num_traces) {
+  LiveStackTraceDataCopier copier(_allocated_traces, _allocated_traces_size);
+  copy_stack_traces(copier, traces, num_traces);
+}
+
+void StackTraceStorage::copy_stack_traces(const StackTraceDataCopier &copier,
+                                          jvmtiStackTraceData **traces,
+                                          jint *num_traces) {
+  MuxLocker mu(_allocated_traces_lock, "StackTraceStorage::copy_stack_traces");
+  int len = copier.size();
+
+  // Create a new array to store the StackTraceData objects.
+  // TODO(jcbeyler): change the +2 below to +1 once the extra statistics
+  // trace is removed; the remaining +1 leaves a zeroed entry as the NULL
+  // terminator at the end of the array.
+  jvmtiStackTraceData *t = reinterpret_cast<jvmtiStackTraceData *>(
+      os::malloc((len + 2) * sizeof(*t), mtInternal));
+  if (t == NULL) {
+    *traces = NULL;
+    *num_traces = 0;
+    return;
+  }
+  memset(t, 0, (len + 2) * sizeof(*t));
+
+  // TODO(jcbeyler): remove this extra code handling the extra trace used
+  // for counting; it is only for statistics.
+  jvmtiStackTraceData *to = &t[0];
+  to->size = _allocated_count;
+
+  // Copy the StackTraceData objects into the new array.
+  int trace_count = 1;
+  for (int i = 0; i < len; i++) {
+    StackTraceData *stack_trace = copier.get(i);
+    if (stack_trace != NULL && stack_trace->trace != NULL) {
+      jvmtiStackTraceData *to = &t[trace_count];
+      if (!deep_copy(to, stack_trace)) {
+        continue;
+      }
+      trace_count++;
+    }
+  }
+
+  *traces = t;
+  *num_traces = trace_count;
+}
+
+void HeapMonitoring::get_live_traces(jvmtiStackTraceData **traces,
+                                     jint *num_traces) {
+  StackTraceStorage::storage()->get_all_stack_traces(traces, num_traces);
+}
+
+// TODO(jcbeyler): find out if the algorithm for determining the sampling
+// interval can be upstreamed.
+void HeapMonitoring::initialize_profiling(jint monitoring_period) {
+  _monitoring_period = monitoring_period;
+
+  // Populate the lookup table for fast_log2.
+  // This approximates the log2 curve with a step function.
+  // Steps have height equal to log2 of the mid-point of the step.
+  for (int i = 0; i < (1 << kFastlogNumBits); i++) {
+    double half_way = static_cast<double>(i + 0.5);
+    _log_table[i] = (log(1.0 + half_way / (1 << kFastlogNumBits)) / log(2.0));
+  }
+
+  JavaThread *t = reinterpret_cast<JavaThread *>(Thread::current());
+  _rnd = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(t));
+  if (_rnd == 0) {
+    _rnd = 1;
+  }
+  for (int i = 0; i < 20; i++) {
+    _rnd = next_random(_rnd);
+  }
+  _initialized = true;
+}
+
+// Generates a geometric variable with the specified mean (512K by default).
+// This is done by generating a random number between 0 and 1 and applying
+// the inverse cumulative distribution function for an exponential.
+// Specifically: Let m be the inverse of the sample period, then
+// the probability distribution function is m*exp(-mx), so the CDF is
+// p = 1 - exp(-mx), so
+// q = 1 - p = exp(-mx)
+// log_e(q) = -mx
+// -log_e(q)/m = x
+// log_2(q) * (-log_e(2) * 1/m) = x
+// In the code, q is actually in the range 1 to 2**26, hence the -26 below.
+void HeapMonitoring::pick_next_sample(JavaThread *t) {
+  _rnd = next_random(_rnd);
+  // Take the top 26 bits as the random number.
+  // (This plus a 1<<58 sampling bound gives a max possible step of
+  // 5194297183973780480 bytes; for sample_parameter = 1<<19, the max
+  // possible step is 9448372 bytes (24 bits).)
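+  //
+  // Worked example (editor's illustration): with _monitoring_period =
+  // 512K = 524288 bytes and a draw whose top 26 bits give q = 2^25
+  // (so the uniform variate q / 2^26 is 0.5), fast_log2(q) - 26 = -1,
+  // and the code below computes
+  //   bytes_until_sample = -1 * (-log(2) * 524288) + 1 ~= 363409,
+  // i.e. roughly 363K bytes of allocation before the next sample.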
+  const uint64_t prng_mod_power = 48;  // Number of bits in prng
+  // The uint32_t cast is to prevent a (hard-to-reproduce) NAN
+  // under piii debug for some binaries.
+  double q = static_cast<uint32_t>(_rnd >> (prng_mod_power - 26)) + 1.0;
+  // Put the computed p-value through the CDF of a geometric.
+  // For faster performance (save ~1/20th exec time), replace
+  // min(0.0, FastLog2(q) - 26) by (Fastlog2(q) - 26.000705).
+  // The value 26.000705 is used rather than 26 to compensate
+  // for inaccuracies in FastLog2 which otherwise result in a
+  // negative answer.
+  size_t *bytes_until_sample = t->bytes_until_sample();
+  double log_val = (fast_log2(q) - 26);
+  *bytes_until_sample = static_cast<size_t>(
+      (0.0 < log_val ? 0.0 : log_val) * (-log(2.0) * (_monitoring_period)) + 1);
+}
+
+// Called from the interpreter and C1.
+void HeapMonitoring::object_alloc_unsized(oopDesc* o) {
+  JavaThread *thread = reinterpret_cast<JavaThread *>(Thread::current());
+  object_alloc_do_sample(thread, o, o->size() << LogHeapWordSize);
+}
+
+void HeapMonitoring::object_alloc(oopDesc* o, intx byte_size) {
+  JavaThread *thread = reinterpret_cast<JavaThread *>(Thread::current());
+  assert(o->size() << LogHeapWordSize == byte_size,
+         "Object size is incorrect.");
+  object_alloc_do_sample(thread, o, byte_size);
+}
+
+// Called directly by C2.
+void HeapMonitoring::object_alloc_do_sample(Thread *t, oopDesc *o,
+                                            intx byte_size) {
+#if defined(X86) || defined(PPC)
+  if (_initialized) {
+    // TODO(jcbeyler): what about this?
+    assert(t->is_Java_thread(), "non-Java thread passed to do_sample");
+    JavaThread *thread = reinterpret_cast<JavaThread *>(t);
+
+    pick_next_sample(thread);
+
+    ASGCT_CallTrace *trace = NEW_C_HEAP_OBJ(ASGCT_CallTrace, mtInternal);
+    if (trace == NULL) {
+      return;
+    }
+
+    ASGCT_CallFrame *frames =
+        NEW_C_HEAP_ARRAY(ASGCT_CallFrame, kMaxStackDepth, mtInternal);
+    if (frames == NULL) {
+      FreeHeap(reinterpret_cast<char *>(trace));
+      return;
+    }
+
+    trace->frames = frames;
+    trace->env_id = (JavaThread::current())->jni_environment();
+
+    ucontext_t uc;
+    if (!getcontext(&uc)) {
+#if defined(IA32)
+      // On Linux/x86 (but not x64), AsyncGetCallTrace in the JVM reads the
+      // stack pointer from the REG_UESP field (as opposed to the
+      // REG_ESP field). The kernel sets both the REG_UESP and REG_ESP
+      // fields to the correct stack pointer for the ucontexts passed to
+      // signal handlers. However, getcontext() sets only REG_ESP,
+      // leaving REG_UESP uninitialized. Since there is no way to
+      // distinguish where a ucontext_t came from, copy from REG_ESP to
+      // REG_UESP so that AGCT will read the right stack pointer.
+      uc.uc_mcontext.gregs[REG_UESP] = uc.uc_mcontext.gregs[REG_ESP];
+#endif
+
+      AsyncGetCallTrace(trace, kMaxStackDepth, &uc);
+
+      if (trace->num_frames > 0) {
+        // Success!
+        StackTraceStorage::storage()->add_trace(trace, byte_size, thread);
+        return;
+      }
+    }
+    // Failure!
+    FREE_C_HEAP_ARRAY(ASGCT_CallFrame, trace->frames);
+    FreeHeap(reinterpret_cast<char *>(trace));
+    return;
+  } else {
+    // There is something like 64K worth of allocation before the VM
+    // initializes. This is just in the interests of not slowing down
+    // startup.
+    assert(t->is_Java_thread(), "non-Java thread passed to do_sample");
+    JavaThread *thread = reinterpret_cast<JavaThread *>(t);
+    *(thread->bytes_until_sample()) = 65536;
+  }
+#else
+  Unimplemented();
+#endif
+}
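
Editor's note: next_random() and fast_log2() are declared in runtime/heapMonitoring.hpp and are not part of this file's diff. For reference, sketches consistent with the comments above (a 48-bit PRNG, and a step-function log2 backed by the _log_table built in initialize_profiling) might look like the following; the LCG constants shown are the java.util.Random ones, and both signatures are assumptions rather than code taken from this patch.

// Hypothetical sketch of the 48-bit PRNG ("Number of bits in prng" above).
// The multiplier and addend are assumed to be the java.util.Random constants.
static uint64_t next_random(uint64_t rnd) {
  const uint64_t prng_mult = 0x5DEECE66DULL;
  const uint64_t prng_add = 0xB;
  const uint64_t prng_mod_mask = ((uint64_t)1 << 48) - 1;
  return (prng_mult * rnd + prng_add) & prng_mod_mask;
}

// Hypothetical sketch of fast_log2: the unbiased IEEE-754 exponent plus a
// lookup on the top kFastlogNumBits mantissa bits, matching the table
// populated in initialize_profiling above.
static double fast_log2(const double &d) {
  assert(d > 0, "fast_log2 expects a positive value");
  uint64_t bits;
  memcpy(&bits, &d, sizeof(bits));  // type-pun without aliasing issues
  const uint32_t high = (uint32_t)(bits >> 32);  // sign, exponent, top mantissa
  const uint32_t y =
      (high >> (20 - kFastlogNumBits)) & ((1 << kFastlogNumBits) - 1);
  const int32_t exponent = (int32_t)(high >> 20) - 1023;
  return exponent + _log_table[y];  // _log_table as filled in above
}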