--- old/src/cpu/x86/vm/assembler_x86.hpp 2017-04-18 10:46:05.178578292 -0700 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2017-04-18 10:46:05.054578777 -0700 @@ -2225,6 +2225,44 @@ _embedded_opmask_register_specifier = (*mask).encoding() & 0x7; } +// This is shared between the interpreter and C1, and needs to be in multiple +// places for each. The code to invoke the actual sampling methods needs +// to be provided by the user; thus, a macro. +#define HEAP_MONITORING(ma, thread, var_size_in_bytes, con_size_in_bytes, object, t1, t2, sample_invocation) \ +do { \ + { \ + SkipIfEqual skip_if(ma, &HeapMonitor, 0); \ + Label skip_sample; \ + Register thr = thread; \ + if (!thr->is_valid()) { \ + NOT_LP64(assert(t1 != noreg, \ + "Need temporary register for constants")); \ + thr = NOT_LP64(t1) LP64_ONLY(r15_thread); \ + NOT_LP64(ma -> get_thread(thr);) \ + } \ + /* Trigger heap monitoring event */ \ + Address bus(thr, \ + JavaThread::bytes_until_sample_offset()); \ + \ + if (var_size_in_bytes->is_valid()) { \ + ma -> NOT_LP64(subl) LP64_ONLY(subq)(bus, var_size_in_bytes); \ + } else { \ + int csib = (con_size_in_bytes); \ + assert(t2 != noreg, \ + "Need temporary register for constants"); \ + ma -> NOT_LP64(movl) LP64_ONLY(mov64)(t2, csib); \ + ma -> NOT_LP64(subl) LP64_ONLY(subq)(bus, t2); \ + } \ + \ + ma -> jcc(Assembler::positive, skip_sample); \ + \ + { \ + sample_invocation \ + } \ + ma -> bind(skip_sample); \ + } \ +} while(0) + }; #endif // CPU_X86_VM_ASSEMBLER_X86_HPP --- old/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2017-04-18 10:46:05.606576618 -0700 +++ new/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2017-04-18 10:46:05.486577088 -0700 @@ -23,6 +23,7 @@ */ #include "precompiled.hpp" +#include "assembler_x86.hpp" #include "c1/c1_MacroAssembler.hpp" #include "c1/c1_Runtime1.hpp" #include "classfile/systemDictionary.hpp" @@ -201,6 +202,10 @@ try_allocate(obj, noreg, object_size * BytesPerWord, t1, t2, slow_case); initialize_object(obj, klass, noreg, object_size * HeapWordSize, t1, t2, UseTLAB); + + HEAP_MONITORING(this, noreg, noreg, object_size * HeapWordSize, obj, + t1, t2, call(RuntimeAddress(Runtime1::entry_for( + Runtime1::heap_object_sample_id)));); } void C1_MacroAssembler::initialize_object(Register obj, Register klass, Register var_size_in_bytes, int con_size_in_bytes, Register t1, Register t2, bool is_tlab_allocated) { @@ -277,13 +282,19 @@ // clear rest of allocated space const Register len_zero = len; + // Initialize body destroys arr_size so remember it. + push(arr_size); initialize_body(obj, arr_size, header_size * BytesPerWord, len_zero); + pop(arr_size); if (CURRENT_ENV->dtrace_alloc_probes()) { assert(obj == rax, "must be"); call(RuntimeAddress(Runtime1::entry_for(Runtime1::dtrace_object_alloc_id))); } + HEAP_MONITORING(this, noreg, arr_size, 0, obj, t1, noreg, + call(RuntimeAddress(Runtime1::entry_for( + Runtime1::heap_object_sample_id)));); verify_oop(obj); } --- old/src/cpu/x86/vm/c1_Runtime1_x86.cpp 2017-04-18 10:46:05.998575086 -0700 +++ new/src/cpu/x86/vm/c1_Runtime1_x86.cpp 2017-04-18 10:46:05.878575555 -0700 @@ -414,7 +414,8 @@ } static OopMap* save_live_registers(StubAssembler* sasm, int num_rt_args, - bool save_fpu_registers = true) { + bool save_fpu_registers = true, + bool do_generate_oop_map = true) { __ block_comment("save_live_registers"); __ pusha(); // integer registers @@ -489,7 +490,9 @@ // FPU stack must be empty now __ verify_FPU(0, "save_live_registers"); - return generate_oop_map(sasm, num_rt_args, save_fpu_registers); + return do_generate_oop_map + ? 
generate_oop_map(sasm, num_rt_args, save_fpu_registers) + : NULL; } @@ -957,6 +960,24 @@ return oop_maps; } +static void heap_support_stub(StubAssembler* sasm, Register obj, + Register size_in_bytes, int con_size_in_bytes, + Register t1, Register t2) { + // Usually, when we invoke the sampling methods from within the client + // compiler, we do so in a stub. However, sometimes, we are already in a stub + // when we want to call these things, and stack trace gathering gets confused + // when you call a stub inside another stub. + HEAP_MONITORING(sasm, noreg, size_in_bytes, con_size_in_bytes, obj, t1, t2, \ + { \ + save_live_registers(sasm, 1, true, false); \ + __ NOT_LP64(push(rax)) LP64_ONLY(mov(c_rarg0, rax)); \ + __ call(RuntimeAddress( + CAST_FROM_FN_PTR(address, \ + HeapMonitoring::object_alloc_unsized))); \ + NOT_LP64(__ pop(rax)); \ + restore_live_registers(sasm); \ + }); +} OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) { @@ -1042,6 +1063,7 @@ __ initialize_object(obj, klass, obj_size, 0, t1, t2, /* is_tlab_allocated */ true); __ verify_oop(obj); + heap_support_stub(sasm, obj, obj_size, 0, t1, t2); __ pop(rbx); __ pop(rdi); __ ret(0); @@ -1170,8 +1192,12 @@ __ subptr(arr_size, t1); // body length __ addptr(t1, obj); // body start if (!ZeroTLAB) { + // Initialize body destroys arr_size so remember it. + __ push(arr_size); __ initialize_body(t1, arr_size, 0, t2); + __ pop(arr_size); } + heap_support_stub(sasm, obj, arr_size, 0, t1, t2); __ verify_oop(obj); __ ret(0); @@ -1504,6 +1530,22 @@ NOT_LP64(__ pop(rax)); restore_live_registers(sasm); + } + break; + + case heap_object_sample_id: + { // rax,: object + StubFrame f(sasm, "heap_object_sample", dont_gc_arguments); + // We can't gc here so skip the oopmap but make sure that all + // the live registers get saved + save_live_registers(sasm, 1); + + __ NOT_LP64(push(rax)) LP64_ONLY(mov(c_rarg0, rax)); + __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, + HeapMonitoring::object_alloc))); + NOT_LP64(__ pop(rax)); + + restore_live_registers(sasm); } break; --- old/src/cpu/x86/vm/templateTable_x86.cpp 2017-04-18 10:46:06.406573491 -0700 +++ new/src/cpu/x86/vm/templateTable_x86.cpp 2017-04-18 10:46:06.290573944 -0700 @@ -3927,6 +3927,7 @@ // The object is initialized before the header. If the object size is // zero, go directly to the header initialization. __ bind(initialize_object); + __ movq(rbx, rdx); // Save the size for HeapMonitoring __ decrement(rdx, sizeof(oopDesc)); __ jcc(Assembler::zero, initialize_header); @@ -3957,6 +3958,10 @@ // initialize object header only. __ bind(initialize_header); + + // Restore size for HeapMonitoring + __ movq(rdx, rbx); + if (UseBiasedLocking) { __ pop(rcx); // get saved klass back in the register. 
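As a reading aid for the paths above: the HEAP_MONITORING macro in assembler_x86.hpp and its C1/interpreter call sites all emit the same thread-local countdown. A minimal C++ sketch of that logic follows, with hypothetical helper and local names (the real work is generated assembly; `sample_invocation` stands for whatever the caller supplies, e.g. the heap_object_sample stub in C1 or a call_VM_leaf in the interpreter):

// Sketch only: source-level equivalent of what HEAP_MONITORING emits.
static inline void heap_monitoring_check(JavaThread* thread,
                                         size_t size_in_bytes,
                                         oopDesc* new_obj) {
  if (!HeapMonitor) {
    return;                                    // SkipIfEqual(ma, &HeapMonitor, 0)
  }
  size_t* bytes_until_sample = thread->bytes_until_sample();
  *bytes_until_sample -= size_in_bytes;        // subl/subq at bytes_until_sample_offset()
  if ((intptr_t) *bytes_until_sample <= 0) {   // jcc(Assembler::positive, skip_sample)
    // sample_invocation
    HeapMonitoring::object_alloc(new_obj, (intx) size_in_bytes);
  }
}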
__ movptr(rbx, Address(rcx, Klass::prototype_header_offset())); @@ -3977,10 +3982,20 @@ // Trigger dtrace event for fastpath __ push(atos); __ call_VM_leaf( - CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_object_alloc), rax); + CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_object_alloc), + rax, rdx); __ pop(atos); } + HEAP_MONITORING(_masm, noreg, rdx, 0, rax, rcx, noreg, \ + { \ + __ push(atos); \ + __ call_VM_leaf( \ + CAST_FROM_FN_PTR(address, HeapMonitoring::object_alloc), \ + rax, rdx); \ + __ pop(atos); \ + }); + __ jmp(done); } --- old/src/share/vm/c1/c1_Runtime1.cpp 2017-04-18 10:46:06.834571818 -0700 +++ new/src/share/vm/c1/c1_Runtime1.cpp 2017-04-18 10:46:06.730572224 -0700 @@ -202,6 +202,7 @@ switch (id) { // These stubs don't need to have an oopmap case dtrace_object_alloc_id: + case heap_object_sample_id: case g1_pre_barrier_slow_id: case g1_post_barrier_slow_id: case slow_subtype_check_id: --- old/src/share/vm/c1/c1_Runtime1.hpp 2017-04-18 10:46:07.222570300 -0700 +++ new/src/share/vm/c1/c1_Runtime1.hpp 2017-04-18 10:46:07.102570770 -0700 @@ -39,6 +39,7 @@ #define RUNTIME1_STUBS(stub, last_entry) \ stub(dtrace_object_alloc) \ + stub(heap_object_sample) \ stub(unwind_exception) \ stub(forward_exception) \ stub(throw_range_check_failed) /* throws ArrayIndexOutOfBoundsException */ \ --- old/src/share/vm/gc/shared/collectedHeap.inline.hpp 2017-04-18 10:46:07.610568784 -0700 +++ new/src/share/vm/gc/shared/collectedHeap.inline.hpp 2017-04-18 10:46:07.482569284 -0700 @@ -33,6 +33,7 @@ #include "oops/arrayOop.hpp" #include "oops/oop.inline.hpp" #include "prims/jvmtiExport.hpp" +#include "runtime/heapMonitoring.hpp" #include "runtime/sharedRuntime.hpp" #include "runtime/thread.inline.hpp" #include "services/lowMemoryDetector.hpp" @@ -81,6 +82,24 @@ SharedRuntime::dtrace_object_alloc(obj, size); } } + + if (HeapMonitor) { + // support for object alloc event (no-op most of the time) + if (klass() != NULL && klass()->name() != NULL) { + Thread *base_thread = Thread::current(); + if (base_thread->is_Java_thread()) { + JavaThread *thread = (JavaThread *) base_thread; + size_t *bytes_until_sample = thread->bytes_until_sample(); + size_t size_in_bytes = ((size_t) size) << LogHeapWordSize; + assert(size > 0, "positive size"); + if (*bytes_until_sample < size_in_bytes) { + HeapMonitoring::object_alloc_do_sample(thread, obj, size_in_bytes); + } else { + *bytes_until_sample -= size_in_bytes; + } + } + } + } } void CollectedHeap::post_allocation_setup_obj(KlassHandle klass, --- old/src/share/vm/opto/macro.cpp 2017-04-18 10:46:07.978567345 -0700 +++ new/src/share/vm/opto/macro.cpp 2017-04-18 10:46:07.870567767 -0700 @@ -1126,6 +1126,75 @@ } } +void PhaseMacroExpand::conditional_sample(Node *should_sample, + BoolTest::mask test, + float probability, + CallLeafNode *call, + Node *thread, + Node **fast_oop_ctrl, + Node **fast_oop_rawmem, + Node **fast_oop, + Node *size_in_bytes, + Node *in_node) { + Node* sample_cmp = new CmpXNode(should_sample, _igvn.MakeConX(0)); + transform_later(sample_cmp); + + Node *sample_bool = new BoolNode(sample_cmp, test); + transform_later(sample_bool); + + IfNode *sample_if = new IfNode(*fast_oop_ctrl, + sample_bool, + probability, + COUNT_UNKNOWN); + transform_later(sample_if); + + // Slow-path call to sample + Node *sample_true = new IfTrueNode(sample_if); + transform_later(sample_true); + + // Fast path to no sample + Node *sample_false = new IfFalseNode(sample_if); + transform_later(sample_false); + + // Create postdominators for both the control and data flow 
paths. + Node *sample_region = new RegionNode(3); + Node *sample_phi_rawmem = new PhiNode(sample_region, + Type::MEMORY, + TypeRawPtr::BOTTOM); + + sample_region->init_req(1, sample_false); + sample_phi_rawmem->init_req(1, *fast_oop_rawmem); + + // Invoke the sampling method on the slow path. + int size = TypeFunc::Parms + 2; + + call->init_req(TypeFunc::Parms+0, thread); + call->init_req(TypeFunc::Parms+1, *fast_oop); + call->init_req(TypeFunc::Parms+2, size_in_bytes); +#ifdef _LP64 + // The size is TypeX, so in a 64-bit JVM this a long, and we need + // // a second, dummy argument (an idiosyncracy of C2). + call->init_req(TypeFunc::Parms+3, C->top()); +#endif + call->init_req( TypeFunc::Control, sample_true); + call->init_req( TypeFunc::I_O , top()); // does no i/o + call->init_req( TypeFunc::Memory , *fast_oop_rawmem ); + call->init_req( TypeFunc::ReturnAdr, in_node->in(TypeFunc::ReturnAdr)); + call->init_req( TypeFunc::FramePtr, in_node->in(TypeFunc::FramePtr)); + transform_later(call); + Node *sample_oop_rawmem = new ProjNode(call, TypeFunc::Memory); + transform_later(sample_oop_rawmem); + + // Tie the slow path to the postdominating node. + sample_region->init_req(2, sample_true); + sample_phi_rawmem->init_req(2, sample_oop_rawmem); + transform_later(sample_region); + + *fast_oop_ctrl = sample_region; + *fast_oop_rawmem = sample_phi_rawmem; + transform_later(*fast_oop_rawmem); +} + bool PhaseMacroExpand::eliminate_allocate_node(AllocateNode *alloc) { // Don't do scalar replacement if the frame can be popped by JVMTI: // if reallocation fails during deoptimization we'll pop all @@ -1636,6 +1705,60 @@ transform_later(fast_oop_rawmem); } + if (HeapMonitor) { + // Inlined version of HeapMonitoring::object_alloc_base + // Get base of thread-local storage area + Node* thread = new ThreadLocalNode(); + transform_later(thread); + + ByteSize sample_offset = JavaThread::bytes_until_sample_offset(); + + // Do test to see if we should sample. + // Get bytes_until_sample from thread local storage. 
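// For reference, the node sequence built below is the C2 analogue of the
// interpreter/C1 fast path above; in source-level terms it computes
// (sketch with hypothetical local names, the actual work is ideal-graph
// construction):
//
//   size_t bus     = thread->_bytes_until_sample;            // make_load
//   size_t new_bus = bus - size_in_bytes;                     // SubXNode
//   thread->_bytes_until_sample = new_bus;                    // make_store
//   if ((intx) new_bus <= 0) {                                // conditional_sample:
//     HeapMonitoring::object_alloc_do_sample(thread,          //   BoolTest::le,
//                                            fast_oop,        //   PROB_UNLIKELY_MAG(4),
//                                            size_in_bytes);  //   CallLeafNode
//   }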
+ Node *bytes_until_sample = make_load(fast_oop_ctrl, + fast_oop_rawmem, + thread, + in_bytes(sample_offset), + TypeX_X, + TypeX_X->basic_type()); + + // new_bytes_until_sample = bytes_until_sample - size_in_bytes + Node *new_bytes_until_sample = + new SubXNode(bytes_until_sample, size_in_bytes); + transform_later(new_bytes_until_sample); + + // bytes_until_sample = new_bytes_until_sample; + fast_oop_rawmem = make_store(fast_oop_ctrl, + fast_oop_rawmem, + thread, + in_bytes(sample_offset), + new_bytes_until_sample, + TypeX_X->basic_type()); + + // Call to make if sampling succeeds + int size = TypeFunc::Parms + 2; + CallLeafNode *call = new CallLeafNode( + OptoRuntime::heap_object_alloc_Type(), + CAST_FROM_FN_PTR(address, + HeapMonitoring::object_alloc_do_sample), + "object_alloc_do_sample", + TypeRawPtr::BOTTOM); + + // if (new_bytes_until_sample < 0) + conditional_sample(new_bytes_until_sample, + BoolTest::le, + // Probability + // ~1/10000 + PROB_UNLIKELY_MAG(4), + call, + thread, + &fast_oop_ctrl, + &fast_oop_rawmem, + &fast_oop, + size_in_bytes, + alloc); + } + // Plug in the successful fast-path into the result merge point result_region ->init_req(fast_result_path, fast_oop_ctrl); result_phi_rawoop->init_req(fast_result_path, fast_oop); --- old/src/share/vm/opto/macro.hpp 2017-04-18 10:46:08.390565734 -0700 +++ new/src/share/vm/opto/macro.hpp 2017-04-18 10:46:08.274566187 -0700 @@ -66,6 +66,19 @@ Node* make_store(Node* ctl, Node* mem, Node* base, int offset, Node* value, BasicType bt); + // For Heap-related sampling - will generate code to invoke call() + // if the given sampling parameters are true. + void conditional_sample(Node *should_sample, + BoolTest::mask test, + float probability, + CallLeafNode *call, + Node *thread, + Node **fast_oop_ctrl, + Node **fast_oop_rawmem, + Node **fast_oop, + Node* size_in_bytes, + Node *in_node); + // projections extracted from a call node ProjNode *_fallthroughproj; ProjNode *_fallthroughcatchproj; --- old/src/share/vm/opto/runtime.cpp 2017-04-18 10:46:08.782564201 -0700 +++ new/src/share/vm/opto/runtime.cpp 2017-04-18 10:46:08.662564670 -0700 @@ -1558,6 +1558,28 @@ return TypeFunc::make(domain,range); } +const TypeFunc *OptoRuntime::heap_object_alloc_Type() { + // Keep it separate so that we don't have to worry if they change it. 
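// For reference, the domain assembled below mirrors the leaf entry that
// PhaseMacroExpand::conditional_sample() wires the CallLeafNode to, as
// declared in heapMonitoring.hpp later in this patch:
//
//   static void HeapMonitoring::object_alloc_do_sample(Thread* t,
//                                                      oopDesc* o,
//                                                      intx size_in_bytes);
//
//   TypeFunc::Parms + 0 : Thread*            -> TypeRawPtr::BOTTOM
//   TypeFunc::Parms + 1 : new object (oop)   -> TypeInstPtr::NOTNULL
//   TypeFunc::Parms + 2 : size in bytes      -> TypeX_X
//   TypeFunc::Parms + 3 : LP64 only, dummy   -> Type::HALF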
+ // create input type (domain) + const Type **fields = TypeTuple::fields(3 LP64_ONLY( + 1)); + + // Thread-local storage + fields[TypeFunc::Parms+0] = TypeRawPtr::BOTTOM; + // oop; newly allocated object + fields[TypeFunc::Parms+1] = TypeInstPtr::NOTNULL; + // byte size of object + fields[TypeFunc::Parms+2] = TypeX_X; + // other half of long length + LP64_ONLY(fields[TypeFunc::Parms+3] = Type::HALF); + + const TypeTuple *domain = TypeTuple::make(TypeFunc::Parms+4, fields); + // create result type (range) + fields = TypeTuple::fields(0); + + const TypeTuple *range = TypeTuple::make(TypeFunc::Parms+0, fields); + + return TypeFunc::make(domain, range); +} JRT_ENTRY_NO_ASYNC(void, OptoRuntime::register_finalizer(oopDesc* obj, JavaThread* thread)) assert(obj->is_oop(), "must be a valid oop"); --- old/src/share/vm/opto/runtime.hpp 2017-04-18 10:46:09.182562637 -0700 +++ new/src/share/vm/opto/runtime.hpp 2017-04-18 10:46:09.062563106 -0700 @@ -329,6 +329,9 @@ static const TypeFunc* dtrace_method_entry_exit_Type(); static const TypeFunc* dtrace_object_alloc_Type(); + // Heap sampling support + static const TypeFunc* heap_object_alloc_Type(); + private: static NamedCounter * volatile _named_counters; --- old/src/share/vm/prims/forte.cpp 2017-04-18 10:46:09.558561167 -0700 +++ new/src/share/vm/prims/forte.cpp 2017-04-18 10:46:09.434561652 -0700 @@ -35,19 +35,6 @@ #include "runtime/vframe.hpp" #include "runtime/vframeArray.hpp" -// call frame copied from old .h file and renamed -typedef struct { - jint lineno; // line number in the source file - jmethodID method_id; // method executed in this frame -} ASGCT_CallFrame; - -// call trace copied from old .h file and renamed -typedef struct { - JNIEnv *env_id; // Env where trace was recorded - jint num_frames; // number of frames in this trace - ASGCT_CallFrame *frames; // frames -} ASGCT_CallTrace; - // These name match the names reported by the forte quality kit enum { ticks_no_Java_frame = 0, --- old/src/share/vm/prims/forte.hpp 2017-04-18 10:46:09.942559665 -0700 +++ new/src/share/vm/prims/forte.hpp 2017-04-18 10:46:09.830560104 -0700 @@ -34,4 +34,20 @@ // register internal VM stub }; +// call frame copied from old .h file and renamed +typedef struct { + jint lineno; // line number in the source file + jmethodID method_id; // method executed in this frame +} ASGCT_CallFrame; + +// call trace copied from old .h file and renamed +typedef struct { + JNIEnv *env_id; // Env where trace was recorded + jint num_frames; // number of frames in this trace + ASGCT_CallFrame *frames; // frames +} ASGCT_CallTrace; + +extern "C" +void AsyncGetCallTrace(ASGCT_CallTrace *trace, jint depth, void* ucontext); + #endif // SHARE_VM_PRIMS_FORTE_HPP --- old/src/share/vm/prims/jvmti.xml 2017-04-18 10:46:10.310558227 -0700 +++ new/src/share/vm/prims/jvmti.xml 2017-04-18 10:46:10.202558649 -0700 @@ -11466,6 +11466,81 @@ + + Start Heap Sampling + + Start the heap sampler in the JVM. The function provides, via its argument, the sampling + rate requested and will fill internal data structures with heap allocation samples. The + samples are obtained via the function. + + new + + + + + + + The monitoring period used for sampling. The sampler will use a statistical approach to + provide in average sampling every allocated bytes. + + + + + + is less than zero. + + + + + + + Get Live Traces + + + + + TODO(jcbeyler): Not sure if we should declare the type as non void and make a JVMTI type here. + It would be ASGCT_CallTrace and then would include a pointer to ASGCT_CallFrame. 
Would we refactor that code? + + + + + + The size of the object allocation. + + + + + + The thread id number. + + + + + Get Live Heap Sampled traces. The fields of the + structure are filled in with details of the specified sampled allocation. + + new + + + + + jvmtiStackTraceData + + The stack trace array to be filled. + + + + + + On output, size of the array returned via the first parameter. + + + + + + + --- old/src/share/vm/prims/jvmtiEnv.cpp 2017-04-18 10:46:10.962555678 -0700 +++ new/src/share/vm/prims/jvmtiEnv.cpp 2017-04-18 10:46:10.842556147 -0700 @@ -46,6 +46,7 @@ #include "prims/jvmtiCodeBlobEvents.hpp" #include "prims/jvmtiExtensions.hpp" #include "prims/jvmtiGetLoadedClasses.hpp" +#include "prims/jvmtiHeapTransition.hpp" #include "prims/jvmtiImpl.hpp" #include "prims/jvmtiManageCapabilities.hpp" #include "prims/jvmtiRawMonitor.hpp" @@ -1935,6 +1936,30 @@ return JVMTI_ERROR_NONE; } /* end IterateOverInstancesOfClass */ +// Start the sampler. +jvmtiError +JvmtiEnv::StartHeapSampling(jint monitoring_period) { + if (monitoring_period < 0) { + return JVMTI_ERROR_ILLEGAL_ARGUMENT; + } + + HeapMonitor = true; + HeapThreadTransition htt(Thread::current()); + HeapMonitoring::initialize_profiling(monitoring_period); + return JVMTI_ERROR_NONE; +} /* end StartHeapSampling */ + +// Get the currently live sampled allocations. +jvmtiError +JvmtiEnv::GetLiveTraces(jvmtiStackTraceData **stack_traces, jint *num_traces) { + HeapThreadTransition htt(Thread::current()); + if (stack_traces == NULL || num_traces == NULL) { + return JVMTI_ERROR_ILLEGAL_ARGUMENT; + } + + HeapMonitoring::get_live_traces(stack_traces, num_traces); + return JVMTI_ERROR_NONE; +} /* end GetLiveTraces */ // // Local Variable functions --- old/src/share/vm/runtime/arguments.cpp 2017-04-18 10:46:11.414553910 -0700 +++ new/src/share/vm/runtime/arguments.cpp 2017-04-18 10:46:11.286554411 -0700 @@ -4620,6 +4620,10 @@ } #endif +#if !(defined(X86) || defined(PPC64)) + HeapMonitor = false; +#endif + return JNI_OK; } --- old/src/share/vm/runtime/globals.hpp 2017-04-18 10:46:11.882552081 -0700 +++ new/src/share/vm/runtime/globals.hpp 2017-04-18 10:46:11.750552596 -0700 @@ -4082,7 +4082,13 @@ diagnostic(bool, CompilerDirectivesPrint, false, \ "Print compiler directives on installation.") \ diagnostic(int, CompilerDirectivesLimit, 50, \ - "Limit on number of compiler directives.") + "Limit on number of compiler directives.") \ + product(bool, HeapMonitor, false, \ + "Enable heap monitoring.") \ + product(int, HeapMonitorRate, (1<<19), \ + "Heap monitoring rate.") \ + product(uintx, MaxHeapTraces, 200, \ + "Maximum number of traces kept by the heap monitoring.") \ /* --- old/src/share/vm/runtime/init.cpp 2017-04-18 10:46:12.326550344 -0700 +++ new/src/share/vm/runtime/init.cpp 2017-04-18 10:46:12.210550798 -0700 @@ -32,6 +32,8 @@ #include "prims/methodHandles.hpp" #include "runtime/globals.hpp" #include "runtime/handles.inline.hpp" +#include "runtime/heapMonitoring.hpp" +#include "prims/jvmtiHeapTransition.hpp" #include "runtime/icache.hpp" #include "runtime/init.hpp" #include "runtime/safepoint.hpp" @@ -155,6 +157,11 @@ CommandLineFlags::printFlags(tty, false, PrintFlagsRanges); } + if (HeapMonitor) { + fprintf(stderr, "Starting sampling with rate %d\n", HeapMonitorRate); + HeapThreadTransition htt(Thread::current()); + HeapMonitoring::initialize_profiling(HeapMonitorRate); + } return JNI_OK; } --- old/src/share/vm/runtime/thread.cpp 2017-04-18 10:46:12.726548780 -0700 +++ new/src/share/vm/runtime/thread.cpp 2017-04-18 
10:46:12.586549328 -0700 @@ -1480,6 +1480,7 @@ _do_not_unlock_if_synchronized = false; _cached_monitor_info = NULL; _parker = Parker::Allocate(this); + _bytes_until_sample = 0; #ifndef PRODUCT _jmp_ring_index = 0; --- old/src/share/vm/runtime/thread.hpp 2017-04-18 10:46:13.186546982 -0700 +++ new/src/share/vm/runtime/thread.hpp 2017-04-18 10:46:13.070547435 -0700 @@ -815,6 +815,9 @@ JavaFrameAnchor _anchor; // Encapsulation of current java frame and it state + size_t _bytes_until_sample; // Thread local counter to determine when to sample + // allocations. + ThreadFunction _entry_point; JNIEnv _jni_environment; @@ -1102,6 +1105,9 @@ address last_Java_pc(void) { return _anchor.last_Java_pc(); } + // Bytes until next heap sample. + size_t* bytes_until_sample() { return &_bytes_until_sample; } + // Safepoint support #if !(defined(PPC64) || defined(AARCH64)) JavaThreadState thread_state() const { return _thread_state; } @@ -1554,6 +1560,7 @@ static ByteSize frame_anchor_offset() { return byte_offset_of(JavaThread, _anchor); } + static ByteSize bytes_until_sample_offset() { return byte_offset_of(JavaThread, _bytes_until_sample); } static ByteSize callee_target_offset() { return byte_offset_of(JavaThread, _callee_target); } static ByteSize vm_result_offset() { return byte_offset_of(JavaThread, _vm_result); } static ByteSize vm_result_2_offset() { return byte_offset_of(JavaThread, _vm_result_2); } --- /dev/null 2017-04-17 13:03:13.666114673 -0700 +++ new/src/share/vm/prims/jvmtiHeapTransition.hpp 2017-04-18 10:46:13.470545871 -0700 @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2017, Google and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_VM_PRIMS_JVMTIHEAPSAMPLING_HPP +#define SHARE_VM_PRIMS_JVMTIHEAPSAMPLING_HPP + +// TODO(jcbeyler): is there a better/standard JVM way of doing this? +// A RAII class that handles transitions from the agent into the VM. +class HeapThreadTransition : StackObj { + private: + JavaThreadState _saved_state; + JavaThread *_jthread; + + public: + // Transitions this thread from the agent (thread_in_native) to the VM. 
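Stepping out of the VM for a moment, here is a hypothetical sketch of how an agent would drive the two JVMTI entry points added above. It assumes the generated jvmti.h exposes StartHeapSampling/GetLiveTraces with the signatures used by JvmtiEnv, and a jvmtiStackTraceData carrying the thread_id/size/trace fields that deep_copy fills in below; the type of the trace field and ownership of the returned array are still open TODOs in this patch:

#include <jvmti.h>
#include <stdio.h>

static jvmtiEnv* jvmti = NULL;

JNIEXPORT jint JNICALL Agent_OnLoad(JavaVM* vm, char* options, void* reserved) {
  vm->GetEnv((void**) &jvmti, JVMTI_VERSION_1_2);
  // Sample, on average, once every 512 KiB of allocated bytes.
  jvmti->StartHeapSampling(512 * 1024);
  return JNI_OK;
}

// Called later from agent code, e.g. on a timer or a VM-death hook.
static void dump_live_samples() {
  jvmtiStackTraceData* traces = NULL;
  jint num_traces = 0;
  if (jvmti->GetLiveTraces(&traces, &num_traces) != JVMTI_ERROR_NONE) {
    return;
  }
  for (jint i = 0; i < num_traces; i++) {
    printf("sample %d: %ld bytes on thread %ld\n",
           i, (long) traces[i].size, (long) traces[i].thread_id);
    // traces[i].trace points at the sampled stack (ASGCT_CallTrace per the
    // TODO in jvmti.xml); how that type is surfaced to agents is unresolved.
  }
  // Deallocation of 'traces' is not specified by this patch (the VM
  // allocates it with os::malloc).
}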
+ HeapThreadTransition(Thread *thread) { + if (thread->is_Java_thread()) { + _jthread = (JavaThread *)thread; + _saved_state = _jthread->thread_state(); + if (_saved_state == _thread_in_native) { + ThreadStateTransition::transition_from_native(_jthread, _thread_in_vm); + } else { + ThreadStateTransition::transition(_jthread, + _saved_state, + _thread_in_vm); + } + } else { + _jthread = NULL; + _saved_state = _thread_new; + } + } + + // Transitions this thread back to the agent from the VM. + ~HeapThreadTransition() { + if (_jthread != NULL) { + ThreadStateTransition::transition(_jthread, _thread_in_vm, _saved_state); + } + } +}; + +#endif // SHARE_VM_PRIMS_JVMTIHEAPSAMPLING_HPP --- /dev/null 2017-04-17 13:03:13.666114673 -0700 +++ new/src/share/vm/runtime/heapMonitoring.cpp 2017-04-18 10:46:13.818544511 -0700 @@ -0,0 +1,391 @@ +/* + * Copyright (c) 2017, Google and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "prims/forte.hpp" +#include "runtime/heapMonitoring.hpp" + +const int kMaxStackDepth = 64; + +// The resulting data, as they appear to the client. +// TODO(jcbeyler): should we make this into a JVMTI structure? +struct StackTraceData { + ASGCT_CallTrace *trace; + intx byte_size; + jlong thread_id; + + StackTraceData(ASGCT_CallTrace *t, intx size, jlong tid) : trace(t), + byte_size(size), thread_id(tid) {} +}; + +// RAII class that acquires / releases lock +class MuxLocker { + private: + volatile intptr_t *_lock; + const char *_name; + public: + MuxLocker(volatile intptr_t *lock, const char *name) : + _lock(lock), + _name(name) { + Thread::muxAcquire(lock, name); + } + ~MuxLocker() { + Thread::muxRelease(_lock); + } +}; + +// Each object that we profile is stored as trace with the thread_id. +class StackTraceStorage { + public: + // The function that gets called to add a trace to the list of + // traces we are maintaining. trace is the stacktrace, and thread + // is the thread that did the allocation. + void add_trace(ASGCT_CallTrace *trace, intx byte_size, Thread *thread); + + // The function that gets called by the client to retrieve the list + // of stack traces. Passes (by reference) a pointer to a list of + // traces, and a number of traces, both of which will get mutated by + // the function being called. + void get_all_stack_traces(jvmtiStackTraceData **traces, jint *num_traces); + + ~StackTraceStorage(); + StackTraceStorage(); + + // The global storage. Not a global static because + // StackTraceStorage isn't available at module-loading time. 
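The storage declared below is a fixed-size ring of owned pointers: writing into an occupied slot frees the previous sample, so at most MaxHeapTraces live samples are retained and older ones are silently dropped (the "temporary fix" the comments mention). A standalone sketch of that ownership policy, with a hypothetical SampleRing name:

#include <cstddef>

template <typename T>
class SampleRing {
 public:
  explicit SampleRing(size_t capacity)
      : _slots(new T*[capacity]()), _capacity(capacity), _pos(0) {}
  ~SampleRing() {
    for (size_t i = 0; i < _capacity; i++) {
      delete _slots[i];
    }
    delete[] _slots;
  }
  // Takes ownership of 'sample', mirroring StackTraceStorage::add_trace().
  void add(T* sample) {
    delete _slots[_pos];                     // overwrite: drop the oldest entry
    _slots[_pos] = sample;
    _pos = (_pos + 1) % _capacity;
  }
 private:
  T** _slots;
  size_t _capacity;
  size_t _pos;
};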
+ static StackTraceStorage *storage() { + static StackTraceStorage storage; + return &storage; + } + + // Protects the traces currently sampled (below). + volatile intptr_t _allocated_traces_lock[1]; + + // The current allocated traces. A fixed-size ring buffer. + // This is a temporay fix until the GC handlers are in place. Then this + // becomes a growable array that is emptied as elements get garbage + // collected. + StackTraceData** _allocated_traces; + + // Maximum size of the allocation. + size_t _allocated_traces_size; + + // TODO(jcbeyler): remove extra code that is here for statistics... + size_t _allocated_count; + + // The current position in _allocated_traces (above); + // This is a temporay fix until the GC handlers are in place. Then this + // becomes a growable array that is emptied as elements get garbage + // collected. + int _allocated_traces_pos; + + private: + // Support functions and classes for copying data to the external + // world. + class StackTraceDataCopier { + public: + virtual int size() const = 0; + virtual StackTraceData *get(int i) const = 0; + }; + + class LiveStackTraceDataCopier : public StackTraceDataCopier { + public: + LiveStackTraceDataCopier(StackTraceData **data, int size) : + _data(data), _size(size) {} + int size() const { return _size; } + StackTraceData *get(int i) const { return _data[i]; } + + private: + StackTraceData **_data; + int _size; + }; + + // Copies jvmtiStackTraceData from to jvmtiStackTraceData to + bool deep_copy(jvmtiStackTraceData *to, StackTraceData *from); + + // Creates a deep copy of the list of StackTraceData + void copy_stack_traces(const StackTraceDataCopier &copier, + jvmtiStackTraceData **traces, + jint *num_traces); +}; + +// Statics for Sampler +double HeapMonitoring::_log_table[1 << kFastlogNumBits]; + +bool HeapMonitoring::_initialized = false; + +jint HeapMonitoring::_monitoring_period; + +// Cheap random number generator +uint64_t HeapMonitoring::_rnd; + +StackTraceStorage::StackTraceStorage() : + _allocated_traces(new StackTraceData*[MaxHeapTraces]), + _allocated_traces_size(MaxHeapTraces), + _allocated_traces_pos(0), + _allocated_count(0) { + memset(_allocated_traces, 0, sizeof(*_allocated_traces) * MaxHeapTraces); + _allocated_traces_lock[0] = 0; +} + +StackTraceStorage::~StackTraceStorage() { + delete[] _allocated_traces; +} + +void StackTraceStorage::add_trace(ASGCT_CallTrace *trace, + intx byte_size, + Thread *thread) { + StackTraceData *new_data = + new StackTraceData(trace, byte_size, SharedRuntime::get_java_tid(thread)); + + MuxLocker mu(_allocated_traces_lock, "StackTraceStorage::add_trace"); + StackTraceData *current_allocated_trace = + _allocated_traces[_allocated_traces_pos]; + if (current_allocated_trace != NULL) { + delete current_allocated_trace; + } + _allocated_traces[_allocated_traces_pos] = new_data; + _allocated_traces_pos = (_allocated_traces_pos + 1) % _allocated_traces_size; + _allocated_count++; +} + +bool StackTraceStorage::deep_copy(jvmtiStackTraceData *to, + StackTraceData *from) { + to->thread_id = from->thread_id; + to->size = from->byte_size; + + ASGCT_CallTrace *dest = reinterpret_cast( + os::malloc(sizeof(ASGCT_CallTrace), mtInternal)); + if (dest == NULL) { + return false; + } + to->trace = dest; + + const ASGCT_CallTrace *src = + reinterpret_cast(from->trace); + *dest = *src; + + dest->frames = reinterpret_cast( + os::malloc(sizeof(ASGCT_CallFrame) * kMaxStackDepth, mtInternal)); + if (dest->frames == NULL) { + return false; + } + memcpy(dest->frames, + src->frames, + 
sizeof(ASGCT_CallFrame) * kMaxStackDepth); + return true; +} + +// Called by the outside world; returns a copy of the stack traces +// (because we could be replacing them as the user handles them). +// The array is secretly null-terminated (to make it easier to reclaim). +void StackTraceStorage::get_all_stack_traces(jvmtiStackTraceData **traces, + jint *num_traces) { + LiveStackTraceDataCopier copier(_allocated_traces, _allocated_traces_size); + copy_stack_traces(copier, traces, num_traces); +} + +void StackTraceStorage::copy_stack_traces(const StackTraceDataCopier &copier, + jvmtiStackTraceData **traces, + jint *num_traces) { + MuxLocker mu(_allocated_traces_lock, "StackTraceStorage::copy_stack_traces"); + int len = copier.size(); + // TODO(jcbeyler): +2 -> +1 for len :remove this extra code handling the extra trace for + // counting, it is for statistics. + // Create a new array to store the StackTraceData objects. + jvmtiStackTraceData *t = reinterpret_cast( + os::malloc((len + 2) * sizeof(*t), mtInternal)); + if (t == NULL) { + *traces = NULL; + *num_traces = 0; + return; + } + // TODO(jcbeyler): +2 -> +1 for len :remove this extra code handling the extra trace for + // counting, it is for statistics. + // +1 to have a NULL at the end of the array. + memset(t, 0, (len + 2) * sizeof(*t)); + + // TODO(jcbeyler): remove this extra code handling the extra trace for + // counting, it is for statistics. + jvmtiStackTraceData *to = &t[0]; + to->size = _allocated_count; + + // Copy the StackTraceData objects into the new array. + int trace_count = 1; + for (int i = 0; i < len; i++) { + StackTraceData *stack_trace = copier.get(i); + if (stack_trace != NULL && stack_trace->trace != NULL) { + jvmtiStackTraceData *to = &t[trace_count]; + if (!deep_copy(to, stack_trace)) { + continue; + } + trace_count++; + } + } + + *traces = t; + *num_traces = trace_count; +} + +void HeapMonitoring::get_live_traces(jvmtiStackTraceData **traces, + jint *num_traces) { + StackTraceStorage::storage()->get_all_stack_traces(traces, num_traces); +} + +// TODO(jcbeyler): find out if the algorithm for determining the sampling can be +// upstreamed. +void HeapMonitoring::initialize_profiling(jint monitoring_period) { + _monitoring_period = monitoring_period; + + // Populate the lookup table for fast_log2. + // This approximates the log2 curve with a step function. + // Steps have height equal to log2 of the mid-point of the step. + for (int i = 0; i < (1 << kFastlogNumBits); i++) { + double half_way = static_cast(i + 0.5); + _log_table[i] = (log(1.0 + half_way / (1 << kFastlogNumBits)) / log(2.0)); + } + + JavaThread *t = reinterpret_cast(Thread::current()); + _rnd = static_cast(reinterpret_cast(t)); + if (_rnd == 0) { + _rnd = 1; + } + for (int i = 0; i < 20; i++) { + _rnd = next_random(_rnd); + } + _initialized = true; +} + +// Generates a geometric variable with the specified mean (512K by default). +// This is done by generating a random number between 0 and 1 and applying +// the inverse cumulative distribution function for an exponential. 
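initialize_profiling() above fills a (1 << kFastlogNumBits)-entry table that fast_log2() (declared in heapMonitoring.hpp at the end of this patch) indexes with the top mantissa bits of its argument. A self-contained sketch of the same step-function approximation, useful for eyeballing its accuracy outside the VM; all names here are local to the sketch:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

static const int kFastlogNumBits = 10;
static const int kFastlogMask = (1 << kFastlogNumBits) - 1;
static double log_table[1 << kFastlogNumBits];

// Same table construction as initialize_profiling() above.
static void build_table() {
  for (int i = 0; i < (1 << kFastlogNumBits); i++) {
    double half_way = static_cast<double>(i) + 0.5;
    log_table[i] = std::log(1.0 + half_way / (1 << kFastlogNumBits)) / std::log(2.0);
  }
}

// Same bit extraction as HeapMonitoring::fast_log2() in heapMonitoring.hpp.
static double fast_log2(double d) {
  uint64_t x = 0;
  std::memcpy(&x, &d, sizeof(uint64_t));
  const uint32_t x_high = (uint32_t)(x >> 32);
  const uint32_t y = (x_high >> (20 - kFastlogNumBits)) & kFastlogMask;
  const int32_t exponent = (int32_t)((x_high >> 20) & 0x7FF) - 1023;
  return exponent + log_table[y];
}

int main() {
  build_table();
  for (double v : {1.5, 42.0, 1e6, 3.14159e-3}) {
    std::printf("fast_log2(%g) = %.6f   log2 = %.6f\n",
                v, fast_log2(v), std::log2(v));
  }
  return 0;
}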
+// Specifically: Let m be the inverse of the sample period, then +// the probability distribution function is m*exp(-mx) so the CDF is +// p = 1 - exp(-mx), so +// q = 1 - p = exp(-mx) +// log_e(q) = -mx +// -log_e(q)/m = x +// log_2(q) * (-log_e(2) * 1/m) = x +// In the code, q is actually in the range 1 to 2**26, hence the -26 below +void HeapMonitoring::pick_next_sample(JavaThread *t) { + _rnd = next_random(_rnd); + // Take the top 26 bits as the random number + // (This plus a 1<<58 sampling bound gives a max possible step of + // 5194297183973780480 bytes. In this case, + // for sample_parameter = 1<<19, max possible step is + // 9448372 bytes (24 bits). + const uint64_t prng_mod_power = 48; // Number of bits in prng + // The uint32_t cast is to prevent a (hard-to-reproduce) NAN + // under piii debug for some binaries. + double q = static_cast(_rnd >> (prng_mod_power - 26)) + 1.0; + // Put the computed p-value through the CDF of a geometric. + // For faster performance (save ~1/20th exec time), replace + // min(0.0, FastLog2(q) - 26) by (Fastlog2(q) - 26.000705) + // The value 26.000705 is used rather than 26 to compensate + // for inaccuracies in FastLog2 which otherwise result in a + // negative answer. + size_t *bytes_until_sample = t->bytes_until_sample(); + double log_val = (fast_log2(q) - 26); + *bytes_until_sample = static_cast( + (0.0 < log_val ? 0.0 : log_val) * (-log(2.0) * (_monitoring_period)) + 1); +} + +// Called from the interpreter and C1 +void HeapMonitoring::object_alloc_unsized(oopDesc* o) { + JavaThread *thread = reinterpret_cast(Thread::current()); + assert(o->size() << LogHeapWordSize == byte_size, + "Object size is incorrect."); + object_alloc_do_sample(thread, o, o->size() << LogHeapWordSize); +} + +void HeapMonitoring::object_alloc(oopDesc* o, intx byte_size) { + JavaThread *thread = reinterpret_cast(Thread::current()); + object_alloc_do_sample(thread, o, byte_size); +} + +// Called directly by C2 +void HeapMonitoring::object_alloc_do_sample(Thread *t, oopDesc *o, intx byte_size) { +#if defined(X86) || defined(PPC) + JavaThread *thread = reinterpret_cast(t); + size_t *bytes_until_sample = thread->bytes_until_sample(); + if (_initialized) { + // TODO(jcbeyler): what about this? + assert(t->is_Java_thread(), "non-Java thread passed to do_sample"); + JavaThread *thread = reinterpret_cast(t); + + pick_next_sample(thread); + + ASGCT_CallTrace *trace = NEW_C_HEAP_OBJ(ASGCT_CallTrace, mtInternal); + if (trace == NULL) { + return; + } + + ASGCT_CallFrame *frames = + NEW_C_HEAP_ARRAY(ASGCT_CallFrame, kMaxStackDepth, mtInternal); + if (frames == NULL) { + FreeHeap(reinterpret_cast(trace)); + return; + } + + trace->frames = frames; + trace->env_id = (JavaThread::current())->jni_environment(); + + ucontext_t uc; + if (!getcontext(&uc)) { +#if defined(IA32) + // On Linux/x86 (but not x64), AsyncGetCallTrace/JVM reads the + // stack pointer from the REG_UESP field (as opposed to the + // REG_ESP field). The kernel sets both the REG_UESP and REG_ESP + // fields to the correct stack pointer for the ucontexts passed to + // signal handlers. However, getcontext() sets only REG_ESP, + // leaving REG_UESP uninitialized. Since there is no way to + // distinguish where a ucontext_t came from, copy from REG_ESP to + // REG_UESP so that AGCT will read the right stack pointer. + uc.uc_mcontext.gregs[REG_UESP] = uc.uc_mcontext.gregs[REG_ESP]; +#endif + + AsyncGetCallTrace(trace, kMaxStackDepth, &uc); + + if (trace->num_frames > 0) { + // Success! 
+ StackTraceStorage::storage()->add_trace(trace, byte_size, thread); + return; + } + } + // Failure! + FREE_C_HEAP_ARRAY(ASGCT_CallFrame, trace->frames); + FreeHeap(reinterpret_cast(trace)); + return; + } else { + // There is something like 64K worth of allocation before the VM + // initializes. This is just in the interests of not slowing down + // startup. + assert(t->is_Java_thread(), "non-Java thread passed to do_sample"); + JavaThread *thread = reinterpret_cast(t); + *(thread->bytes_until_sample()) = 65536; + } +#else + Unimplemented(); +#endif +} --- /dev/null 2017-04-17 13:03:13.666114673 -0700 +++ new/src/share/vm/runtime/heapMonitoring.hpp 2017-04-18 10:46:14.166543150 -0700 @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017, Google and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_VM_RUNTIME_HEAPMONITORING_HPP +#define SHARE_VM_RUNTIME_HEAPMONITORING_HPP + +#include "gc/shared/referenceProcessor.hpp" +#include "runtime/sharedRuntime.hpp" + +// Support class for sampling heap allocations across the VM. +class HeapMonitoring { + private: + // Cheap random number generator + static uint64_t _rnd; + static bool _initialized; + static jint _monitoring_period; + + // Statics for the fast log + static const int kFastlogNumBits = 10; + static const int kFastlogMask = (1 << kFastlogNumBits) - 1; + static double _log_table[1<(0)) << prng_mod_power); + return (prng_mult * rnd + prng_add) & prng_mod_mask; + } + + // TODO(jcbeyler): is this algorithm acceptable in open source? + // Adapted from //util/math/fastmath.[h|cc] by Noam Shazeer + // This mimics the VeryFastLog2 code in those files + static inline double fast_log2(const double & d) { + assert(d>0, "bad value passed to assert"); + uint64_t x = 0; + memcpy(&x, &d, sizeof(uint64_t)); + const uint32_t x_high = x >> 32; + const uint32_t y = x_high >> (20 - kFastlogNumBits) & kFastlogMask; + const int32_t exponent = ((x_high >> 20) & 0x7FF) - 1023; + return exponent + _log_table[y]; + } + + public: + static void get_live_traces(jvmtiStackTraceData** stack_traces, jint* num_traces); + static void initialize_profiling(jint monitoring_period); + + // Called when o is allocated, called by interpreter and C1. + static void object_alloc_unsized(oopDesc* o); + static void object_alloc(oopDesc* o, intx byte_size); + + // Called when o is allocated from C2 directly, + // we know the thread, and we have done the sampling. + static void object_alloc_do_sample(Thread *t, oopDesc *o, intx size_in_bytes); +}; + +#endif // SHARE_VM_RUNTIME_HEAPMONITORING_HPP
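Finally, a standalone illustration of the interval math that pick_next_sample() in heapMonitoring.cpp implements: intervals are drawn from an exponential distribution whose mean is the monitoring period, so allocations are sampled on average once per HeapMonitorRate bytes regardless of individual object sizes. This sketch uses std::mt19937_64 in place of the VM's internal PRNG and is not part of the patch:

#include <cmath>
#include <cstdio>
#include <random>

int main() {
  const double sample_period = 1 << 19;   // default HeapMonitorRate (512 KiB)
  std::mt19937_64 rng(42);                // stand-in for HeapMonitoring::_rnd
  std::uniform_real_distribution<double> unif(0.0, 1.0);

  double sum = 0.0;
  const int draws = 1000000;
  for (int i = 0; i < draws; i++) {
    // Inverse CDF of the exponential: q = exp(-x / period)  =>  x = -period * ln(q)
    double q = 1.0 - unif(rng);           // uniform in (0, 1]
    sum += -std::log(q) * sample_period;
  }
  std::printf("average interval: %.0f bytes (expected ~%.0f)\n",
              sum / draws, sample_period);
  return 0;
}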