
src/hotspot/cpu/x86/vtableStubs_x86_32.cpp

rev 51490 : 8207343: Automate vtable/itable stub size calculation
Reviewed-by: kvn

@@ -1,7 +1,7 @@
 /*
- * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.

@@ -53,29 +53,38 @@
 //   rax - inline cache register (is interface for itable stub)
 //   rbx - method (used when calling out to interpreter)
 // Available now, but may become callee-save at some point:
 //   rsi, rdi
 // Note that rax and rdx are also used for return values.
-//
+
 VtableStub* VtableStubs::create_vtable_stub(int vtable_index) {
-  const int i486_code_length = VtableStub::pd_code_size_limit(true);
-  VtableStub* s = new(i486_code_length) VtableStub(true, vtable_index);
+  // Read "A word on VtableStub sizing" in share/code/vtableStubs.hpp for details on stub sizing.
+  const int stub_code_length = code_size_limit(true);
+  VtableStub* s = new(stub_code_length) VtableStub(true, vtable_index);
   // Can be NULL if there is no free space in the code cache.
   if (s == NULL) {
     return NULL;
   }
 
+  // Count unused bytes in instruction sequences of variable size.
+  // We add them to the computed buffer size in order to avoid
+  // overflow in subsequently generated stubs.
+  address   start_pc;
+  int       slop_bytes = 0;
+  int       slop_delta = 0;
+  // No variance was detected in vtable stub sizes. Setting slop32 == 0 will reveal any deviation from this observation.
+  const int slop32     = 0;
+
   ResourceMark rm;
-  CodeBuffer cb(s->entry_point(), i486_code_length);
+  CodeBuffer      cb(s->entry_point(), stub_code_length);
   MacroAssembler* masm = new MacroAssembler(&cb);
 
-#ifndef PRODUCT
-
+#if (!defined(PRODUCT) && defined(COMPILER2))
   if (CountCompiledCalls) {
     __ incrementl(ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
   }
-#endif /* PRODUCT */
+#endif
 
   // get receiver (need to skip return address on top of stack)
   assert(VtableStub::receiver_location() == rcx->as_VMReg(), "receiver expected in rcx");
 
   // get receiver klass

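Note on the pattern introduced above: each variable-length instruction sequence is bracketed with __ pc() samples, checked against a worst-case byte estimate, and the unused bytes are accumulated in slop_bytes for the final bookkeeping() call. A minimal self-contained sketch of that pattern follows; ToyAssembler and its emit() are hypothetical stand-ins for MacroAssembler, not HotSpot API.

#include <cassert>
#include <cstdio>

// Hypothetical stand-in for MacroAssembler: writes filler bytes into a
// buffer and exposes the current emission position, like __ pc().
struct ToyAssembler {
  unsigned char buf[64];
  int pos = 0;
  unsigned char* pc() { return buf + pos; }
  void emit(int nbytes) {                                // a 1..6 byte "instruction"
    for (int i = 0; i < nbytes; i++) buf[pos++] = 0x90;  // x86 NOP
  }
};

int main() {
  ToyAssembler masm;
  int slop_bytes = 0;

  // The pattern from the patch: bracket the variable-length sequence with
  // pc() samples, check it stayed within the estimate, and remember how
  // many estimated bytes went unused.
  unsigned char* start_pc = masm.pc();
  masm.emit(4);                                        // actual length may vary
  int slop_delta = 6 - (int)(masm.pc() - start_pc);    // 6 = worst-case estimate
  assert(slop_delta >= 0 && "negative slop encountered, adjust code size estimate!");
  slop_bytes += slop_delta;

  printf("unused bytes accumulated: %d\n", slop_bytes);
  return 0;
}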
@@ -83,82 +92,95 @@
   __ movptr(rax, Address(rcx, oopDesc::klass_offset_in_bytes()));
 
 #ifndef PRODUCT
   if (DebugVtables) {
     Label L;
+    start_pc = __ pc();
     // check offset vs vtable length
     __ cmpl(Address(rax, Klass::vtable_length_offset()), vtable_index*vtableEntry::size());
+    slop_delta  = 6 - (int)(__ pc() - start_pc);  // cmpl varies in length, depending on data
+    slop_bytes += slop_delta;
+    assert(slop_delta >= 0, "negative slop(%d) encountered, adjust code size estimate!", slop_delta);
+
     __ jcc(Assembler::greater, L);
     __ movl(rbx, vtable_index);
+    // VTABLE TODO: find upper bound for call_VM length.
+    start_pc = __ pc();
     __ call_VM(noreg, CAST_FROM_FN_PTR(address, bad_compiled_vtable_index), rcx, rbx);
+    slop_delta  = 480 - (int)(__ pc() - start_pc);
+    slop_bytes += slop_delta;
+    assert(slop_delta >= 0, "negative slop(%d) encountered, adjust code size estimate!", slop_delta);
     __ bind(L);
   }
 #endif // PRODUCT
 
   const Register method = rbx;
 
   // load Method* and target address
+  start_pc = __ pc();
   __ lookup_virtual_method(rax, vtable_index, method);
+  slop_delta  = 6 - (int)(__ pc() - start_pc);
+  slop_bytes += slop_delta;
+  assert(slop_delta >= 0, "negative slop(%d) encountered, adjust code size estimate!", slop_delta);
 
+#ifndef PRODUCT
   if (DebugVtables) {
     Label L;
     __ cmpptr(method, (int32_t)NULL_WORD);
     __ jcc(Assembler::equal, L);
     __ cmpptr(Address(method, Method::from_compiled_offset()), (int32_t)NULL_WORD);
     __ jcc(Assembler::notZero, L);
     __ stop("Vtable entry is NULL");
     __ bind(L);
   }
+#endif // PRODUCT
 
-  // rax,: receiver klass
+  // rax: receiver klass
   // method (rbx): Method*
   // rcx: receiver
   address ame_addr = __ pc();
   __ jmp( Address(method, Method::from_compiled_offset()));
 
   masm->flush();
+  slop_bytes += slop32; // add'l slop for size variance due to large vtable offsets
+  bookkeeping(masm, tty, s, npe_addr, ame_addr, true, vtable_index, slop_bytes, slop32);
 
-  if (PrintMiscellaneous && (WizardMode || Verbose)) {
-    tty->print_cr("vtable #%d at " PTR_FORMAT "[%d] left over: %d",
-                  vtable_index, p2i(s->entry_point()),
-                  (int)(s->code_end() - s->entry_point()),
-                  (int)(s->code_end() - __ pc()));
-  }
-  guarantee(__ pc() <= s->code_end(), "overflowed buffer");
-  // shut the door on sizing bugs
-  int slop = 3;  // 32-bit offset is this much larger than an 8-bit one
-  assert(vtable_index > 10 || __ pc() + slop <= s->code_end(), "room for 32-bit offset");
-
-  s->set_exception_points(npe_addr, ame_addr);
   return s;
 }
 
 
 VtableStub* VtableStubs::create_itable_stub(int itable_index) {
-  // Note well: pd_code_size_limit is the absolute minimum we can get away with.  If you
-  //            add code here, bump the code stub size returned by pd_code_size_limit!
-  const int i486_code_length = VtableStub::pd_code_size_limit(false);
-  VtableStub* s = new(i486_code_length) VtableStub(false, itable_index);
+  // Read "A word on VtableStub sizing" in share/code/vtableStubs.hpp for details on stub sizing.
+  const int stub_code_length = code_size_limit(false);
+  VtableStub* s = new(stub_code_length) VtableStub(false, itable_index);
   // Can be NULL if there is no free space in the code cache.
   if (s == NULL) {
     return NULL;
   }
+  // Count unused bytes in instruction sequences of variable size.
+  // We add them to the computed buffer size in order to avoid
+  // overflow in subsequently generated stubs.
+  address   start_pc;
+  int       slop_bytes = 0;
+  int       slop_delta = 0;
+  const int slop32     = (itable_index == 0) ? 4 :     // index == 0 generates even shorter code.
+                         (itable_index < 32) ? 3 : 0;  // code size change with transition from 8-bit to 32-bit constant (@index == 32).
 
   ResourceMark rm;
-  CodeBuffer cb(s->entry_point(), i486_code_length);
+  CodeBuffer      cb(s->entry_point(), stub_code_length);
   MacroAssembler* masm = new MacroAssembler(&cb);
 
-  // Entry arguments:
-  //  rax: CompiledICHolder
-  //  rcx: Receiver
-
-#ifndef PRODUCT
+#if (!defined(PRODUCT) && defined(COMPILER2))
   if (CountCompiledCalls) {
     __ incrementl(ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
   }
-#endif /* PRODUCT */
+#endif
 
+  // Entry arguments:
+  //  rax: CompiledICHolder
+  //  rcx: Receiver
+
   // Most registers are in use; we'll use rax, rbx, rsi, rdi
   // (If we need to make rsi, rdi callee-save, do a push/pop here.)
   const Register recv_klass_reg     = rsi;
   const Register holder_klass_reg   = rax; // declaring interface klass (DECC)
   const Register resolved_klass_reg = rbx; // resolved interface klass (REFC)

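The slop32 computation above compensates for the x86 displacement-width transition: an offset that fits in a signed byte is encoded as a 1-byte displacement, anything larger as a 4-byte displacement, so each affected instruction grows by 3 bytes. A small sketch of where the @index == 32 boundary comes from, assuming one 4-byte itable method entry per index on x86_32 (the entry layout is defined in shared code; the 4-byte figure is an assumption consistent with the comments in this patch):

#include <cstdio>

int main() {
  // Assumption: each itable method entry occupies one 4-byte word on
  // x86_32, so the constant displacement baked into the method-load
  // instruction grows by 4 per itable_index.
  const int entry_bytes = 4;
  const int indices[] = {0, 1, 31, 32, 100};
  for (int index : indices) {
    int disp = index * entry_bytes;
    // x86 ModRM addressing: a signed byte covers displacements in
    // [-128, 127]; larger values need a full 32-bit displacement,
    // making the instruction 3 bytes longer.
    const char* form = (disp >= -128 && disp <= 127) ? "disp8" : "disp32";
    printf("itable_index %3d -> displacement %3d (%s)\n", index, disp, form);
  }
  return 0;
}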
@@ -169,32 +191,48 @@
   __ movptr(holder_klass_reg,   Address(icholder_reg, CompiledICHolder::holder_metadata_offset()));
 
   Label L_no_such_interface;
 
   // get receiver klass (also an implicit null-check)
-  address npe_addr = __ pc();
   assert(VtableStub::receiver_location() ==  rcx->as_VMReg(), "receiver expected in  rcx");
+  address npe_addr = __ pc();
   __ load_klass(recv_klass_reg, rcx);
 
+  start_pc = __ pc();
+
   // Receiver subtype check against REFC.
   // Destroys recv_klass_reg value.
   __ lookup_interface_method(// inputs: rec. class, interface
                              recv_klass_reg, resolved_klass_reg, noreg,
                              // outputs:  scan temp. reg1, scan temp. reg2
                              recv_klass_reg, temp_reg,
                              L_no_such_interface,
                              /*return_method=*/false);
 
+  const ptrdiff_t  typecheckSize = __ pc() - start_pc;
+  start_pc = __ pc();
+
   // Get selected method from declaring class and itable index
   const Register method = rbx;
   __ load_klass(recv_klass_reg, rcx); // restore recv_klass_reg
   __ lookup_interface_method(// inputs: rec. class, interface, itable index
                              recv_klass_reg, holder_klass_reg, itable_index,
                              // outputs: method, scan temp. reg
                              method, temp_reg,
                              L_no_such_interface);
 
+  const ptrdiff_t  lookupSize = __ pc() - start_pc;
+
+  // We expect to need slop32 extra bytes, because the code emitted by
+  // lookup_interface_method changes in length once itable_index exceeds 31.
+  // For Windows, a narrow estimate was found to be 104. Other OSes were not tested.
+  const ptrdiff_t estimate = 104;
+  const ptrdiff_t codesize = typecheckSize + lookupSize + slop32;
+  slop_delta  = (int)(estimate - codesize);
+  slop_bytes += slop_delta;
+  assert(slop_delta >= 0, "itable #%d: Code size estimate (%d) for lookup_interface_method too small, required: %d", itable_index, (int)estimate, (int)codesize);
+
   // method (rbx): Method*
   // rcx: receiver
 
 #ifdef ASSERT
   if (DebugVtables) {

@@ -217,72 +255,17 @@
   // We force resolving of the call site by jumping to the "handle
   // wrong method" stub, and so let the interpreter runtime do all the
   // dirty work.
   __ jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub()));
 
-  __ flush();
-
-  if (PrintMiscellaneous && (WizardMode || Verbose)) {
-    tty->print_cr("itable #%d at " PTR_FORMAT "[%d] left over: %d",
-                  itable_index, p2i(s->entry_point()),
-                  (int)(s->code_end() - s->entry_point()),
-                  (int)(s->code_end() - __ pc()));
-  }
-  guarantee(__ pc() <= s->code_end(), "overflowed buffer");
-  // shut the door on sizing bugs
-  int slop = 3;  // 32-bit offset is this much larger than an 8-bit one
-  assert(itable_index > 10 || __ pc() + slop <= s->code_end(), "room for 32-bit offset");
+  masm->flush();
+  slop_bytes += slop32; // add'l slop for size variance due to large itable offsets
+  bookkeeping(masm, tty, s, npe_addr, ame_addr, false, itable_index, slop_bytes, slop32);
 
-  s->set_exception_points(npe_addr, ame_addr);
   return s;
 }
 
-
-
-int VtableStub::pd_code_size_limit(bool is_vtable_stub) {
-  if (is_vtable_stub) {
-    // Vtable stub size
-    return (DebugVtables ? 210 : 16) + (CountCompiledCalls ? 6 : 0);
-  } else {
-    // Itable stub size
-    return (DebugVtables ? 256 : 110) + (CountCompiledCalls ? 6 : 0);
-  }
-  // In order to tune these parameters, run the JVM with VM options
-  // +PrintMiscellaneous and +WizardMode to see information about
-  // actual itable stubs.  Look for lines like this:
-  //   itable #1 at 0x5551212[65] left over: 3
-  // Reduce the constants so that the "left over" number is >=3
-  // for the common cases.
-  // Do not aim at a left-over number of zero, because a
-  // large vtable or itable index (> 16) will require a 32-bit
-  // immediate displacement instead of an 8-bit one.
-  //
-  // The JVM98 app. _202_jess has a megamorphic interface call.
-  // The itable code looks like this:
-  // Decoding VtableStub itbl[1]@1
-  //   mov    0x4(%ecx),%esi
-  //   mov    0xe8(%esi),%edi
-  //   lea    0x130(%esi,%edi,4),%edi
-  //   add    $0x7,%edi
-  //   and    $0xfffffff8,%edi
-  //   lea    0x4(%esi),%esi
-  //   mov    (%edi),%ebx
-  //   cmp    %ebx,%eax
-  //   je     success
-  // loop:
-  //   test   %ebx,%ebx
-  //   je     throw_icce
-  //   add    $0x8,%edi
-  //   mov    (%edi),%ebx
-  //   cmp    %ebx,%eax
-  //   jne    loop
-  // success:
-  //   mov    0x4(%edi),%edi
-  //   mov    (%esi,%edi,1),%ebx
-  //   jmp    *0x44(%ebx)
-  // throw_icce:
-  //   jmp    throw_ICCE_entry
-}
-
 int VtableStub::pd_code_alignment() {
-  return wordSize;
+  // x86 cache line size is 64 bytes, but we want to limit alignment loss.
+  const unsigned int icache_line_size = wordSize;
+  return icache_line_size;
 }
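For context on what replaces the deleted pd_code_size_limit(): sizing is now driven by the shared code_size_limit()/bookkeeping() pair referenced at the top of both stub creators. The sketch below shows one plausible calibrate-then-verify scheme consistent with those calls; the type and member names are assumptions for illustration only, and the actual logic lives in share/code/vtableStubs.{hpp,cpp}.

#include <algorithm>
#include <cassert>
#include <cstdio>

// Hypothetical sizer: start from a generous estimate, then let each
// generated stub report its actual size plus measured slop so later
// stubs can rely on a tight, verified bound.
struct StubSizer {
  int pessimistic_cap;      // generous first-time estimate
  int measured_limit = 0;   // 0 means "not calibrated yet"

  explicit StubSizer(int cap) : pessimistic_cap(cap) {}

  int code_size_limit() const {
    return measured_limit != 0 ? measured_limit : pessimistic_cap;
  }

  // Analogous in spirit to bookkeeping() in the patch: verify the stub
  // fit its buffer and fold the observation into future estimates.
  void bookkeeping(int actual_size, int slop_bytes) {
    int needed = actual_size + slop_bytes;
    assert(needed <= code_size_limit() && "stub overflowed buffer");
    measured_limit = std::max(measured_limit, needed);
  }
};

int main() {
  StubSizer vtable_sizer(512);
  printf("first stub buffer: %d bytes\n", vtable_sizer.code_size_limit());
  vtable_sizer.bookkeeping(/*actual_size=*/16, /*slop_bytes=*/0);
  printf("calibrated limit:  %d bytes\n", vtable_sizer.code_size_limit());
  return 0;
}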