
src/cpu/x86/vm/sharedRuntime_x86_64.cpp

@@ -70,49 +70,32 @@
 };
 
 class RegisterSaver {
   // Capture info about frame layout.  Layout offsets are in jint
   // units because compiler frame slots are jints.
-#define HALF_ZMM_BANK_WORDS 128
+#define XSAVE_AREA_BEGIN 160
+#define XSAVE_AREA_YMM_BEGIN 576
+#define XSAVE_AREA_ZMM_BEGIN 1152
+#define XSAVE_AREA_UPPERBANK 1664
 #define DEF_XMM_OFFS(regnum) xmm ## regnum ## _off = xmm_off + (regnum)*16/BytesPerInt, xmm ## regnum ## H_off
+#define DEF_YMM_OFFS(regnum) ymm ## regnum ## _off = ymm_off + (regnum)*16/BytesPerInt, ymm ## regnum ## H_off
 #define DEF_ZMM_OFFS(regnum) zmm ## regnum ## _off = zmm_off + (regnum-16)*64/BytesPerInt, zmm ## regnum ## H_off
   enum layout {
     fpu_state_off = frame::arg_reg_save_area_bytes/BytesPerInt, // fxsave save area
-    xmm_off       = fpu_state_off + 160/BytesPerInt,            // offset in fxsave save area
+    xmm_off       = fpu_state_off + XSAVE_AREA_BEGIN/BytesPerInt,            // offset in fxsave save area
     DEF_XMM_OFFS(0),
     DEF_XMM_OFFS(1),
-    DEF_XMM_OFFS(2),
-    DEF_XMM_OFFS(3),
-    DEF_XMM_OFFS(4),
-    DEF_XMM_OFFS(5),
-    DEF_XMM_OFFS(6),
-    DEF_XMM_OFFS(7),
-    DEF_XMM_OFFS(8),
-    DEF_XMM_OFFS(9),
-    DEF_XMM_OFFS(10),
-    DEF_XMM_OFFS(11),
-    DEF_XMM_OFFS(12),
-    DEF_XMM_OFFS(13),
-    DEF_XMM_OFFS(14),
-    DEF_XMM_OFFS(15),
-    zmm_off = fpu_state_off + ((FPUStateSizeInWords - (HALF_ZMM_BANK_WORDS + 1))*wordSize / BytesPerInt),
+    // 2..15 are implied in range usage
+    ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
+    DEF_YMM_OFFS(0),
+    DEF_YMM_OFFS(1),
+    // 2..15 are implied in range usage
+    zmm_high = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
+    zmm_off = xmm_off + (XSAVE_AREA_UPPERBANK - XSAVE_AREA_BEGIN)/BytesPerInt,
     DEF_ZMM_OFFS(16),
     DEF_ZMM_OFFS(17),
-    DEF_ZMM_OFFS(18),
-    DEF_ZMM_OFFS(19),
-    DEF_ZMM_OFFS(20),
-    DEF_ZMM_OFFS(21),
-    DEF_ZMM_OFFS(22),
-    DEF_ZMM_OFFS(23),
-    DEF_ZMM_OFFS(24),
-    DEF_ZMM_OFFS(25),
-    DEF_ZMM_OFFS(26),
-    DEF_ZMM_OFFS(27),
-    DEF_ZMM_OFFS(28),
-    DEF_ZMM_OFFS(29),
-    DEF_ZMM_OFFS(30),
-    DEF_ZMM_OFFS(31),
+    // 18..31 are implied in range usage
     fpu_state_end = fpu_state_off + ((FPUStateSizeInWords-1)*wordSize / BytesPerInt),
     fpu_stateH_end,
     r15_off, r15H_off,
     r14_off, r14H_off,
     r13_off, r13H_off,

@@ -158,39 +141,28 @@
   // all the other values have already been extracted.
   static void restore_result_registers(MacroAssembler* masm);
 };
 
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
-  int vect_words = 0;
-  int ymmhi_offset = -1;
   int off = 0;
   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
   if (UseAVX < 3) {
     num_xmm_regs = num_xmm_regs/2;
   }
 #if defined(COMPILER2) || INCLUDE_JVMCI
   if (save_vectors) {
     assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
     assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
-    // Save upper half of YMM registers
-    vect_words = 16 * num_xmm_regs / wordSize;
-    if (UseAVX < 3) {
-      ymmhi_offset = additional_frame_words;
-      additional_frame_words += vect_words;
-    }
   }
 #else
   assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
 #endif
 
-  // Always make the frame size 16-byte aligned
-  int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
-                                     reg_save_size*BytesPerInt, num_xmm_regs);
+  // Always make the frame size 16-byte aligned; both vector and non-vector save areas are always allocated
+  int frame_size_in_bytes = round_to(reg_save_size*BytesPerInt, num_xmm_regs);
   // OopMap frame size is in compiler stack slots (jint's) not bytes or words
   int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
-  // The caller will allocate additional_frame_words
-  int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
   // CodeBlob frame size is in words.
   int frame_size_in_words = frame_size_in_bytes / wordSize;
   *total_frame_words = frame_size_in_words;
 
   // Save registers, fpu state, and flags.

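For reference, round_to() rounds its first argument up to a multiple of its
second, which must be a power of two (num_xmm_regs is 16 or 32 here). A
hedged standalone equivalent of that helper, not HotSpot's own declaration:

    // Round x up to a multiple of s, where s is a power of two.
    inline int round_up_sketch(int x, int s) {
      return (x + s - 1) & ~(s - 1);
    }

Since the vector state now lives inside the fixed XSAVE area pushed by
push_CPU_state(), the frame size no longer depends on additional_frame_words,
and the ymmhi_offset bookkeeping above can be dropped entirely.
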
@@ -201,16 +173,38 @@
 
   __ enter();          // rsp becomes 16-byte aligned here
   __ push_CPU_state(); // Push a multiple of 16 bytes
 
   // push cpu state handles this on EVEX enabled targets
-  if ((vect_words > 0) && (UseAVX < 3)) {
-    assert(vect_words*wordSize >= 256, "");
-    // Save upper half of YMM registes(0..num_xmm_regs)
-    __ subptr(rsp, num_xmm_regs*16);
-    for (int n = 0; n < num_xmm_regs; n++) {
-      __ vextractf128h(Address(rsp, off++*16), as_XMMRegister(n));
+  if (save_vectors) {
+    // Save upper half of YMM registers (0..15)
+    int base_addr = XSAVE_AREA_YMM_BEGIN;
+    for (int n = 0; n < 16; n++) {
+      __ vextractf128h(Address(rsp, base_addr+n*16), as_XMMRegister(n));
+    }
+    if (VM_Version::supports_evex()) {
+      // Save upper half of ZMM registers (0..15)
+      base_addr = XSAVE_AREA_ZMM_BEGIN;
+      for (int n = 0; n < 16; n++) {
+        __ vextractf64x4h(Address(rsp, base_addr+n*32), as_XMMRegister(n), 1);
+      }
+      // Save full ZMM registers (16..num_xmm_regs)
+      base_addr = XSAVE_AREA_UPPERBANK;
+      int off = 0;
+      int vector_len = Assembler::AVX_512bit;
+      for (int n = 16; n < num_xmm_regs; n++) {
+        __ evmovdqul(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n), vector_len);
+      }
+    }
+  } else {
+    if (VM_Version::supports_evex()) {
+      // Save upper bank of ZMM registers (16..31) for double/float usage
+      int base_addr = XSAVE_AREA_UPPERBANK;
+      int off = 0;
+      for (int n = 16; n < num_xmm_regs; n++) {
+        __ movsd(Address(rsp, base_addr+(off++*64)), as_XMMRegister(n));
+      }
     }
   }
   if (frame::arg_reg_save_area_bytes != 0) {
     // Allocate argument register save area
     __ subptr(rsp, frame::arg_reg_save_area_bytes);

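The save pattern above can also be read through ordinary compiler
intrinsics. The sketch below is an illustration only, not HotSpot code; it
assumes an AVX-capable target, and the helper names are hypothetical:

    #include <immintrin.h>

    // Mirrors vextractf128h(Address(rsp, base_addr+n*16), xmm_n): store
    // the upper 128 bits of a 256-bit register into a 16-byte slot.
    void save_ymm_upper(__m256 ymm, float* slot) {
      _mm_storeu_ps(slot, _mm256_extractf128_ps(ymm, 1));
    }

    // Mirrors the matching vinsertf128h on the restore path: reload the
    // upper half while leaving the low 128 bits intact.
    __m256 restore_ymm_upper(__m256 ymm, const float* slot) {
      return _mm256_insertf128_ps(ymm, _mm_loadu_ps(slot), 1);
    }
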
@@ -222,12 +216,11 @@
   // debug-info recordings, as well as let GC find all oops.
 
   OopMapSet *oop_maps = new OopMapSet();
   OopMap* map = new OopMap(frame_size_in_slots, 0);
 
-#define STACK_OFFSET(x) VMRegImpl::stack2reg((x) + additional_frame_slots)
-#define YMMHI_STACK_OFFSET(x) VMRegImpl::stack2reg((x / VMRegImpl::stack_slot_size) + ymmhi_offset)
+#define STACK_OFFSET(x) VMRegImpl::stack2reg((x))
 
   map->set_callee_saved(STACK_OFFSET( rax_off ), rax->as_VMReg());
   map->set_callee_saved(STACK_OFFSET( rcx_off ), rcx->as_VMReg());
   map->set_callee_saved(STACK_OFFSET( rdx_off ), rdx->as_VMReg());
   map->set_callee_saved(STACK_OFFSET( rbx_off ), rbx->as_VMReg());

@@ -255,35 +248,25 @@
   if(UseAVX > 2) {
     // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
     off = zmm16_off;
     delta = zmm17_off - off;
     for (int n = 16; n < num_xmm_regs; n++) {
-      XMMRegister xmm_name = as_XMMRegister(n);
-      map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg());
+      XMMRegister zmm_name = as_XMMRegister(n);
+      map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
       off += delta;
     }
   }
 
 #if defined(COMPILER2) || INCLUDE_JVMCI
   if (save_vectors) {
-    assert(ymmhi_offset != -1, "save area must exist");
-    map->set_callee_saved(YMMHI_STACK_OFFSET(  0), xmm0->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET( 16), xmm1->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET( 32), xmm2->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET( 48), xmm3->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET( 64), xmm4->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET( 80), xmm5->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET( 96), xmm6->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET(112), xmm7->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET(128), xmm8->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET(144), xmm9->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET(160), xmm10->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET(176), xmm11->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET(192), xmm12->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET(208), xmm13->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET(224), xmm14->as_VMReg()->next(4));
-    map->set_callee_saved(YMMHI_STACK_OFFSET(240), xmm15->as_VMReg()->next(4));
+    off = ymm0_off;
+    int delta = ymm1_off - off;
+    for (int n = 0; n < 16; n++) {
+      XMMRegister ymm_name = as_XMMRegister(n);
+      map->set_callee_saved(STACK_OFFSET(off), ymm_name->as_VMReg()->next(4));
+      off += delta;
+    }
   }
 #endif // COMPILER2 || INCLUDE_JVMCI
 
   // %%% These should all be a waste but we'll keep things as they were for now
   if (true) {

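The sixteen explicit YMMHI entries collapse into this loop because the
DEF_YMM_OFFS slots are evenly spaced, four jint slots (16 bytes) apart, and
the upper half of each register is still named by as_VMReg()->next(4). A
tiny standalone check of the spacing invariant, using hypothetical
stand-ins for the enum values:

    #include <cassert>

    int main() {
      const int ymm0_off = 104, ymm1_off = 108;  // hypothetical, 4 slots apart
      const int delta = ymm1_off - ymm0_off;
      int off = ymm0_off;
      for (int n = 0; n < 16; n++) {
        assert(off == ymm0_off + n*delta);       // visits ymm0_off .. ymm15_off in order
        off += delta;
      }
      return 0;
    }
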
@@ -314,12 +297,12 @@
     if (UseAVX > 2) {
       // Obtain xmm16..xmm31 from the XSAVE area on EVEX enabled targets
       off = zmm16H_off;
       delta = zmm17H_off - off;
       for (int n = 16; n < num_xmm_regs; n++) {
-        XMMRegister xmm_name = as_XMMRegister(n);
-        map->set_callee_saved(STACK_OFFSET(off), xmm_name->as_VMReg()->next());
+        XMMRegister zmm_name = as_XMMRegister(n);
+        map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
         off += delta;
       }
     }
   }
 

@@ -333,25 +316,52 @@
   }
   if (frame::arg_reg_save_area_bytes != 0) {
     // Pop arg register save area
     __ addptr(rsp, frame::arg_reg_save_area_bytes);
   }
+
 #if defined(COMPILER2) || INCLUDE_JVMCI
+  if (restore_vectors) {
+    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
+    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
+  }
+#else
+  assert(!restore_vectors, "vectors are generated only by C2 and JVMCI");
+#endif
+
   // On EVEX enabled targets everything is handled in pop fpu state
-  if ((restore_vectors) && (UseAVX < 3)) {
-    assert(UseAVX > 0, "256/512-bit vectors are supported only with AVX");
-    assert(MaxVectorSize == 64, "up to 512bit vectors are supported now");
+  if (restore_vectors) {
+    // Restore upper half of YMM registers (0..15)
+    int base_addr = XSAVE_AREA_YMM_BEGIN;
+    for (int n = 0; n < 16; n++) {
+      __ vinsertf128h(as_XMMRegister(n), Address(rsp,  base_addr+n*16));
+    }
+    if (VM_Version::supports_evex()) {
+      // Restore upper half of ZMM registers (0..15)
+      base_addr = XSAVE_AREA_ZMM_BEGIN;
+      for (int n = 0; n < 16; n++) {
+        __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, base_addr+n*32), 1);
+      }
-    int off = 0;
-    // Restore upper half of YMM registes (0..num_xmm_regs)
-    for (int n = 0; n < num_xmm_regs; n++) {
-      __ vinsertf128h(as_XMMRegister(n), Address(rsp,  off++*16));
-    }
-    __ addptr(rsp, num_xmm_regs*16);
-  }
+      // Restore full ZMM registers (16..num_xmm_regs)
+      base_addr = XSAVE_AREA_UPPERBANK;
+      int vector_len = Assembler::AVX_512bit;
+      int off = 0;
+      for (int n = 16; n < num_xmm_regs; n++) {
+        __ evmovdqul(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)), vector_len);
+      }
+    }
-#else
-  assert(!restore_vectors, "vectors are generated only by C2 and JVMCI");
-#endif
+  } else {
+    if (VM_Version::supports_evex()) {
+      // Restore upper bank of ZMM registers (16..31) for double/float usage
+      int base_addr = XSAVE_AREA_UPPERBANK;
+      int off = 0;
+      for (int n = 16; n < num_xmm_regs; n++) {
+        __ movsd(as_XMMRegister(n), Address(rsp, base_addr+(off++*64)));
+      }
+    }
+  }
+
   // Recover CPU state
   __ pop_CPU_state();
   // Get the rbp described implicitly by the calling convention (no oopMap)
   __ pop(rbp);
 }
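
The ZMM analogues follow the same shape. With AVX-512F intrinsics the
restore side looks roughly like this (an illustration only, not HotSpot
code, with hypothetical helper names):

    #include <immintrin.h>

    // Mirrors vinsertf64x4h(xmm_n, Address(rsp, base_addr+n*32), 1):
    // reload the upper 256 bits, leaving the low 256 bits unchanged.
    __m512d restore_zmm_upper(__m512d zmm, const double* slot) {
      return _mm512_insertf64x4_pd(zmm, _mm256_loadu_pd(slot), 1);
    }

    // Mirrors evmovdqul(xmm_n, Address(rsp, base_addr+off*64), AVX_512bit):
    // a full-width 64-byte move for one upper-bank register.
    __m512i restore_zmm_full(const void* slot) {
      return _mm512_loadu_si512(slot);
    }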