--- old/src/cpu/x86/vm/x86_32.ad 2015-04-23 08:26:44.657343300 -0700 +++ new/src/cpu/x86/vm/x86_32.ad 2015-04-23 08:26:44.450343300 -0700 @@ -101,6 +101,17 @@ reg_def FPR6H( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg()->next()); reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()); reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next()); +// +// Empty fill registers, which are never used, but supply alignment to xmm regs +// +reg_def FILL0( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(2)); +reg_def FILL1( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(3)); +reg_def FILL2( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(4)); +reg_def FILL3( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(5)); +reg_def FILL4( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(6)); +reg_def FILL5( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(7)); +reg_def FILL6( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(8)); +reg_def FILL7( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next(9)); // Specify priority of register selection within phases of register // allocation. Highest priority is first. A useful heuristic is to @@ -112,7 +123,8 @@ alloc_class chunk0( ECX, EBX, EBP, EDI, EAX, EDX, ESI, ESP, FPR0L, FPR0H, FPR1L, FPR1H, FPR2L, FPR2H, FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H, - FPR6L, FPR6H, FPR7L, FPR7H ); + FPR6L, FPR6H, FPR7L, FPR7H, + FILL0, FILL1, FILL2, FILL3, FILL4, FILL5, FILL6, FILL7); //----------Architecture Description Register Classes-------------------------- @@ -235,7 +247,9 @@ size += 6; // fldcw } if (C->max_vector_size() > 16) { - size += 3; // vzeroupper + if(UseAVX <= 2) { + size += 3; // vzeroupper + } } return size; } @@ -731,6 +745,12 @@ // Helper for XMM registers. Extra opcode bits, limited syntax. 
static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset, int reg_lo, int reg_hi, int size, outputStream* st ) { + int in_size_in_bits = Assembler::EVEX_32bit; + int evex_encoding = 0; + if (reg_lo+1 == reg_hi) { + in_size_in_bits = Assembler::EVEX_64bit; + evex_encoding = Assembler::VEX_W; + } if (cbuf) { MacroAssembler _masm(cbuf); if (reg_lo+1 == reg_hi) { // double move? @@ -763,7 +783,17 @@ } #endif } - int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4); + bool is_single_byte = false; + if ((UseAVX > 2) && (offset != 0)) { + is_single_byte = Assembler::query_compressed_disp_byte(offset, true, 0, Assembler::EVEX_T1S, in_size_in_bits, evex_encoding); + } + int offset_size = 0; + if (UseAVX > 2 ) { + offset_size = (offset == 0) ? 0 : ((is_single_byte) ? 1 : 4); + } else { + offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4); + } + size += (UseAVX > 2) ? 2 : 0; // Need an additional two bytes for EVEX // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix. return size+5+offset_size; } @@ -799,8 +829,8 @@ #endif } // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix. - // Only MOVAPS SSE prefix uses 1 byte. - int sz = 4; + // Only MOVAPS SSE prefix uses 1 byte. EVEX uses an additional 2 bytes. + int sz = (UseAVX > 2) ? 6 : 4; if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) && UseXmmRegToRegMoveAll && (UseAVX == 0)) sz = 3; return size + sz; @@ -818,7 +848,7 @@ st->print("movdl %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]); #endif } - return 4; + return (UseAVX> 2) ? 6 : 4; } @@ -834,7 +864,7 @@ st->print("movdl %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]); #endif } - return 4; + return (UseAVX> 2) ? 
6 : 4; } static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size, outputStream* st ) { @@ -905,9 +935,8 @@ calc_size += 3+src_offset_size + 3+dst_offset_size; break; case Op_VecX: - calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size; - break; case Op_VecY: + case Op_VecZ: calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size; break; default: @@ -938,6 +967,12 @@ __ vmovdqu(xmm0, Address(rsp, src_offset)); __ vmovdqu(Address(rsp, dst_offset), xmm0); __ vmovdqu(xmm0, Address(rsp, -32)); + break; /* VecY spill ends here; it must not fall into the 512-bit VecZ sequence */ + case Op_VecZ: + __ evmovdqu(Address(rsp, -64), xmm0, 2); + __ evmovdqu(xmm0, Address(rsp, src_offset), 2); + __ evmovdqu(Address(rsp, dst_offset), xmm0, 2); + __ evmovdqu(xmm0, Address(rsp, -64), 2); break; default: ShouldNotReachHere(); @@ -973,6 +1008,13 @@ "vmovdqu [rsp + #%d], xmm0\n\t" "vmovdqu xmm0, [rsp - #32]", src_offset, dst_offset); + break; /* VecY text ends here; do not also print the VecZ lines */ + case Op_VecZ: + st->print("vmovdqu [rsp - #64], xmm0\t# 512-bit mem-mem spill\n\t" + "vmovdqu xmm0, [rsp + #%d]\n\t" + "vmovdqu [rsp + #%d], xmm0\n\t" + "vmovdqu xmm0, [rsp - #64]", + src_offset, dst_offset); break; default: ShouldNotReachHere(); @@ -1006,7 +1048,7 @@ uint ireg = ideal_reg(); assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity"); assert((src_first_rc != rc_float && dst_first_rc != rc_float), "sanity"); - assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity"); + assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY || ireg == Op_VecZ ), "sanity"); if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) { // mem -> mem int src_offset = ra_->reg2offset(src_first); @@ -3969,7 +4011,7 @@ // XMM Float register operands operand regF() %{ predicate( UseSSE>=1 ); - constraint(ALLOC_IN_RC(float_reg)); + constraint(ALLOC_IN_RC(float_reg_legacy)); match(RegF); format %{ %} interface(REG_INTER); @@ -3978,12 +4020,44 @@ // XMM Double register operands operand regD() %{ predicate( UseSSE>=2 ); - 
constraint(ALLOC_IN_RC(double_reg)); + constraint(ALLOC_IN_RC(double_reg_legacy)); match(RegD); format %{ %} interface(REG_INTER); %} +// Vectors +operand vecS() %{ + constraint(ALLOC_IN_RC(vectors_reg_legacy)); + match(VecS); + + format %{ %} + interface(REG_INTER); +%} + +operand vecD() %{ + constraint(ALLOC_IN_RC(vectord_reg_legacy)); + match(VecD); + + format %{ %} + interface(REG_INTER); +%} + +operand vecX() %{ + constraint(ALLOC_IN_RC(vectorx_reg_legacy)); + match(VecX); + + format %{ %} + interface(REG_INTER); +%} + +operand vecY() %{ + constraint(ALLOC_IN_RC(vectory_reg_legacy)); + match(VecY); + + format %{ %} + interface(REG_INTER); +%} //----------Memory Operands---------------------------------------------------- // Direct Memory Operand @@ -4991,11 +5063,11 @@ match(Set dst (ReverseBytesUS dst)); effect(KILL cr); - format %{ "BSWAP $dst\n\t" + format %{ "BSWAP $dst\n\t" "SHR $dst,16\n\t" %} ins_encode %{ __ bswapl($dst$$Register); - __ shrl($dst$$Register, 16); + __ shrl($dst$$Register, 16); %} ins_pipe( ialu_reg ); %} @@ -5004,11 +5076,11 @@ match(Set dst (ReverseBytesS dst)); effect(KILL cr); - format %{ "BSWAP $dst\n\t" + format %{ "BSWAP $dst\n\t" "SAR $dst,16\n\t" %} ins_encode %{ __ bswapl($dst$$Register); - __ sarl($dst$$Register, 16); + __ sarl($dst$$Register, 16); %} ins_pipe( ialu_reg ); %} @@ -6496,7 +6568,7 @@ effect(KILL cr); ins_cost(400); - format %{ + format %{ $$template if (os::is_MP()) { $$emit$$"LOCK ADDL [ESP + #0], 0\t! 
membar_volatile" @@ -8259,10 +8331,10 @@ // Xor Register with Immediate -1 instruct xorI_eReg_im1(rRegI dst, immI_M1 imm) %{ - match(Set dst (XorI dst imm)); + match(Set dst (XorI dst imm)); size(2); - format %{ "NOT $dst" %} + format %{ "NOT $dst" %} ins_encode %{ __ notl($dst$$Register); %} @@ -8910,7 +8982,7 @@ // Xor Long Register with Immediate -1 instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{ - match(Set dst (XorL dst imm)); + match(Set dst (XorL dst imm)); format %{ "NOT $dst.lo\n\t" "NOT $dst.hi" %} ins_encode %{ @@ -8965,7 +9037,7 @@ effect(KILL cr); ins_cost(100); format %{ "ADD $dst.lo,$dst.lo\n\t" - "ADC $dst.hi,$dst.hi\n\t" + "ADC $dst.hi,$dst.hi\n\t" "ADD $dst.lo,$dst.lo\n\t" "ADC $dst.hi,$dst.hi" %} ins_encode %{ @@ -8984,9 +9056,9 @@ effect(KILL cr); ins_cost(100); format %{ "ADD $dst.lo,$dst.lo\n\t" - "ADC $dst.hi,$dst.hi\n\t" + "ADC $dst.hi,$dst.hi\n\t" "ADD $dst.lo,$dst.lo\n\t" - "ADC $dst.hi,$dst.hi\n\t" + "ADC $dst.hi,$dst.hi\n\t" "ADD $dst.lo,$dst.lo\n\t" "ADC $dst.hi,$dst.hi" %} ins_encode %{ @@ -11139,7 +11211,6 @@ ins_pipe( ialu_reg_reg ); %} - instruct MoveF2I_stack_reg(rRegI dst, stackSlotF src) %{ match(Set dst (MoveF2I src)); effect( DEF dst, USE src ); @@ -11371,7 +11442,7 @@ format %{ "XOR EAX,EAX\t# ClearArray:\n\t" "SHL ECX,1\t# Convert doublewords to words\n\t" "REP STOS\t# store EAX into [EDI++] while ECX--" %} - ins_encode %{ + ins_encode %{ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); %} ins_pipe( pipe_slow ); @@ -11384,7 +11455,7 @@ format %{ "XOR EAX,EAX\t# ClearArray:\n\t" "SHL ECX,3\t# Convert doublewords to bytes\n\t" "REP STOSB\t# store EAX into [EDI++] while ECX--" %} - ins_encode %{ + ins_encode %{ __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register); %} ins_pipe( pipe_slow );