--- old/src/cpu/x86/vm/assembler_x86.cpp	2016-04-22 17:27:55.078034500 -0700
+++ new/src/cpu/x86/vm/assembler_x86.cpp	2016-04-22 17:27:54.649010000 -0700
@@ -2323,6 +2323,17 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+// KTESTQ only updates the flags:
+//   ZF is set when (src1 AND src2) is all zeros,
+//   CF is set when (NOT src1 AND src2) is all zeros.
+void Assembler::ktestql(KRegister src1, KRegister src2) {
+  assert(VM_Version::supports_avx512bw(), "");
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(src1->encoding(), 0, src2->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0x99);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::movb(Address dst, int imm8) {
   InstructionMark im(this);
   prefix(dst);
@@ -2491,6 +2502,23 @@
   emit_operand(src, dst);
 }
 
+// Masked form of evmovdqub(XMMRegister, Address): the opmask register is embedded in the
+// EVEX prefix and 'zeroing' requests the clear-context attribute. May only be emitted
+// while is_programmed_mask_reg() holds.
+void Assembler::evmovdqub(KRegister mask, bool zeroing, XMMRegister dst, Address src, int vector_len) {
+  assert(VM_Version::supports_avx512vlbw(), "");
+  assert(is_programmed_mask_reg(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  if (zeroing) attributes.set_is_clear_context();
+  attributes.set_is_evex_instruction();
+  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x6F);
+  emit_operand(dst, src);
+}
+
 void Assembler::evmovdquw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -3275,6 +3303,26 @@
   emit_operand(as_Register(dst_enc), src);
 }
 
+// Masked form of evpcmpeqb(KRegister, XMMRegister, Address): compares into opmask kdst.
+// The embedded opmask (and, if 'zeroing', the clear-context attribute) is applied only
+// when a mask or zeroing was requested. May only be emitted while is_programmed_mask_reg() holds.
+void Assembler::evpcmpeqb(KRegister mask, bool zeroing, KRegister kdst, XMMRegister nds, Address src, int vector_len) {
+  const bool no_reg_mask = (mask == NULL) && !zeroing;
+  assert(VM_Version::supports_avx512vlbw(), "");
+  assert(is_programmed_mask_reg(), "");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ no_reg_mask, /* uses_vl */ false);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  if (!no_reg_mask) {
+    attributes.set_embedded_opmask_register_specifier(mask);
+    if (zeroing) attributes.set_is_clear_context();
+  }
+  attributes.set_is_evex_instruction();
+  vex_prefix(src, nds->encoding(), kdst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0x74);
+  emit_operand(as_Register(kdst->encoding()), src);
+}
+
 // In this context, the dst vector contains the components that are equal, non equal components are zeroed in dst
 void Assembler::pcmpeqw(XMMRegister dst, XMMRegister src) {
   assert(VM_Version::supports_sse2(), "");
--- old/src/cpu/x86/vm/assembler_x86.hpp	2016-04-22 17:27:57.912196600 -0700
+++ new/src/cpu/x86/vm/assembler_x86.hpp	2016-04-22 17:27:57.501173100 -0700
@@ -606,6 +606,7 @@
   bool _legacy_mode_vl;
   bool _legacy_mode_vlbw;
   bool _is_managed;
+  bool _programmed_mask_reg;
 
   class InstructionAttr *_attributes;
 
@@ -813,6 +814,7 @@
     _legacy_mode_vl = (VM_Version::supports_avx512vl() == false);
     _legacy_mode_vlbw = (VM_Version::supports_avx512vlbw() == false);
     _is_managed = false;
+    _programmed_mask_reg = false;
     _attributes = NULL;
   }
 
@@ -823,6 +825,11 @@
   void clear_managed(void) { _is_managed = false; }
   bool is_managed(void) { return _is_managed; }
 
+  // Flag asserted by the masked EVEX emitters: set while a stub is programming opmask registers.
+  void set_programmed_mask_reg(void) { _programmed_mask_reg = true; }
+  void clear_programmed_mask_reg(void) { _programmed_mask_reg = false; }
+  bool is_programmed_mask_reg(void) { return _programmed_mask_reg; }
+
   void lea(Register dst, Address src);
 
   void mov(Register dst, Register src);
@@ -1354,6 +1361,8 @@
   void kortestdl(KRegister dst, KRegister src);
   void kortestql(KRegister dst, KRegister src);
 
+  void ktestql(KRegister src1, KRegister src2);
+
   void movdl(XMMRegister dst, Register src);
   void movdl(Register dst, XMMRegister src);
   void movdl(XMMRegister dst, Address src);
@@ -1381,6 +1390,7 @@
   void evmovdqub(Address dst, XMMRegister src, int vector_len);
   void evmovdqub(XMMRegister dst, Address src, int vector_len);
   void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
+  void evmovdqub(KRegister mask, bool zeroing, XMMRegister dst, Address src, int vector_len);
   void evmovdquw(Address dst, XMMRegister src, int vector_len);
   void evmovdquw(XMMRegister dst, Address src, int vector_len);
   void evmovdquw(XMMRegister dst, XMMRegister src, int vector_len);
@@ -1533,6 +1543,7 @@
   void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
   void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
+  void evpcmpeqb(KRegister mask, bool zeroing, KRegister kdst, XMMRegister nds, Address src, int vector_len);
 
   void pcmpeqw(XMMRegister dst, XMMRegister src);
   void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -2092,7 +2103,8 @@
       _evex_encoding(0),
       _is_clear_context(false),
       _is_extended_context(false),
-      _current_assembler(NULL) {
+      _current_assembler(NULL),
+      _embedded_opmask_register_specifier(1) { // default opmask is hard-coded to k1, which is assumed to be initialized for now
     if (UseAVX < 3) _legacy_mode = true;
   }
 
@@ -2116,6 +2128,7 @@
   int _evex_encoding;
   bool _is_clear_context;
   bool _is_extended_context;
 
   Assembler *_current_assembler;
+  int _embedded_opmask_register_specifier; // declared last so declaration order matches the constructor's initializer order
 
@@ -2166,6 +2179,11 @@
     }
   }
 
+  // Set embedded opmask register specifier (low three bits of the mask register encoding).
+  void set_embedded_opmask_register_specifier(KRegister mask) {
+    _embedded_opmask_register_specifier = (*mask).encoding() & 0x7;
+  }
+
 };
 
 #endif // CPU_X86_VM_ASSEMBLER_X86_HPP
--- old/src/cpu/x86/vm/macroAssembler_x86.cpp	2016-04-22 17:28:00.464342600 -0700
+++ new/src/cpu/x86/vm/macroAssembler_x86.cpp	2016-04-22 17:28:00.043318500 -0700
@@ -9429,6 +9429,7 @@
 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale,
   Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){
   assert(UseSSE42Intrinsics, "SSE4.2 must be enabled.");
+  Label VECTOR64_LOOP, VECTOR64_TAIL, VECTOR64_NOT_EQUAL, VECTOR32_TAIL;
   Label VECTOR32_LOOP, VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP;
   Label VECTOR16_TAIL, VECTOR8_TAIL, VECTOR4_TAIL;
   Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL;
@@ -9441,11 +9442,62 @@
   shlq(length);
   xorq(result, result);
 
+  if ((UseAVX > 2) &&
+      VM_Version::supports_avx512vlbw()) {
+    set_programmed_mask_reg();  // opening of the stub context for programming mask registers
+    cmpq(length, 64);
+    jcc(Assembler::less, VECTOR32_TAIL);
+    movq(tmp1, length);
+    andq(tmp1, 0x3F);      // tail count
+    andq(length, ~(0x3F)); // vector count
+
+    bind(VECTOR64_LOOP);
+    // AVX512 code to compare 64 byte vectors.
+    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
+    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
+    kortestql(k7, k7);
+    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
+    addq(result, 64);
+    subq(length, 64);
+    jccb(Assembler::notZero, VECTOR64_LOOP);
+
+    // All full 64 byte vectors matched; finished unless a tail remains.
+    testq(tmp1, tmp1);
+    jcc(Assembler::zero, SAME_TILL_END);
+
+    bind(VECTOR64_TAIL);
+    // AVX512 code to compare the remaining tail of up to 63 bytes under an opmask.
+    // Save k1
+    kmovql(k3, k1);
+    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
+    shlxq(tmp2, tmp2, tmp1);
+    notq(tmp2);
+    kmovql(k1, tmp2);
+
+    evmovdqub(k1, false, rymm0, Address(obja, result), Assembler::AVX_512bit);
+    evpcmpeqb(k1, false, k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
+
+    ktestql(k7, k1);
+    // Restore k1
+    kmovql(k1, k3);
+    jcc(Assembler::below, SAME_TILL_END);     // CF set: no mismatch in the tail
+
+    bind(VECTOR64_NOT_EQUAL);
+    kmovql(tmp1, k7);
+    notq(tmp1);
+    tzcntq(tmp1, tmp1);
+    addq(result, tmp1);
+    shrq(result);
+    jmp(DONE);
+    bind(VECTOR32_TAIL);
+    clear_programmed_mask_reg();   // closing of the stub context for programming mask registers
+  }
+
   cmpq(length, 8);
   jcc(Assembler::equal, VECTOR8_LOOP);
   jcc(Assembler::less, VECTOR4_TAIL);
 
-  if (UseAVX >= 2){
+  if (UseAVX >= 2) {
 
     cmpq(length, 16);
     jcc(Assembler::equal, VECTOR16_LOOP);
@@ -9553,7 +9605,7 @@
   jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
   jmpb(SAME_TILL_END);
 
-  if (UseAVX >= 2){
+  if (UseAVX >= 2) {
     bind(VECTOR32_NOT_EQUAL);
     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit);
     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit);
@@ -9566,7 +9618,7 @@
   }
 
   bind(VECTOR16_NOT_EQUAL);
-  if (UseAVX >= 2){
+  if (UseAVX >= 2) {
     vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit);
     vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit);
     pxor(rymm0, rymm2);
@@ -9597,7 +9649,6 @@
   bind(DONE);
 }
 
-
 //Helper functions for square_to_len()
 
 /**