--- old/src/cpu/x86/vm/assembler_x86.cpp 2014-07-25 15:31:05.000000000 -0700 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2014-07-25 15:31:05.000000000 -0700 @@ -3854,6 +3854,15 @@ } // Carry-Less Multiplication Quadword +void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) { + assert(VM_Version::supports_clmul(), ""); + int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A); + emit_int8(0x44); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8((unsigned char)mask); +} + +// Carry-Less Multiplication Quadword void Assembler::vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask) { assert(VM_Version::supports_avx() && VM_Version::supports_clmul(), ""); bool vector256 = false; --- old/src/cpu/x86/vm/assembler_x86.hpp 2014-07-25 15:31:06.000000000 -0700 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2014-07-25 15:31:06.000000000 -0700 @@ -1837,6 +1837,7 @@ void vpbroadcastd(XMMRegister dst, XMMRegister src); // Carry-Less Multiplication Quadword + void pclmulqdq(XMMRegister dst, XMMRegister src, int mask); void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask); // AVX instruction which is used to clear upper 128 bits of YMM registers and --- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-07-25 15:31:06.000000000 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-07-25 15:31:06.000000000 -0700 @@ -7316,17 +7316,34 @@ * Fold 128-bit data chunk */ void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { - vpclmulhdq(xtmp, xK, xcrc); // [123:64] - vpclmulldq(xcrc, xK, xcrc); // [63:0] - vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */); - pxor(xcrc, xtmp); + if (UseAVX > 0) { + vpclmulhdq(xtmp, xK, xcrc); // [123:64] + vpclmulldq(xcrc, xK, xcrc); // [63:0] + vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */); + pxor(xcrc, xtmp); + } else { + movdqa(xtmp, xcrc); + pclmulhdq(xtmp, xK); // [123:64] + pclmulldq(xcrc, xK); // [63:0] + pxor(xcrc, xtmp); + movdqu(xtmp, Address(buf, offset)); + pxor(xcrc, xtmp); + } } void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { - vpclmulhdq(xtmp, xK, xcrc); - vpclmulldq(xcrc, xK, xcrc); - pxor(xcrc, xbuf); - pxor(xcrc, xtmp); + if (UseAVX > 0) { + vpclmulhdq(xtmp, xK, xcrc); + vpclmulldq(xcrc, xK, xcrc); + pxor(xcrc, xbuf); + pxor(xcrc, xtmp); + } else { + movdqa(xtmp, xcrc); + pclmulhdq(xtmp, xK); + pclmulldq(xcrc, xK); + pxor(xcrc, xbuf); + pxor(xcrc, xtmp); + } } /** @@ -7444,9 +7461,17 @@ // Fold 128 bits in xmm1 down into 32 bits in crc register. BIND(L_fold_128b); movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr())); - vpclmulqdq(xmm2, xmm0, xmm1, 0x1); - vpand(xmm3, xmm0, xmm2, false /* vector256 */); - vpclmulqdq(xmm0, xmm0, xmm3, 0x1); + if (UseAVX > 0) { + vpclmulqdq(xmm2, xmm0, xmm1, 0x1); + vpand(xmm3, xmm0, xmm2, false /* vector256 */); + vpclmulqdq(xmm0, xmm0, xmm3, 0x1); + } else { + movdqa(xmm2, xmm0); + pclmulqdq(xmm2, xmm1, 0x1); + movdqa(xmm3, xmm0); + pand(xmm3, xmm2); + pclmulqdq(xmm0, xmm3, 0x1); + } psrldq(xmm1, 8); psrldq(xmm2, 4); pxor(xmm0, xmm1); --- old/src/cpu/x86/vm/macroAssembler_x86.hpp 2014-07-25 15:31:07.000000000 -0700 +++ new/src/cpu/x86/vm/macroAssembler_x86.hpp 2014-07-25 15:31:07.000000000 -0700 @@ -966,6 +966,16 @@ void mulss(XMMRegister dst, Address src) { Assembler::mulss(dst, src); } void mulss(XMMRegister dst, AddressLiteral src); + // Carry-Less Multiplication Quadword + void pclmulldq(XMMRegister dst, XMMRegister src) { + // 0x00 - multiply lower 64 bits [0:63] + Assembler::pclmulqdq(dst, src, 0x00); + } + void pclmulhdq(XMMRegister dst, XMMRegister src) { + // 0x11 - multiply upper 64 bits [64:127] + Assembler::pclmulqdq(dst, src, 0x11); + } + void sqrtsd(XMMRegister dst, XMMRegister src) { Assembler::sqrtsd(dst, src); } void sqrtsd(XMMRegister dst, Address src) { Assembler::sqrtsd(dst, src); } void sqrtsd(XMMRegister dst, AddressLiteral src); --- old/src/cpu/x86/vm/vm_version_x86.cpp 2014-07-25 15:31:08.000000000 -0700 +++ new/src/cpu/x86/vm/vm_version_x86.cpp 2014-07-25 15:31:07.000000000 -0700 @@ -559,7 +559,7 @@ FLAG_SET_DEFAULT(UseCLMUL, false); } - if (UseCLMUL && (UseAVX > 0) && (UseSSE > 2)) { + if (UseCLMUL && (UseSSE > 2)) { if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) { UseCRC32Intrinsics = true; } @@ -805,6 +805,21 @@ } } } + if ((cpu_family() == 0x06) && + ((extended_cpu_model() == 0x36) || // Centerton + (extended_cpu_model() == 0x37) || // Silvermont + (extended_cpu_model() == 0x4D))) { +#ifdef COMPILER2 + if (FLAG_IS_DEFAULT(OptoScheduling)) { + OptoScheduling = true; + } +#endif + if (supports_sse4_2()) { // Silvermont + if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) { + UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus + } + } + } } // Use count leading zeros count instruction if available. @@ -892,23 +907,25 @@ AllocatePrefetchDistance = allocate_prefetch_distance(); AllocatePrefetchStyle = allocate_prefetch_style(); - if( is_intel() && cpu_family() == 6 && supports_sse3() ) { - if( AllocatePrefetchStyle == 2 ) { // watermark prefetching on Core + if (is_intel() && cpu_family() == 6 && supports_sse3()) { + if (AllocatePrefetchStyle == 2) { // watermark prefetching on Core #ifdef _LP64 AllocatePrefetchDistance = 384; #else AllocatePrefetchDistance = 320; #endif } - if( supports_sse4_2() && supports_ht() ) { // Nehalem based cpus + if (supports_sse4_2() && supports_ht()) { // Nehalem based cpus AllocatePrefetchDistance = 192; AllocatePrefetchLines = 4; + } #ifdef COMPILER2 - if (AggressiveOpts && FLAG_IS_DEFAULT(UseFPUForSpilling)) { + if (supports_sse4_2()) { + if (FLAG_IS_DEFAULT(UseFPUForSpilling)) { FLAG_SET_DEFAULT(UseFPUForSpilling, true); } -#endif } +#endif } assert(AllocatePrefetchDistance % AllocatePrefetchStepSize == 0, "invalid value"); --- old/src/share/vm/opto/lcm.cpp 2014-07-25 15:31:08.000000000 -0700 +++ new/src/share/vm/opto/lcm.cpp 2014-07-25 15:31:08.000000000 -0700 @@ -464,7 +464,9 @@ iop == Op_CreateEx || // Create-exception must start block iop == Op_CheckCastPP ) { - worklist.map(i,worklist.pop()); + // select the node n + // remove n from worklist and retain the order of remaining nodes + worklist.remove((uint)i); return n; } @@ -550,7 +552,9 @@ assert(idx >= 0, "index should be set"); Node *n = worklist[(uint)idx]; // Get the winner - worklist.map((uint)idx, worklist.pop()); // Compress worklist + // select the node n + // remove n from worklist and retain the order of remaining nodes + worklist.remove((uint)idx); return n; } --- old/src/share/vm/opto/superword.cpp 2014-07-25 15:31:09.000000000 -0700 +++ new/src/share/vm/opto/superword.cpp 2014-07-25 15:31:09.000000000 -0700 @@ -1378,6 +1378,20 @@ if (n->is_Load()) { Node* ctl = n->in(MemNode::Control); Node* mem = first->in(MemNode::Memory); + SWPointer p1(n->as_Mem(), this); + // Identify the memory dependency for the new loadVector node by + // walking up through memory chain. + // This is done to give flexibility to the new loadVector node so that + // it can move above independent storeVector nodes. + while (mem->is_StoreVector()) { + SWPointer p2(mem->as_Mem(), this); + int cmp = p1.cmp(p2); + if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) { + mem = mem->in(MemNode::Memory); + } else { + break; // dependent memory + } + } Node* adr = low_adr->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); vn = LoadVectorNode::make(C, opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n));