# HG changeset patch
# User goetz
# Date 1436258409 -7200
# Node ID babf6eebf45005b7e4d23cff2caa7b8a7e2523e2
# Parent  d7f63963925f9e8ffeeb094a0a577b9ae78ab349
8130654: ppc: implement MultiplyToLen intrinsic
Contributed-by: Peter.Januschke@sap.com

diff --git a/src/cpu/ppc/vm/frame_ppc.inline.hpp b/src/cpu/ppc/vm/frame_ppc.inline.hpp
--- a/src/cpu/ppc/vm/frame_ppc.inline.hpp
+++ b/src/cpu/ppc/vm/frame_ppc.inline.hpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 2000, 2014, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2012, 2014 SAP AG. All rights reserved.
+ * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2012, 2015 SAP AG. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -225,7 +225,7 @@
   return (BasicObjectLock *) get_ijava_state();
 }
 
-// SAPJVM ASc 2012-11-21. Return register stack slot addr at which currently interpreted method is found
+// Return register stack slot addr at which currently interpreted method is found.
 inline Method** frame::interpreter_frame_method_addr() const {
   return (Method**) &(get_ijava_state()->method);
 }
diff --git a/src/cpu/ppc/vm/macroAssembler_ppc.cpp b/src/cpu/ppc/vm/macroAssembler_ppc.cpp
--- a/src/cpu/ppc/vm/macroAssembler_ppc.cpp
+++ b/src/cpu/ppc/vm/macroAssembler_ppc.cpp
@@ -3433,6 +3433,376 @@
   bind(Ldone_false);
 }
 
+// dest_lo += src1 + src2
+// dest_hi += carry1 + carry2
+void MacroAssembler::add2_with_carry(Register dest_hi,
+                                     Register dest_lo,
+                                     Register src1, Register src2) {
+  li(R0, 0);
+  addc(dest_lo, dest_lo, src1);
+  adde(dest_hi, dest_hi, R0);
+  addc(dest_lo, dest_lo, src2);
+  adde(dest_hi, dest_hi, R0);
+}
+
+// Multiply 64 bit by 64 bit first loop.
+void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
+                                           Register x_xstart,
+                                           Register y, Register y_idx,
+                                           Register z,
+                                           Register carry,
+                                           Register product_high, Register product,
+                                           Register idx, Register kdx,
+                                           Register tmp) {
+  //  jlong carry, x[], y[], z[];
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    huge_128 product = y[idx] * x[xstart] + carry;
+  //    z[kdx] = (jlong)product;
+  //    carry  = (jlong)(product >>> 64);
+  //  }
+  //  z[xstart] = carry;
+
+  Label L_first_loop, L_first_loop_exit;
+  Label L_one_x, L_one_y, L_multiply;
+
+  addic_(xstart, xstart, -1);
+  blt(CCR0, L_one_x);   // Special case: length of x is 1.
+
+  // Load next two integers of x.
+  sldi(tmp, xstart, LogBytesPerInt);
+  ldx(x_xstart, x, tmp);
+#ifdef VM_LITTLE_ENDIAN
+  rldicl(x_xstart, x_xstart, 32, 0);
+#endif
+
+  align(32, 16);
+  bind(L_first_loop);
+
+  cmpdi(CCR0, idx, 1);
+  blt(CCR0, L_first_loop_exit);
+  addi(idx, idx, -2);
+  beq(CCR0, L_one_y);
+
+  // Load next two integers of y.
+  sldi(tmp, idx, LogBytesPerInt);
+  ldx(y_idx, y, tmp);
+#ifdef VM_LITTLE_ENDIAN
+  rldicl(y_idx, y_idx, 32, 0);
+#endif
+
+
+  bind(L_multiply);
+  multiply64(product_high, product, x_xstart, y_idx);
+
+  li(tmp, 0);
+  addc(product, product, carry);         // Add carry to result.
+  adde(product_high, product_high, tmp); // Add carry of the last addition.
+  addi(kdx, kdx, -2);
+
+  // Store result.
+#ifdef VM_LITTLE_ENDIAN
+  rldicl(product, product, 32, 0);
+#endif
+  sldi(tmp, kdx, LogBytesPerInt);
+  stdx(product, z, tmp);
+  mr_if_needed(carry, product_high);
+  b(L_first_loop);
+
+
+  bind(L_one_y); // Load one 32 bit portion of y as (0,value).
+
+  lwz(y_idx, 0, y);
+  b(L_multiply);
+
+
+  bind(L_one_x); // Load one 32 bit portion of x as (0,value).
+
+  lwz(x_xstart, 0, x);
+  b(L_first_loop);
+
+  bind(L_first_loop_exit);
+}
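
For cross-checking the assembler against the Java code it replaces: multiply_64_x_64_loop corresponds to the first loop of java.math.BigInteger.multiplyToLen, which works on big-endian 32-bit digits; the PPC code simply consumes two such digits per iteration through a 64x64->128-bit multiply. A minimal Java rendering of that first loop (an editorial sketch, not code from this changeset; the helper name is made up) is:

    // Reference shape of the first loop: multiply x[xstart] into z.
    // The previous contents of z are not involved yet.
    static void firstLoop(int[] x, int xstart, int[] y, int ystart, int[] z) {
        final long LONG_MASK = 0xffffffffL;
        long carry = 0;
        for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
            long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
            z[kdx] = (int) product;
            carry = product >>> 32;
        }
        z[xstart] = (int) carry;
    }
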
+
+// Multiply 64 bit by 64 bit and add 128 bit.
+void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
+                                            Register z, Register yz_idx,
+                                            Register idx, Register carry,
+                                            Register product_high, Register product,
+                                            Register tmp, int offset) {
+
+  //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
+  //  z[kdx] = (jlong)product;
+
+  sldi(tmp, idx, LogBytesPerInt);
+  if (offset) {
+    addi(tmp, tmp, offset);
+  }
+  ldx(yz_idx, y, tmp);
+#ifdef VM_LITTLE_ENDIAN
+  rldicl(yz_idx, yz_idx, 32, 0);
+#endif
+
+  multiply64(product_high, product, x_xstart, yz_idx);
+  ldx(yz_idx, z, tmp);
+#ifdef VM_LITTLE_ENDIAN
+  rldicl(yz_idx, yz_idx, 32, 0);
+#endif
+
+  add2_with_carry(product_high, product, carry, yz_idx);
+
+  sldi(tmp, idx, LogBytesPerInt);
+  if (offset) {
+    addi(tmp, tmp, offset);
+  }
+#ifdef VM_LITTLE_ENDIAN
+  rldicl(product, product, 32, 0);
+#endif
+  stdx(product, z, tmp);
+}
+
+// Multiply 128 bit by 128 bit. Unrolled inner loop.
+void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
+                                             Register y, Register z,
+                                             Register yz_idx, Register idx, Register carry,
+                                             Register product_high, Register product,
+                                             Register carry2, Register tmp) {
+
+  //  jlong carry, x[], y[], z[];
+  //  int kdx = ystart+1;
+  //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+  //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
+  //    z[kdx+idx+1] = (jlong)product;
+  //    jlong carry2 = (jlong)(product >>> 64);
+  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
+  //    z[kdx+idx] = (jlong)product;
+  //    carry = (jlong)(product >>> 64);
+  //  }
+  //  idx += 2;
+  //  if (idx > 0) {
+  //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
+  //    z[kdx+idx] = (jlong)product;
+  //    carry = (jlong)(product >>> 64);
+  //  }
+
+  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+  const Register jdx = R0;
+
+  // Scale the index.
+  srdi_(jdx, idx, 2);
+  beq(CCR0, L_third_loop_exit);
+  mtctr(jdx);
+
+  align(32, 16);
+  bind(L_third_loop);
+
+  addi(idx, idx, -4);
+
+  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
+  mr_if_needed(carry2, product_high);
+
+  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
+  mr_if_needed(carry, product_high);
+  bdnz(L_third_loop);
+
+  bind(L_third_loop_exit); // Handle any left-over operand parts.
+
+  andi_(idx, idx, 0x3);
+  beq(CCR0, L_post_third_loop_done);
+
+  Label L_check_1;
+
+  addic_(idx, idx, -2);
+  blt(CCR0, L_check_1);
+
+  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
+  mr_if_needed(carry, product_high);
+
+  bind(L_check_1);
+
+  addi(idx, idx, 0x2);
+  andi_(idx, idx, 0x1);
+  addic_(idx, idx, -1);
+  blt(CCR0, L_post_third_loop_done);
+
+  sldi(tmp, idx, LogBytesPerInt);
+  lwzx(yz_idx, y, tmp);
+  multiply64(product_high, product, x_xstart, yz_idx);
+  lwzx(yz_idx, z, tmp);
+
+  add2_with_carry(product_high, product, yz_idx, carry);
+
+  sldi(tmp, idx, LogBytesPerInt);
+  stwx(product, z, tmp);
+  srdi(product, product, 32);
+
+  sldi(product_high, product_high, 32);
+  orr(product, product, product_high);
+  mr_if_needed(carry, product);
+
+  bind(L_post_third_loop_done);
+} // multiply_128_x_128_loop
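
multiply_add_128_x_128 and multiply_128_x_128_loop together implement the inner loop of the reference algorithm, unrolled so that two 64-bit chunks of y are multiplied by x[xstart] and accumulated into z per iteration. The corresponding 32-bit Java reference for the second (outer) and third (inner) loops — again an editorial sketch with a made-up helper name, not code from this changeset — is:

    // Reference shape of the second (outer) and third (inner) loops:
    // multiply-accumulate every remaining digit of x into z.
    static void secondAndThirdLoops(int[] x, int xstart, int[] y, int ystart, int[] z) {
        final long LONG_MASK = 0xffffffffL;
        for (int i = xstart - 1; i >= 0; i--) {
            long carry = 0;
            for (int jdx = ystart, k = ystart + 1 + i; jdx >= 0; jdx--, k--) {
                long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK)
                             + (z[k] & LONG_MASK) + carry;
                z[k] = (int) product;
                carry = product >>> 32;
            }
            z[i] = (int) carry;
        }
    }
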
+
+void MacroAssembler::multiply_to_len(Register x, Register xlen,
+                                     Register y, Register ylen,
+                                     Register z, Register zlen,
+                                     Register tmp1, Register tmp2,
+                                     Register tmp3, Register tmp4,
+                                     Register tmp5, Register tmp6,
+                                     Register tmp7, Register tmp8,
+                                     Register tmp9, Register tmp10,
+                                     Register tmp11, Register tmp12,
+                                     Register tmp13) {
+
+  ShortBranchVerifier sbv(this);
+
+  assert_different_registers(x, xlen, y, ylen, z, zlen,
+                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
+  assert_different_registers(x, xlen, y, ylen, z, zlen,
+                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
+  assert_different_registers(x, xlen, y, ylen, z, zlen,
+                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
+
+  const Register idx = tmp1;
+  const Register kdx = tmp2;
+  const Register xstart = tmp3;
+
+  const Register y_idx = tmp4;
+  const Register carry = tmp5;
+  const Register product = tmp6;
+  const Register product_high = tmp7;
+  const Register x_xstart = tmp8;
+  const Register tmp = tmp9;
+
+  // First Loop.
+  //
+  //  final static long LONG_MASK = 0xffffffffL;
+  //  int xstart = xlen - 1;
+  //  int ystart = ylen - 1;
+  //  long carry = 0;
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+  //    z[kdx] = (int)product;
+  //    carry = product >>> 32;
+  //  }
+  //  z[xstart] = (int)carry;
+
+  mr_if_needed(idx, ylen);  // idx = ylen
+  mr_if_needed(kdx, zlen);  // kdx = xlen + ylen
+  li(carry, 0);             // carry = 0
+
+  Label L_done;
+
+  addic_(xstart, xlen, -1);
+  blt(CCR0, L_done);
+
+  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
+                        carry, product_high, product, idx, kdx, tmp);
+
+  Label L_second_loop;
+
+  cmpdi(CCR0, kdx, 0);
+  beq(CCR0, L_second_loop);
+
+  Label L_carry;
+
+  addic_(kdx, kdx, -1);
+  beq(CCR0, L_carry);
+
+  // Store lower 32 bits of carry.
+  sldi(tmp, kdx, LogBytesPerInt);
+  stwx(carry, z, tmp);
+  srdi(carry, carry, 32);
+  addi(kdx, kdx, -1);
+
+
+  bind(L_carry);
+
+  // Store upper 32 bits of carry.
+  sldi(tmp, kdx, LogBytesPerInt);
+  stwx(carry, z, tmp);
+
+  // Second and third (nested) loops.
+  //
+  //  for (int i = xstart-1; i >= 0; i--) { // Second loop
+  //    carry = 0;
+  //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+  //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+  //                     (z[k] & LONG_MASK) + carry;
+  //      z[k] = (int)product;
+  //      carry = product >>> 32;
+  //    }
+  //    z[i] = (int)carry;
+  //  }
+  //
+  //  x[i] is loaded into x_xstart; i is tracked in xstart, the running carry in carry.
+
+  bind(L_second_loop);
+
+  li(carry, 0);                   // carry = 0;
+
+  addic_(xstart, xstart, -1);     // i = xstart-1;
+  blt(CCR0, L_done);
+
+  Register zsave = tmp10;
+
+  mr(zsave, z);
+
+
+  Label L_last_x;
+
+  sldi(tmp, xstart, LogBytesPerInt);
+  add(z, z, tmp);                 // z = z + k - j
+  addi(z, z, 4);
+  addic_(xstart, xstart, -1);     // i = xstart-1;
+  blt(CCR0, L_last_x);
+
+  sldi(tmp, xstart, LogBytesPerInt);
+  ldx(x_xstart, x, tmp);
+#ifdef VM_LITTLE_ENDIAN
+  rldicl(x_xstart, x_xstart, 32, 0);
+#endif
+
+
+  Label L_third_loop_prologue;
+
+  bind(L_third_loop_prologue);
+
+  Register xsave = tmp11;
+  Register xlensave = tmp12;
+  Register ylensave = tmp13;
+
+  mr(xsave, x);
+  mr(xlensave, xstart);
+  mr(ylensave, ylen);
+
+
+  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
+                          carry, product_high, product, x, tmp);
+
+  mr(z, zsave);
+  mr(x, xsave);
+  mr(xlen, xlensave);   // This is the decrement of the loop counter!
+  mr(ylen, ylensave);
+
+  addi(tmp3, xlen, 1);
+  sldi(tmp, tmp3, LogBytesPerInt);
+  stwx(carry, z, tmp);
+  addic_(tmp3, tmp3, -1);
+  blt(CCR0, L_done);
+
+  srdi(carry, carry, 32);
+  sldi(tmp, tmp3, LogBytesPerInt);
+  stwx(carry, z, tmp);
+  b(L_second_loop);
+
+  // Next infrequent code is moved outside loops.
+  bind(L_last_x);
+
+  lwz(x_xstart, 0, x);
+  b(L_third_loop_prologue);
+
+  bind(L_done);
+} // multiply_to_len
 
 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
 #ifdef ASSERT
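
Stitching the two loop sketches together gives a self-contained reference that can be compiled and cross-checked against BigInteger. Class, method, and variable names below are the editor's own; only the algorithm mirrors java.math.BigInteger.multiplyToLen (big-endian 32-bit digits, z of length xlen + ylen):

    import java.math.BigInteger;
    import java.nio.ByteBuffer;

    public class MultiplyToLenRef {
        static final long LONG_MASK = 0xffffffffL;

        // z must have length xlen + ylen; digits are big-endian 32-bit words.
        static int[] multiplyToLen(int[] x, int xlen, int[] y, int ylen, int[] z) {
            int xstart = xlen - 1;
            int ystart = ylen - 1;
            long carry = 0;
            for (int idx = ystart, kdx = ystart + 1 + xstart; idx >= 0; idx--, kdx--) {
                long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
                z[kdx] = (int) product;
                carry = product >>> 32;
            }
            z[xstart] = (int) carry;
            for (int i = xstart - 1; i >= 0; i--) {
                carry = 0;
                for (int jdx = ystart, k = ystart + 1 + i; jdx >= 0; jdx--, k--) {
                    long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK)
                                 + (z[k] & LONG_MASK) + carry;
                    z[k] = (int) product;
                    carry = product >>> 32;
                }
                z[i] = (int) carry;
            }
            return z;
        }

        static BigInteger toBigInteger(int[] mag) {
            ByteBuffer bb = ByteBuffer.allocate(mag.length * 4);
            for (int digit : mag) bb.putInt(digit);
            return new BigInteger(1, bb.array());
        }

        public static void main(String[] args) {
            int[] x = { 0x12345678, 0x9abcdef0, 0x0fedcba9 };
            int[] y = { 0x87654321, 0x0badcafe };
            int[] z = multiplyToLen(x, x.length, y, y.length, new int[x.length + y.length]);
            // Cross-check against BigInteger; expected output: true
            System.out.println(toBigInteger(z).equals(toBigInteger(x).multiply(toBigInteger(y))));
        }
    }
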
diff --git a/src/cpu/ppc/vm/macroAssembler_ppc.hpp b/src/cpu/ppc/vm/macroAssembler_ppc.hpp
--- a/src/cpu/ppc/vm/macroAssembler_ppc.hpp
+++ b/src/cpu/ppc/vm/macroAssembler_ppc.hpp
@@ -677,6 +677,31 @@
   void char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval,
                              Register result_reg, Register tmp1_reg, Register tmp2_reg);
 
+  // Emitters for BigInteger.multiplyToLen intrinsic.
+  inline void multiply64(Register dest_hi, Register dest_lo,
+                         Register x, Register y);
+  void add2_with_carry(Register dest_hi, Register dest_lo,
+                       Register src1, Register src2);
+  void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                             Register y, Register y_idx, Register z,
+                             Register carry, Register product_high, Register product,
+                             Register idx, Register kdx, Register tmp);
+  void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+                              Register yz_idx, Register idx, Register carry,
+                              Register product_high, Register product, Register tmp,
+                              int offset);
+  void multiply_128_x_128_loop(Register x_xstart,
+                               Register y, Register z,
+                               Register yz_idx, Register idx, Register carry,
+                               Register product_high, Register product,
+                               Register carry2, Register tmp);
+  void multiply_to_len(Register x, Register xlen,
+                       Register y, Register ylen,
+                       Register z, Register zlen,
+                       Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
+                       Register tmp6, Register tmp7, Register tmp8, Register tmp9, Register tmp10,
+                       Register tmp11, Register tmp12, Register tmp13);
+
   //
   // Debugging
   //
diff --git a/src/cpu/ppc/vm/macroAssembler_ppc.inline.hpp b/src/cpu/ppc/vm/macroAssembler_ppc.inline.hpp
--- a/src/cpu/ppc/vm/macroAssembler_ppc.inline.hpp
+++ b/src/cpu/ppc/vm/macroAssembler_ppc.inline.hpp
@@ -423,6 +423,13 @@
   twi(traptoEqual | traptoGreaterThanUnsigned, a/*reg a*/, si16);
 }
 
+// unsigned integer multiplication 64*64 -> 128 bits
+inline void MacroAssembler::multiply64(Register dest_hi, Register dest_lo,
+                                       Register x, Register y) {
+  mulld(dest_lo, x, y);
+  mulhdu(dest_hi, x, y);
+}
+
 #if defined(ABI_ELFv2)
 inline address MacroAssembler::function_entry() { return pc(); }
 #else
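
multiply64 is the one PPC-specific building block: mulld and mulhdu return the low and high 64 bits of the unsigned 64x64-bit product that the loops above treat as a 128-bit value. The portable equivalent, shown here as an editorial Java sketch cross-checked against BigInteger (not part of this changeset), makes explicit what the two instructions compute:

    import java.math.BigInteger;

    public class Mul64x64 {
        // Returns { high 64 bits, low 64 bits } of the unsigned product x * y,
        // i.e. what mulhdu and mulld compute on PPC64.
        static long[] mul128(long x, long y) {
            long xLo = x & 0xffffffffL, xHi = x >>> 32;
            long yLo = y & 0xffffffffL, yHi = y >>> 32;
            long lolo = xLo * yLo;
            long t = xHi * yLo + (lolo >>> 32);       // cannot overflow 64 bits
            long u = xLo * yHi + (t & 0xffffffffL);   // cannot overflow 64 bits
            long lo = (u << 32) | (lolo & 0xffffffffL);
            long hi = xHi * yHi + (t >>> 32) + (u >>> 32);
            return new long[] { hi, lo };
        }

        public static void main(String[] args) {
            long x = 0xfedcba9876543210L, y = 0x0f1e2d3c4b5a6978L;
            long[] p = mul128(x, y);
            BigInteger expected = new BigInteger(Long.toUnsignedString(x))
                    .multiply(new BigInteger(Long.toUnsignedString(y)));
            BigInteger actual = new BigInteger(Long.toUnsignedString(p[0]))
                    .shiftLeft(64).add(new BigInteger(Long.toUnsignedString(p[1])));
            System.out.println(expected.equals(actual)); // expected output: true
        }
    }
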
diff --git a/src/cpu/ppc/vm/ppc.ad b/src/cpu/ppc/vm/ppc.ad
--- a/src/cpu/ppc/vm/ppc.ad
+++ b/src/cpu/ppc/vm/ppc.ad
@@ -10930,7 +10930,7 @@
 instruct cmpFastLock(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2, iRegPdst tmp3) %{
   match(Set crx (FastLock oop box));
   effect(TEMP tmp1, TEMP tmp2, TEMP tmp3);
-  predicate(/*(!UseNewFastLockPPC64 || UseBiasedLocking) &&*/ !Compile::current()->use_rtm());
+  predicate(!Compile::current()->use_rtm());
 
   format %{ "FASTLOCK $oop, $box, $tmp1, $tmp2, $tmp3" %}
   ins_encode %{
diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
--- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp
+++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp
@@ -2053,6 +2053,79 @@
     __ blr();
   }
 
+  // Stub for BigInteger::multiplyToLen()
+  //
+  // Arguments:
+  //
+  // Input:
+  //   R3 - x address
+  //   R4 - x length
+  //   R5 - y address
+  //   R6 - y length
+  //   R7 - z address
+  //   R8 - z length
+  //
+  address generate_multiplyToLen() {
+
+    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+
+    address start = __ function_entry();
+
+    const Register x     = R3;
+    const Register xlen  = R4;
+    const Register y     = R5;
+    const Register ylen  = R6;
+    const Register z     = R7;
+    const Register zlen  = R8;
+
+    const Register tmp1  = R2; // TOC not used.
+    const Register tmp2  = R9;
+    const Register tmp3  = R10;
+    const Register tmp4  = R11;
+    const Register tmp5  = R12;
+
+    // non-volatile regs
+    const Register tmp6  = R31;
+    const Register tmp7  = R30;
+    const Register tmp8  = R29;
+    const Register tmp9  = R28;
+    const Register tmp10 = R27;
+    const Register tmp11 = R26;
+    const Register tmp12 = R25;
+    const Register tmp13 = R24;
+
+    BLOCK_COMMENT("Entry:");
+
+    // Save non-volatile regs (frameless).
+    int current_offs = 8;
+    __ std(R24, -current_offs, R1_SP); current_offs += 8;
+    __ std(R25, -current_offs, R1_SP); current_offs += 8;
+    __ std(R26, -current_offs, R1_SP); current_offs += 8;
+    __ std(R27, -current_offs, R1_SP); current_offs += 8;
+    __ std(R28, -current_offs, R1_SP); current_offs += 8;
+    __ std(R29, -current_offs, R1_SP); current_offs += 8;
+    __ std(R30, -current_offs, R1_SP); current_offs += 8;
+    __ std(R31, -current_offs, R1_SP);
+
+    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5,
+                       tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
+
+    // Restore non-volatile regs.
+    current_offs = 8;
+    __ ld(R24, -current_offs, R1_SP); current_offs += 8;
+    __ ld(R25, -current_offs, R1_SP); current_offs += 8;
+    __ ld(R26, -current_offs, R1_SP); current_offs += 8;
+    __ ld(R27, -current_offs, R1_SP); current_offs += 8;
+    __ ld(R28, -current_offs, R1_SP); current_offs += 8;
+    __ ld(R29, -current_offs, R1_SP); current_offs += 8;
+    __ ld(R30, -current_offs, R1_SP); current_offs += 8;
+    __ ld(R31, -current_offs, R1_SP);
+
+    __ blr();  // Return to caller.
+
+    return start;
+  }
+
   // Initialization
   void generate_initial() {
     // Generates all stubs and initializes the entry points
@@ -2102,6 +2175,12 @@
     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);
+
+#ifdef COMPILER2
+    if (UseMultiplyToLenIntrinsic) {
+      StubRoutines::_multiplyToLen = generate_multiplyToLen();
+    }
+#endif
   }
 
  public:
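
The stub is only taken once C2 has compiled a caller of BigInteger.multiplyToLen, which is the base case of BigInteger.multiply. A throwaway harness along the following lines (editor's example, not part of the change; the class name and iteration count are arbitrary) exercises that path; on a patched build it should produce the same output with and without -XX:-UseMultiplyToLenIntrinsic.

    import java.math.BigInteger;
    import java.util.Random;

    public class ExerciseMultiplyToLen {
        public static void main(String[] args) {
            Random rnd = new Random(42);
            BigInteger checksum = BigInteger.ZERO;
            for (int i = 0; i < 200_000; i++) {   // should be enough iterations for C2 to compile and use the stub
                BigInteger a = new BigInteger(2048, rnd);                     // up to 64 32-bit digits
                BigInteger b = new BigInteger(1536, rnd).or(BigInteger.ONE);  // nonzero divisor for the check below
                BigInteger p = a.multiply(b);
                if (!p.divide(b).equals(a)) {     // cheap per-product consistency check
                    throw new AssertionError("bad product at iteration " + i);
                }
                checksum = checksum.xor(p);
            }
            System.out.println(checksum.bitLength());
        }
    }
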
diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp
--- a/src/cpu/ppc/vm/vm_version_ppc.cpp
+++ b/src/cpu/ppc/vm/vm_version_ppc.cpp
@@ -198,6 +198,10 @@
     FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
   }
 
+  if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+    UseMultiplyToLenIntrinsic = true;
+  }
+
   // Adjust RTM (Restricted Transactional Memory) flags.
   if (!has_tcheck() && UseRTMLocking) {
     // Can't continue because UseRTMLocking affects UseBiasedLocking flag
@@ -228,7 +232,6 @@
       warning("RTMAbortRatio must be in the range 0 to 100, resetting it to 50");
       FLAG_SET_DEFAULT(RTMAbortRatio, 50);
     }
-    FLAG_SET_ERGO(bool, UseNewFastLockPPC64, false); // Does not implement TM.
     guarantee(RTMSpinLoopCount > 0, "unsupported");
 #else
   // Only C2 does RTM locking optimization.
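
With this change, UseMultiplyToLenIntrinsic defaults to true on PPC64 builds that include C2. Whether a running VM actually enabled it can be read back through the HotSpot diagnostic MXBean; the snippet below is an editorial example, and getVMOption will throw IllegalArgumentException on builds where this C2-only flag does not exist.

    import com.sun.management.HotSpotDiagnosticMXBean;
    import java.lang.management.ManagementFactory;

    public class CheckMultiplyToLenFlag {
        public static void main(String[] args) {
            HotSpotDiagnosticMXBean hotspot =
                    ManagementFactory.getPlatformMXBean(HotSpotDiagnosticMXBean.class);
            // Expected to print "true" on a ppc64 C2 build that includes this patch.
            System.out.println(hotspot.getVMOption("UseMultiplyToLenIntrinsic").getValue());
        }
    }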