# HG changeset patch # User goetz # Date 1429094696 -7200 # Node ID aff04f621af28240e37cab0ffa7fd090b5112e6b # Parent 364dd48a2c48ace207c0aa886033a1bce448f1e6 8077838: Recent developments for ppc. Reviewed-by: kvn diff --git a/src/cpu/ppc/vm/assembler_ppc.cpp b/src/cpu/ppc/vm/assembler_ppc.cpp --- a/src/cpu/ppc/vm/assembler_ppc.cpp +++ b/src/cpu/ppc/vm/assembler_ppc.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -85,8 +85,7 @@ } // Low-level andi-one-instruction-macro. -void Assembler::andi(Register a, Register s, const int ui16) { - assert(is_uimm(ui16, 16), "must be 16-bit unsigned immediate"); +void Assembler::andi(Register a, Register s, const long ui16) { if (is_power_of_2_long(((jlong) ui16)+1)) { // pow2minus1 clrldi(a, s, 64-log2_long((((jlong) ui16)+1))); @@ -97,6 +96,7 @@ // negpow2 clrrdi(a, s, log2_long((jlong)-ui16)); } else { + assert(is_uimm(ui16, 16), "must be 16-bit unsigned immediate"); andi_(a, s, ui16); } } @@ -356,7 +356,6 @@ // 16 bit immediate offset. int Assembler::load_const_optimized(Register d, long x, Register tmp, bool return_simm16_rest) { // Avoid accidentally trying to use R0 for indexed addressing. - assert(d != R0, "R0 not allowed"); assert_different_registers(d, tmp); short xa, xb, xc, xd; // Four 16-bit chunks of const. @@ -370,6 +369,58 @@ return 0; } + int retval = 0; + if (return_simm16_rest) { + retval = xd; + x = rem << 16; + xd = 0; + } + + if (d == R0) { // Can't use addi. 
+ if (is_simm(x, 32)) { // opt 2: simm32 + lis(d, x >> 16); + if (xd) ori(d, d, (unsigned short)xd); + } else { + // 64-bit value: x = xa xb xc xd + xa = (x >> 48) & 0xffff; + xb = (x >> 32) & 0xffff; + xc = (x >> 16) & 0xffff; + bool xa_loaded = (xb & 0x8000) ? (xa != -1) : (xa != 0); + if (tmp == noreg || (xc == 0 && xd == 0)) { + if (xa_loaded) { + lis(d, xa); + if (xb) { ori(d, d, (unsigned short)xb); } + } else { + li(d, xb); + } + sldi(d, d, 32); + if (xc) { oris(d, d, (unsigned short)xc); } + if (xd) { ori( d, d, (unsigned short)xd); } + } else { + // Exploit instruction level parallelism if we have a tmp register. + bool xc_loaded = (xd & 0x8000) ? (xc != -1) : (xc != 0); + if (xa_loaded) { + lis(tmp, xa); + } + if (xc_loaded) { + lis(d, xc); + } + if (xa_loaded) { + if (xb) { ori(tmp, tmp, (unsigned short)xb); } + } else { + li(tmp, xb); + } + if (xc_loaded) { + if (xd) { ori(d, d, (unsigned short)xd); } + } else { + li(d, xd); + } + insrdi(d, tmp, 32, 0); + } + } + return retval; + } + xc = rem & 0xFFFF; // Next 16-bit chunk. rem = (rem >> 16) + ((unsigned short)xc >> 15); // Compensation for sign extend. @@ -377,28 +428,27 @@ lis(d, xc); } else { // High 32 bits needed. - if (tmp != noreg) { // opt 3: We have a temp reg. + if (tmp != noreg && (int)x != 0) { // opt 3: We have a temp reg. // No carry propagation between xc and higher chunks here (use logical instructions). xa = (x >> 48) & 0xffff; xb = (x >> 32) & 0xffff; // No sign compensation, we use lis+ori or li to allow usage of R0. - bool load_xa = (xa != 0) || (xb < 0); + bool xa_loaded = (xb & 0x8000) ? (xa != -1) : (xa != 0); bool return_xd = false; - if (load_xa) { lis(tmp, xa); } + if (xa_loaded) { lis(tmp, xa); } if (xc) { lis(d, xc); } - if (load_xa) { + if (xa_loaded) { if (xb) { ori(tmp, tmp, (unsigned short)xb); } // No addi, we support tmp == R0. 
} else { - li(tmp, xb); // non-negative + li(tmp, xb); } if (xc) { - if (return_simm16_rest && xd >= 0) { return_xd = true; } // >= 0 to avoid carry propagation after insrdi/rldimi. - else if (xd) { addi(d, d, xd); } + if (xd) { addi(d, d, xd); } } else { li(d, xd); } insrdi(d, tmp, 32, 0); - return return_xd ? xd : 0; // non-negative + return retval; } xb = rem & 0xFFFF; // Next 16-bit chunk. @@ -417,11 +467,51 @@ if (xc) { addis(d, d, xc); } } - // opt 5: Return offset to be inserted into following instruction. - if (return_simm16_rest) return xd; + if (xd) { addi(d, d, xd); } + return retval; +} - if (xd) { addi(d, d, xd); } - return 0; +// We emit only one addition to s to optimize latency. Computes d = s + x; if return_simm16_rest is set, the returned simm16 value still has to be added to d by the caller, otherwise 0 is returned. +int Assembler::add_const_optimized(Register d, Register s, long x, Register tmp, bool return_simm16_rest) { + assert(s != R0 && s != tmp, "unsupported"); + long rem = x; + + // Case 1: Can use mr or addi. + short xd = rem & 0xFFFF; // Lowest 16-bit chunk. + rem = (rem >> 16) + ((unsigned short)xd >> 15); + if (rem == 0) { + if (xd == 0) { + if (d != s) { mr(d, s); } + return 0; + } + if (return_simm16_rest && (d == s)) { // Defer xd to the caller only if d already holds s; otherwise d would be left unset. + return xd; + } + addi(d, s, xd); + return 0; + } + + // Case 2: Can use addis. + if (xd == 0) { + short xc = rem & 0xFFFF; // 2nd 16-bit chunk. + rem = (rem >> 16) + ((unsigned short)xc >> 15); // Compensation for sign extend of xc (xd is 0 here). + if (rem == 0) { + addis(d, s, xc); + return 0; + } + } + + // Other cases: load & add. + Register tmp1 = tmp, + tmp2 = noreg; + if ((d != tmp) && (d != s)) { + // Can use d. 
+ tmp1 = d; + tmp2 = tmp; + } + int simm16_rest = load_const_optimized(tmp1, x, tmp2, return_simm16_rest); + add(d, tmp1, s); + return simm16_rest; } #ifndef PRODUCT diff --git a/src/cpu/ppc/vm/assembler_ppc.hpp b/src/cpu/ppc/vm/assembler_ppc.hpp --- a/src/cpu/ppc/vm/assembler_ppc.hpp +++ b/src/cpu/ppc/vm/assembler_ppc.hpp @@ -224,10 +224,13 @@ ADDIS_OPCODE = (15u << OPCODE_SHIFT), ADDIC__OPCODE = (13u << OPCODE_SHIFT), ADDE_OPCODE = (31u << OPCODE_SHIFT | 138u << 1), + ADDME_OPCODE = (31u << OPCODE_SHIFT | 234u << 1), + ADDZE_OPCODE = (31u << OPCODE_SHIFT | 202u << 1), SUBF_OPCODE = (31u << OPCODE_SHIFT | 40u << 1), SUBFC_OPCODE = (31u << OPCODE_SHIFT | 8u << 1), SUBFE_OPCODE = (31u << OPCODE_SHIFT | 136u << 1), SUBFIC_OPCODE = (8u << OPCODE_SHIFT), + SUBFME_OPCODE = (31u << OPCODE_SHIFT | 232u << 1), SUBFZE_OPCODE = (31u << OPCODE_SHIFT | 200u << 1), DIVW_OPCODE = (31u << OPCODE_SHIFT | 491u << 1), MULLW_OPCODE = (31u << OPCODE_SHIFT | 235u << 1), @@ -657,6 +660,9 @@ SYNC_OPCODE = (31u << OPCODE_SHIFT | 598u << 1), EIEIO_OPCODE = (31u << OPCODE_SHIFT | 854u << 1), + // Wait instructions for polling. + WAIT_OPCODE = (31u << OPCODE_SHIFT | 62u << 1), + // Trap instructions TDI_OPCODE = (2u << OPCODE_SHIFT), TWI_OPCODE = (3u << OPCODE_SHIFT), @@ -666,8 +672,10 @@ // Atomics. 
LWARX_OPCODE = (31u << OPCODE_SHIFT | 20u << 1), LDARX_OPCODE = (31u << OPCODE_SHIFT | 84u << 1), + LQARX_OPCODE = (31u << OPCODE_SHIFT | 276u << 1), STWCX_OPCODE = (31u << OPCODE_SHIFT | 150u << 1), - STDCX_OPCODE = (31u << OPCODE_SHIFT | 214u << 1) + STDCX_OPCODE = (31u << OPCODE_SHIFT | 214u << 1), + STQCX_OPCODE = (31u << OPCODE_SHIFT | 182u << 1) }; @@ -1171,6 +1179,14 @@ inline void adde_( Register d, Register a, Register b); inline void subfe( Register d, Register a, Register b); inline void subfe_( Register d, Register a, Register b); + inline void addme( Register d, Register a); + inline void addme_( Register d, Register a); + inline void subfme( Register d, Register a); + inline void subfme_(Register d, Register a); + inline void addze( Register d, Register a); + inline void addze_( Register d, Register a); + inline void subfze( Register d, Register a); + inline void subfze_(Register d, Register a); inline void neg( Register d, Register a); inline void neg_( Register d, Register a); inline void mulli( Register d, Register a, int si16); @@ -1189,6 +1205,38 @@ inline void divw( Register d, Register a, Register b); inline void divw_( Register d, Register a, Register b); + // Fixed-Point Arithmetic Instructions with Overflow detection + inline void addo( Register d, Register a, Register b); + inline void addo_( Register d, Register a, Register b); + inline void subfo( Register d, Register a, Register b); + inline void subfo_( Register d, Register a, Register b); + inline void addco( Register d, Register a, Register b); + inline void addco_( Register d, Register a, Register b); + inline void subfco( Register d, Register a, Register b); + inline void subfco_( Register d, Register a, Register b); + inline void addeo( Register d, Register a, Register b); + inline void addeo_( Register d, Register a, Register b); + inline void subfeo( Register d, Register a, Register b); + inline void subfeo_( Register d, Register a, Register b); + inline void addmeo( Register d, 
Register a); + inline void addmeo_( Register d, Register a); + inline void subfmeo( Register d, Register a); + inline void subfmeo_(Register d, Register a); + inline void addzeo( Register d, Register a); + inline void addzeo_( Register d, Register a); + inline void subfzeo( Register d, Register a); + inline void subfzeo_(Register d, Register a); + inline void nego( Register d, Register a); + inline void nego_( Register d, Register a); + inline void mulldo( Register d, Register a, Register b); + inline void mulldo_( Register d, Register a, Register b); + inline void mullwo( Register d, Register a, Register b); + inline void mullwo_( Register d, Register a, Register b); + inline void divdo( Register d, Register a, Register b); + inline void divdo_( Register d, Register a, Register b); + inline void divwo( Register d, Register a, Register b); + inline void divwo_( Register d, Register a, Register b); + // extended mnemonics inline void li( Register d, int si16); inline void lis( Register d, int si16); @@ -1303,7 +1351,7 @@ inline void isel_0( Register d, ConditionRegister cr, Condition cc, Register b = noreg); // PPC 1, section 3.3.11, Fixed-Point Logical Instructions - void andi( Register a, Register s, int ui16); // optimized version + void andi( Register a, Register s, long ui16); // optimized version inline void andi_( Register a, Register s, int ui16); inline void andis_( Register a, Register s, int ui16); inline void ori( Register a, Register s, int ui16); @@ -1688,14 +1736,21 @@ inline void isync(); inline void elemental_membar(int e); // Elemental Memory Barriers (>=Power 8) + // Wait instructions for polling. Attention: May result in SIGILL. 
+ inline void wait(); + inline void waitrsv(); // >=Power7 + // atomics inline void lwarx_unchecked(Register d, Register a, Register b, int eh1 = 0); inline void ldarx_unchecked(Register d, Register a, Register b, int eh1 = 0); + inline void lqarx_unchecked(Register d, Register a, Register b, int eh1 = 0); inline bool lxarx_hint_exclusive_access(); inline void lwarx( Register d, Register a, Register b, bool hint_exclusive_access = false); inline void ldarx( Register d, Register a, Register b, bool hint_exclusive_access = false); + inline void lqarx( Register d, Register a, Register b, bool hint_exclusive_access = false); inline void stwcx_( Register s, Register a, Register b); inline void stdcx_( Register s, Register a, Register b); + inline void stqcx_( Register s, Register a, Register b); // Instructions for adjusting thread priority for simultaneous // multithreading (SMT) on Power5. @@ -2054,10 +2109,13 @@ // Atomics: use ra0mem to disallow R0 as base. inline void lwarx_unchecked(Register d, Register b, int eh1); inline void ldarx_unchecked(Register d, Register b, int eh1); + inline void lqarx_unchecked(Register d, Register b, int eh1); inline void lwarx( Register d, Register b, bool hint_exclusive_access); inline void ldarx( Register d, Register b, bool hint_exclusive_access); + inline void lqarx( Register d, Register b, bool hint_exclusive_access); inline void stwcx_(Register s, Register b); inline void stdcx_(Register s, Register b); + inline void stqcx_(Register s, Register b); inline void lfs( FloatRegister d, int si16); inline void lfsx( FloatRegister d, Register b); inline void lfd( FloatRegister d, int si16); @@ -2120,6 +2178,20 @@ return load_const_optimized(d, (long)(unsigned long)a, tmp, return_simm16_rest); } + // If return_simm16_rest, the return value needs to get added afterwards. 
+ int add_const_optimized(Register d, Register s, long x, Register tmp = R0, bool return_simm16_rest = false); + inline int add_const_optimized(Register d, Register s, void* a, Register tmp = R0, bool return_simm16_rest = false) { + return add_const_optimized(d, s, (long)(unsigned long)a, tmp, return_simm16_rest); + } + + // If return_simm16_rest, the return value needs to get added afterwards. + inline int sub_const_optimized(Register d, Register s, long x, Register tmp = R0, bool return_simm16_rest = false) { + return add_const_optimized(d, s, -x, tmp, return_simm16_rest); + } + inline int sub_const_optimized(Register d, Register s, void* a, Register tmp = R0, bool return_simm16_rest = false) { + return sub_const_optimized(d, s, (long)(unsigned long)a, tmp, return_simm16_rest); + } + // Creation Assembler(CodeBuffer* code) : AbstractAssembler(code) { #ifdef CHECK_DELAY diff --git a/src/cpu/ppc/vm/assembler_ppc.inline.hpp b/src/cpu/ppc/vm/assembler_ppc.inline.hpp --- a/src/cpu/ppc/vm/assembler_ppc.inline.hpp +++ b/src/cpu/ppc/vm/assembler_ppc.inline.hpp @@ -100,6 +100,14 @@ inline void Assembler::adde_( Register d, Register a, Register b) { emit_int32(ADDE_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); } inline void Assembler::subfe( Register d, Register a, Register b) { emit_int32(SUBFE_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(0)); } inline void Assembler::subfe_( Register d, Register a, Register b) { emit_int32(SUBFE_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); } +inline void Assembler::addme( Register d, Register a) { emit_int32(ADDME_OPCODE | rt(d) | ra(a) | oe(0) | rc(0)); } +inline void Assembler::addme_( Register d, Register a) { emit_int32(ADDME_OPCODE | rt(d) | ra(a) | oe(0) | rc(1)); } +inline void Assembler::subfme( Register d, Register a) { emit_int32(SUBFME_OPCODE | rt(d) | ra(a) | oe(0) | rc(0)); } +inline void Assembler::subfme_(Register d, Register a) { emit_int32(SUBFME_OPCODE | rt(d) | ra(a) | oe(0) | rc(1)); } +inline void 
Assembler::addze( Register d, Register a) { emit_int32(ADDZE_OPCODE | rt(d) | ra(a) | oe(0) | rc(0)); } +inline void Assembler::addze_( Register d, Register a) { emit_int32(ADDZE_OPCODE | rt(d) | ra(a) | oe(0) | rc(1)); } +inline void Assembler::subfze( Register d, Register a) { emit_int32(SUBFZE_OPCODE | rt(d) | ra(a) | oe(0) | rc(0)); } +inline void Assembler::subfze_(Register d, Register a) { emit_int32(SUBFZE_OPCODE | rt(d) | ra(a) | oe(0) | rc(1)); } inline void Assembler::neg( Register d, Register a) { emit_int32(NEG_OPCODE | rt(d) | ra(a) | oe(0) | rc(0)); } inline void Assembler::neg_( Register d, Register a) { emit_int32(NEG_OPCODE | rt(d) | ra(a) | oe(0) | rc(1)); } inline void Assembler::mulli( Register d, Register a, int si16) { emit_int32(MULLI_OPCODE | rt(d) | ra(a) | simm(si16, 16)); } @@ -118,6 +126,38 @@ inline void Assembler::divw( Register d, Register a, Register b) { emit_int32(DIVW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(0)); } inline void Assembler::divw_( Register d, Register a, Register b) { emit_int32(DIVW_OPCODE | rt(d) | ra(a) | rb(b) | oe(0) | rc(1)); } +// Fixed-Point Arithmetic Instructions with Overflow detection +inline void Assembler::addo( Register d, Register a, Register b) { emit_int32(ADD_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::addo_( Register d, Register a, Register b) { emit_int32(ADD_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } +inline void Assembler::subfo( Register d, Register a, Register b) { emit_int32(SUBF_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::subfo_( Register d, Register a, Register b) { emit_int32(SUBF_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } +inline void Assembler::addco( Register d, Register a, Register b) { emit_int32(ADDC_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::addco_( Register d, Register a, Register b) { emit_int32(ADDC_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } +inline void 
Assembler::subfco( Register d, Register a, Register b) { emit_int32(SUBFC_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::subfco_( Register d, Register a, Register b) { emit_int32(SUBFC_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } +inline void Assembler::addeo( Register d, Register a, Register b) { emit_int32(ADDE_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::addeo_( Register d, Register a, Register b) { emit_int32(ADDE_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } +inline void Assembler::subfeo( Register d, Register a, Register b) { emit_int32(SUBFE_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::subfeo_( Register d, Register a, Register b) { emit_int32(SUBFE_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } +inline void Assembler::addmeo( Register d, Register a) { emit_int32(ADDME_OPCODE | rt(d) | ra(a) | oe(1) | rc(0)); } +inline void Assembler::addmeo_( Register d, Register a) { emit_int32(ADDME_OPCODE | rt(d) | ra(a) | oe(1) | rc(1)); } +inline void Assembler::subfmeo( Register d, Register a) { emit_int32(SUBFME_OPCODE | rt(d) | ra(a) | oe(1) | rc(0)); } +inline void Assembler::subfmeo_(Register d, Register a) { emit_int32(SUBFME_OPCODE | rt(d) | ra(a) | oe(1) | rc(1)); } +inline void Assembler::addzeo( Register d, Register a) { emit_int32(ADDZE_OPCODE | rt(d) | ra(a) | oe(1) | rc(0)); } +inline void Assembler::addzeo_( Register d, Register a) { emit_int32(ADDZE_OPCODE | rt(d) | ra(a) | oe(1) | rc(1)); } +inline void Assembler::subfzeo( Register d, Register a) { emit_int32(SUBFZE_OPCODE | rt(d) | ra(a) | oe(1) | rc(0)); } +inline void Assembler::subfzeo_(Register d, Register a) { emit_int32(SUBFZE_OPCODE | rt(d) | ra(a) | oe(1) | rc(1)); } +inline void Assembler::nego( Register d, Register a) { emit_int32(NEG_OPCODE | rt(d) | ra(a) | oe(1) | rc(0)); } +inline void Assembler::nego_( Register d, Register a) { emit_int32(NEG_OPCODE | rt(d) | ra(a) | oe(1) | rc(1)); } 
+inline void Assembler::mulldo( Register d, Register a, Register b) { emit_int32(MULLD_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::mulldo_( Register d, Register a, Register b) { emit_int32(MULLD_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } +inline void Assembler::mullwo( Register d, Register a, Register b) { emit_int32(MULLW_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::mullwo_( Register d, Register a, Register b) { emit_int32(MULLW_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } +inline void Assembler::divdo( Register d, Register a, Register b) { emit_int32(DIVD_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::divdo_( Register d, Register a, Register b) { emit_int32(DIVD_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } +inline void Assembler::divwo( Register d, Register a, Register b) { emit_int32(DIVW_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(0)); } +inline void Assembler::divwo_( Register d, Register a, Register b) { emit_int32(DIVW_OPCODE | rt(d) | ra(a) | rb(b) | oe(1) | rc(1)); } + // extended mnemonics inline void Assembler::li( Register d, int si16) { Assembler::addi_r0ok( d, R0, si16); } inline void Assembler::lis( Register d, int si16) { Assembler::addis_r0ok(d, R0, si16); } @@ -540,15 +580,22 @@ inline void Assembler::isync() { emit_int32( ISYNC_OPCODE); } inline void Assembler::elemental_membar(int e) { assert(0 < e && e < 16, "invalid encoding"); emit_int32( SYNC_OPCODE | e1215(e)); } +// Wait instructions for polling. +inline void Assembler::wait() { emit_int32( WAIT_OPCODE); } +inline void Assembler::waitrsv() { emit_int32( WAIT_OPCODE | 1<<(31-10)); } // WC=0b01 >=Power7 + // atomics // Use ra0mem to disallow R0 as base. 
inline void Assembler::lwarx_unchecked(Register d, Register a, Register b, int eh1) { emit_int32( LWARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); } inline void Assembler::ldarx_unchecked(Register d, Register a, Register b, int eh1) { emit_int32( LDARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); } +inline void Assembler::lqarx_unchecked(Register d, Register a, Register b, int eh1) { emit_int32( LQARX_OPCODE | rt(d) | ra0mem(a) | rb(b) | eh(eh1)); } inline bool Assembler::lxarx_hint_exclusive_access() { return VM_Version::has_lxarxeh(); } inline void Assembler::lwarx( Register d, Register a, Register b, bool hint_exclusive_access) { lwarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } inline void Assembler::ldarx( Register d, Register a, Register b, bool hint_exclusive_access) { ldarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } +inline void Assembler::lqarx( Register d, Register a, Register b, bool hint_exclusive_access) { lqarx_unchecked(d, a, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } inline void Assembler::stwcx_(Register s, Register a, Register b) { emit_int32( STWCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); } inline void Assembler::stdcx_(Register s, Register a, Register b) { emit_int32( STDCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); } +inline void Assembler::stqcx_(Register s, Register a, Register b) { emit_int32( STQCX_OPCODE | rs(s) | ra0mem(a) | rb(b) | rc(1)); } // Instructions for adjusting thread priority // for simultaneous multithreading (SMT) on POWER5. 
@@ -873,10 +920,13 @@ // ra0 version inline void Assembler::lwarx_unchecked(Register d, Register b, int eh1) { emit_int32( LWARX_OPCODE | rt(d) | rb(b) | eh(eh1)); } inline void Assembler::ldarx_unchecked(Register d, Register b, int eh1) { emit_int32( LDARX_OPCODE | rt(d) | rb(b) | eh(eh1)); } +inline void Assembler::lqarx_unchecked(Register d, Register b, int eh1) { emit_int32( LQARX_OPCODE | rt(d) | rb(b) | eh(eh1)); } inline void Assembler::lwarx( Register d, Register b, bool hint_exclusive_access){ lwarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } inline void Assembler::ldarx( Register d, Register b, bool hint_exclusive_access){ ldarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } +inline void Assembler::lqarx( Register d, Register b, bool hint_exclusive_access){ lqarx_unchecked(d, b, (hint_exclusive_access && lxarx_hint_exclusive_access() && UseExtendedLoadAndReserveInstructionsPPC64) ? 1 : 0); } inline void Assembler::stwcx_(Register s, Register b) { emit_int32( STWCX_OPCODE | rs(s) | rb(b) | rc(1)); } inline void Assembler::stdcx_(Register s, Register b) { emit_int32( STDCX_OPCODE | rs(s) | rb(b) | rc(1)); } +inline void Assembler::stqcx_(Register s, Register b) { emit_int32( STQCX_OPCODE | rs(s) | rb(b) | rc(1)); } // ra0 version inline void Assembler::lfs( FloatRegister d, int si16) { emit_int32( LFS_OPCODE | frt(d) | simm(si16,16)); } diff --git a/src/cpu/ppc/vm/c2_globals_ppc.hpp b/src/cpu/ppc/vm/c2_globals_ppc.hpp --- a/src/cpu/ppc/vm/c2_globals_ppc.hpp +++ b/src/cpu/ppc/vm/c2_globals_ppc.hpp @@ -1,6 +1,6 @@ /* * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -47,7 +47,7 @@ define_pd_global(intx, FLOATPRESSURE, 28); define_pd_global(intx, FreqInlineSize, 175); define_pd_global(intx, MinJumpTableSize, 10); -define_pd_global(intx, INTPRESSURE, 25); +define_pd_global(intx, INTPRESSURE, 26); define_pd_global(intx, InteriorEntryAlignment, 16); define_pd_global(size_t, NewSizeThreadIncrease, ScaleForWordSize(4*K)); define_pd_global(intx, RegisterCostAreaRatio, 16000); diff --git a/src/cpu/ppc/vm/globals_ppc.hpp b/src/cpu/ppc/vm/globals_ppc.hpp --- a/src/cpu/ppc/vm/globals_ppc.hpp +++ b/src/cpu/ppc/vm/globals_ppc.hpp @@ -1,6 +1,6 @@ /* * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2013 SAP AG. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -58,7 +58,7 @@ // GC Ergo Flags define_pd_global(size_t, CMSYoungGenPerWorker, 16*M); // Default max size of CMS young gen, per GC worker thread. -define_pd_global(uintx, TypeProfileLevel, 0); +define_pd_global(uintx, TypeProfileLevel, 111); // Platform dependent flag handling: flags only defined on this platform. #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \ @@ -71,14 +71,26 @@ \ product(uintx, PowerArchitecturePPC64, 0, \ "CPU Version: x for PowerX. Currently recognizes Power5 to " \ - "Power7. Default is 0. CPUs newer than Power7 will be " \ - "recognized as Power7.") \ + "Power8. Default is 0. Newer CPUs will be recognized as Power8.") \ \ /* Reoptimize code-sequences of calls at runtime, e.g. replace an */ \ /* indirect call by a direct call. */ \ product(bool, ReoptimizeCallSequences, true, \ "Reoptimize code-sequences of calls at runtime.") \ \ + /* Power 8: Configure Data Stream Control Register. 
*/ \ + product(uint64_t,DSCR_PPC64, (uintx)-1, \ + "Power8 or later: Specify encoded value for Data Stream Control " \ + "Register") \ + product(uint64_t,DSCR_DPFD_PPC64, 8, \ + "Power8 or later: DPFD (default prefetch depth) value of the " \ + "Data Stream Control Register." \ + " 0: hardware default, 1: none, 2-7: min-max, 8: don't touch") \ + product(uint64_t,DSCR_URG_PPC64, 8, \ + "Power8 or later: URG (depth attainment urgency) value of the " \ + "Data Stream Control Register." \ + " 0: hardware default, 1: none, 2-7: min-max, 8: don't touch") \ + \ product(bool, UseLoadInstructionsForStackBangingPPC64, false, \ "Use load instructions for stack banging.") \ \ @@ -121,6 +133,41 @@ \ product(bool, ZapMemory, false, "Write 0x0101... to empty memory." \ " Use this to ease debugging.") \ - + \ + /* Use Restricted Transactional Memory for lock eliding */ \ + product(bool, UseRTMLocking, false, \ + "Enable RTM lock eliding for inflated locks in compiled code") \ + \ + experimental(bool, UseRTMForStackLocks, false, \ + "Enable RTM lock eliding for stack locks in compiled code") \ + \ + product(bool, UseRTMDeopt, false, \ + "Perform deopt and recompilation based on RTM abort ratio") \ + \ + product(uintx, RTMRetryCount, 5, \ + "Number of RTM retries on lock abort or busy") \ + \ + experimental(intx, RTMSpinLoopCount, 100, \ + "Spin count for lock to become free before RTM retry") \ + \ + experimental(intx, RTMAbortThreshold, 1000, \ + "Calculate abort ratio after this number of aborts") \ + \ + experimental(intx, RTMLockingThreshold, 10000, \ + "Lock count at which to do RTM lock eliding without " \ + "abort ratio calculation") \ + \ + experimental(intx, RTMAbortRatio, 50, \ + "Lock abort ratio at which to stop use RTM lock eliding") \ + \ + experimental(intx, RTMTotalCountIncrRate, 64, \ + "Increment total RTM attempted lock count once every n times") \ + \ + experimental(intx, RTMLockingCalculationDelay, 0, \ + "Number of milliseconds to wait before start calculating 
aborts " \ + "for RTM locking") \ + \ + experimental(bool, UseRTMXendForLockBusy, true, \ + "Use RTM Xend instead of Xabort when lock busy") \ #endif // CPU_PPC_VM_GLOBALS_PPC_HPP diff --git a/src/cpu/ppc/vm/interp_masm_ppc_64.cpp b/src/cpu/ppc/vm/interp_masm_ppc_64.cpp --- a/src/cpu/ppc/vm/interp_masm_ppc_64.cpp +++ b/src/cpu/ppc/vm/interp_masm_ppc_64.cpp @@ -446,7 +446,7 @@ } // Load object from cpool->resolved_references(index). -void InterpreterMacroAssembler::load_resolved_reference_at_index(Register result, Register index) { +void InterpreterMacroAssembler::load_resolved_reference_at_index(Register result, Register index, Label *is_null) { assert_different_registers(result, index); get_constant_pool(result); @@ -469,7 +469,7 @@ #endif // Add in the index. add(result, tmp, result); - load_heap_oop(result, arrayOopDesc::base_offset_in_bytes(T_OBJECT), result); + load_heap_oop(result, arrayOopDesc::base_offset_in_bytes(T_OBJECT), result, is_null); } // Generate a subtype check: branch to ok_is_subtype if sub_klass is @@ -876,7 +876,6 @@ // If condition is true we are done and hence we can store 0 in the displaced // header indicating it is a recursive lock. bne(CCR0, slow_case); - release(); std(R0/*==0!*/, BasicObjectLock::lock_offset_in_bytes() + BasicLock::displaced_header_offset_in_bytes(), monitor); b(done); @@ -1861,7 +1860,7 @@ const Register mdp = tmp1; add(mdp, tmp1, R28_mdx); - // Pffset of the current profile entry to update. + // Offset of the current profile entry to update. const Register entry_offset = tmp2; // entry_offset = array len in number of cells ld(entry_offset, in_bytes(ArrayData::array_len_offset()), mdp); diff --git a/src/cpu/ppc/vm/interp_masm_ppc_64.hpp b/src/cpu/ppc/vm/interp_masm_ppc_64.hpp --- a/src/cpu/ppc/vm/interp_masm_ppc_64.hpp +++ b/src/cpu/ppc/vm/interp_masm_ppc_64.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. 
+ * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -85,7 +85,7 @@ Register tmp1, Register tmp2, Register tmp3, Label &ok_is_subtype); // Load object from cpool->resolved_references(index). - void load_resolved_reference_at_index(Register result, Register index); + void load_resolved_reference_at_index(Register result, Register index, Label *is_null = NULL); void generate_stack_overflow_check_with_compare_and_throw(Register Rmem_frame_size, Register Rscratch1); void load_receiver(Register Rparam_count, Register Rrecv_dst); diff --git a/src/cpu/ppc/vm/interpreter_ppc.hpp b/src/cpu/ppc/vm/interpreter_ppc.hpp --- a/src/cpu/ppc/vm/interpreter_ppc.hpp +++ b/src/cpu/ppc/vm/interpreter_ppc.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -47,4 +47,4 @@ } #endif -#endif // CPU_PPC_VM_INTERPRETER_PPC_PP +#endif // CPU_PPC_VM_INTERPRETER_PPC_HPP diff --git a/src/cpu/ppc/vm/macroAssembler_ppc.cpp b/src/cpu/ppc/vm/macroAssembler_ppc.cpp --- a/src/cpu/ppc/vm/macroAssembler_ppc.cpp +++ b/src/cpu/ppc/vm/macroAssembler_ppc.cpp @@ -1,6 +1,6 @@ /* * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -1455,7 +1455,7 @@ // Several special cases exist to avoid that unnecessary information is generated. // void MacroAssembler::cmpxchgd(ConditionRegister flag, - Register dest_current_value, Register compare_value, Register exchange_value, + Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, Register addr_base, int semantics, bool cmpxchgx_hint, Register int_flag_success, Label* failed_ext, bool contention_hint) { Label retry; @@ -1465,7 +1465,7 @@ // Save one branch if result is returned via register and result register is different from the other ones. bool use_result_reg = (int_flag_success!=noreg); - bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value && + bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && int_flag_success!=exchange_value && int_flag_success!=addr_base); assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both"); @@ -1481,7 +1481,7 @@ // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). if (contention_hint) { // Don't try to reserve if cmp fails. ld(dest_current_value, 0, addr_base); - cmpd(flag, dest_current_value, compare_value); + cmpd(flag, compare_value, dest_current_value); bne(flag, failed); } @@ -1489,7 +1489,7 @@ bind(retry); ldarx(dest_current_value, addr_base, cmpxchgx_hint); - cmpd(flag, dest_current_value, compare_value); + cmpd(flag, compare_value, dest_current_value); if (UseStaticBranchPredictionInCompareAndSwapPPC64) { bne_predict_not_taken(flag, failed); } else { @@ -1873,7 +1873,6 @@ assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg). - fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ? 
   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
            /*where=*/obj_reg,
@@ -1909,7 +1908,6 @@
   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 
   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
-  fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ?
   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
            /*where=*/obj_reg,
@@ -1946,7 +1944,6 @@
   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
 
   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
-  fence(); // TODO: replace by MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq ?
   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
            /*where=*/obj_reg,
@@ -1987,9 +1984,371 @@
   beq(cr_reg, done);
 }
 
+// TM on PPC64.
+void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) { // Atomically adds simm16 to the doubleword at addr; result = new value.
+  Label retry; // Loop head: the sequence is repeated until stdcx_ succeeds.
+  bind(retry);
+  ldarx(result, addr, /*hint*/ false);
+  addi(result, result, simm16);
+  stdcx_(result, addr);
+  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
+    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
+  } else {
+    bne(                  CCR0, retry); // stXcx_ sets CCR0
+  }
+}
+
+void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) { // Atomically ORs uimm16 into the word at addr; result = new value.
+  Label retry; // Loop head: the sequence is repeated until stwcx_ succeeds.
+  bind(retry);
+  lwarx(result, addr, /*hint*/ false);
+  ori(result, result, uimm16);
+  stwcx_(result, addr);
+  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
+    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
+  } else {
+    bne(                  CCR0, retry); // stXcx_ sets CCR0
+  }
+}
+
+#if INCLUDE_RTM_OPT
+
+// Update rtm_counters based on abort status: always bumps abort_count; with PrintPreciseRTMLockingStatistics also one abortX counter per matching failure bit.
+// input: abort_status (TEXASR value, the mftexasr is done by the caller; only read here)
+//        rtm_counters_Reg (RTMLockingCounters*) - used as temp, restored on exit. R0 is killed (CCR0 too with precise statistics).
+void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
+  // Mapping to keep PreciseRTMLockingStatistics similar to x86.
+  // x86 ppc (! means inverted, ? means not the same)
+  //  0   31  Set if abort caused by XABORT instruction.
+  //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
+  //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
+  //  3   10  Set if an internal buffer overflowed.
+  //  4  ?12  Set if a debug breakpoint was hit.
+  //  5  ?32  Set if an abort occurred during execution of a nested transaction.
+  const int tm_failure_bit[] = {Assembler::tm_tabort, // Note: Seems like signal handler sets this, too.
+                                Assembler::tm_failure_persistent, // inverted: transient
+                                Assembler::tm_trans_cf,
+                                Assembler::tm_footprint_of,
+                                Assembler::tm_non_trans_cf,
+                                Assembler::tm_suspended};
+  const bool tm_failure_inv[] = {false, true, false, false, false, false}; // true: count when the bit is clear
+  assert(sizeof(tm_failure_bit)/sizeof(int) == RTMLockingCounters::ABORT_STATUS_LIMIT, "adapt mapping!");
+
+  const Register addr_Reg = R0;
+  // Keep track of offset to where rtm_counters_Reg had pointed to.
+  int counters_offs = RTMLockingCounters::abort_count_offset();
+  addi(addr_Reg, rtm_counters_Reg, counters_offs);
+  const Register temp_Reg = rtm_counters_Reg; // the counters pointer is clobbered from here until the restore at the end
+
+  //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
+  ldx(temp_Reg, addr_Reg);
+  addi(temp_Reg, temp_Reg, 1);
+  stdx(temp_Reg, addr_Reg);
+
+  if (PrintPreciseRTMLockingStatistics) {
+    int counters_offs_delta = RTMLockingCounters::abortX_count_offset() - counters_offs; // first step: abort_count -> first abortX counter
+
+    //mftexasr(abort_status); done by caller
+    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
+      counters_offs += counters_offs_delta;
+      li(temp_Reg, counters_offs_delta); // can't use addi with R0
+      add(addr_Reg, addr_Reg, temp_Reg); // point to next counter
+      counters_offs_delta = sizeof(uintx); // every later step advances by one uintx
+
+      Label check_abort;
+      rldicr_(temp_Reg, abort_status, tm_failure_bit[i], 0); // rotate failure bit i into bit 0 and isolate it; sets CCR0
+      if (tm_failure_inv[i]) {
+        bne(CCR0, check_abort); // inverted mapping: skip the increment if the bit is set
+      } else {
+        beq(CCR0, check_abort); // skip the increment if the bit is clear
+      }
+      //atomic_inc_ptr(addr_Reg, temp_Reg); We don't increment atomically
+      ldx(temp_Reg, addr_Reg);
+      addi(temp_Reg, temp_Reg, 1);
+      stdx(temp_Reg, addr_Reg);
+      bind(check_abort);
+    }
+  }
+  li(temp_Reg, -counters_offs); // can't use addi with R0
+  add(rtm_counters_Reg, addr_Reg, temp_Reg); // restore
+}
+
+// Branch if (random & (count-1) != 0), count is 2^n
+// tmp and CR0 are killed
+void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
+  mftb(tmp); // the time base value serves as the random number
+  andi_(tmp, tmp, count-1);
+  bne(CCR0, brLabel);
+}
+
+// Perform abort ratio calculation, set no_rtm bit if high ratio.
+// input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED. R0 and CCR0 are killed, too.
+void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
+                                                 RTMLockingCounters* rtm_counters,
+                                                 Metadata* method_data) {
+  Label L_done, L_check_always_rtm1, L_check_always_rtm2;
+
+  if (RTMLockingCalculationDelay > 0) {
+    // Delay calculation.
+    ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr()); // flag value overwrites the counters pointer
+    cmpdi(CCR0, rtm_counters_Reg, 0);
+    beq(CCR0, L_done); // flag is still 0: skip the whole calculation
+    load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
+  }
+  // Abort ratio calculation only if abort_count >= RTMAbortThreshold.
+  //   Aborted transactions = abort_count * 100
+  //   All transactions = total_count *  RTMTotalCountIncrRate
+  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
+  ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
+  cmpdi(CCR0, R0, RTMAbortThreshold);
+  blt(CCR0, L_check_always_rtm2); // below threshold; counters pointer is still intact, no reload needed
+  mulli(R0, R0, 100);
+
+  const Register tmpReg = rtm_counters_Reg; // aliases the counters pointer, which is clobbered by the next load
+  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
+  mulli(tmpReg, tmpReg, RTMTotalCountIncrRate);
+  mulli(tmpReg, tmpReg, RTMAbortRatio);
+  cmpd(CCR0, R0, tmpReg);
+  blt(CCR0, L_check_always_rtm1); // jump to reload
+  if (method_data != NULL) {
+    // Set rtm_state to "no rtm" in MDO.
+    // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
+    // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
+    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
+    atomic_ori_int(R0, tmpReg, NoRTM);
+  }
+  b(L_done); // ratio too high: the "always rtm" check below is skipped
+
+  bind(L_check_always_rtm1);
+  load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
+  bind(L_check_always_rtm2);
+  ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
+  cmpdi(CCR0, tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
+  blt(CCR0, L_done); // not enough transactions counted yet
+  if (method_data != NULL) {
+    // Set rtm_state to "always rtm" in MDO.
+    // Not using a metadata relocation. See above.
+    load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
+    atomic_ori_int(R0, tmpReg, UseRTM);
+  }
+  bind(L_done);
+}
+
+// Update counters and perform abort ratio calculation.
+// input: abort_status_Reg (only read), temp_Reg (overwritten with the rtm_counters address and used as scratch). R0 is killed.
+void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
+                                   RTMLockingCounters* rtm_counters,
+                                   Metadata* method_data,
+                                   bool profile_rtm) {
+
+  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
+  // Update rtm counters based on state at abort.
+  // Reads abort_status_Reg, updates flags.
+  assert_different_registers(abort_status_Reg, temp_Reg);
+  load_const_optimized(temp_Reg, (address)rtm_counters, R0);
+  rtm_counters_update(abort_status_Reg, temp_Reg);
+  if (profile_rtm) {
+    assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); // NOTE(review): repeats the assert at function entry
+    rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
+  }
+}
+
+// Retry on abort if abort's status indicates non-persistent failure.
+// inputs: retry_count_Reg
+//       : abort_status_Reg
+// output: retry_count_Reg decremented by 1 (unless the failure is persistent). R0 and CCR0 are killed.
+void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
+                                             Label& retryLabel, Label* checkRetry) {
+  Label doneRetry;
+  rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0); // isolate the tm_failure_persistent bit; sets CCR0
+  bne(CCR0, doneRetry); // persistent failure: don't retry
+  if (checkRetry) { bind(*checkRetry); } // optional entry point that bypasses the persistence check
+  addic_(retry_count_Reg, retry_count_Reg, -1);
+  blt(CCR0, doneRetry); // count went negative: retries are used up
+  smt_yield(); // Can't use wait(). No permission (SIGILL).
+  b(retryLabel);
+  bind(doneRetry);
+}
+
+// Spin and retry if lock is busy.
+// inputs: owner_addr_Reg (address the owner word is loaded from)
+//       : retry_count_Reg
+// output: retry_count_Reg decremented by 1
+// CTR, R0 and CCR0 are killed
+void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
+  Label SpinLoop, doneRetry;
+  addic_(retry_count_Reg, retry_count_Reg, -1);
+  blt(CCR0, doneRetry); // count went negative: retries are used up
+  li(R0, RTMSpinLoopCount); // spin budget, moved to CTR below
+  mtctr(R0);
+
+  bind(SpinLoop);
+  smt_yield(); // Can't use waitrsv(). No permission (SIGILL).
+  bdz(retryLabel); // decrement CTR; spin budget used up: retry right away
+  ld(R0, 0, owner_addr_Reg);
+  cmpdi(CCR0, R0, 0);
+  bne(CCR0, SpinLoop); // owner is still non-null: keep spinning
+  b(retryLabel); // owner is null: retry
+
+  bind(doneRetry);
+}
+
+// Use RTM for normal stack locks.
+// Input: obj (object to lock), mark_word (its mark word, already loaded). Clobbers mark_word, tmp, R0, CCR0, flag; retry_on_abort_count_Reg is used if RTMRetryCount > 0.
+void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
+                                       Register obj, Register mark_word, Register tmp,
+                                       Register retry_on_abort_count_Reg,
+                                       RTMLockingCounters* stack_rtm_counters,
+                                       Metadata* method_data, bool profile_rtm,
+                                       Label& DONE_LABEL, Label& IsInflated) {
+  assert(UseRTMForStackLocks, "why call this otherwise?");
+  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
+  Label L_rtm_retry, L_decrement_retry, L_on_abort;
+
+  if (RTMRetryCount > 0) {
+    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
+    bind(L_rtm_retry);
+  }
+  andi_(R0, mark_word, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
+  bne(CCR0, IsInflated); // monitor bit set: handled by the inflated path
+
+  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
+    Label L_noincrement;
+    if (RTMTotalCountIncrRate > 1) {
+      branch_on_random_using_tb(tmp, (int)RTMTotalCountIncrRate, L_noincrement); // count only a random sample of the attempts
+    }
+    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
+    load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
+    //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
+    ldx(mark_word, tmp);
+    addi(mark_word, mark_word, 1);
+    stdx(mark_word, tmp);
+    bind(L_noincrement);
+  }
+  tbegin_();
+  beq(CCR0, L_on_abort); // taken when the transaction has aborted
+  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);         // Reload in transaction, conflicts need to be tracked.
+  andi(R0, mark_word, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
+  cmpwi(flag, R0, markOopDesc::unlocked_value);                // bits = 001 unlocked
+  beq(flag, DONE_LABEL);                                       // all done if unlocked
+
+  if (UseRTMXendForLockBusy) {
+    tend_(); // lock is busy: end the transaction regularly ...
+    b(L_decrement_retry); // ... and go straight to the retry count check
+  } else {
+    tabort_(); // lock is busy: abort the transaction
+  }
+  bind(L_on_abort);
+  const Register abort_status_Reg = tmp;
+  mftexasr(abort_status_Reg);
+  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
+    rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
+  }
+  ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
+  if (RTMRetryCount > 0) {
+    // Retry on lock abort if abort status is not permanent.
+    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
+  } else {
+    bind(L_decrement_retry); // no retries configured: continue with the code emitted after this sequence
+  }
+}
+
+// Use RTM for inflating locks: in a transaction, check whether the owner word is null; otherwise retry and finally try a CAS on the owner word.
+// inputs: obj       (object to lock)
+//         mark_word (current header - KILLED)
+//         boxReg    (on-stack box address (displaced header location) - KILLED)
+void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
+                                          Register obj, Register mark_word, Register boxReg,
+                                          Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
+                                          RTMLockingCounters* rtm_counters,
+                                          Metadata* method_data, bool profile_rtm,
+                                          Label& DONE_LABEL) {
+  assert(UseRTMLocking, "why call this otherwise?");
+  Label L_rtm_retry, L_decrement_retry, L_on_abort;
+  // Clean monitor_value bit to get valid pointer.
+  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
+
+  // Store non-null, using boxReg instead of (intptr_t)markOopDesc::unused_mark().
+  std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
+  const Register tmpReg = boxReg; // boxReg is reused as scratch from here on
+  const Register owner_addr_Reg = mark_word; // mark_word is overwritten with the address of the owner word
+  addi(owner_addr_Reg, mark_word, owner_offset);
+
+  if (RTMRetryCount > 0) {
+    load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
+    load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
+    bind(L_rtm_retry);
+  }
+  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
+    Label L_noincrement;
+    if (RTMTotalCountIncrRate > 1) {
+      branch_on_random_using_tb(R0, (int)RTMTotalCountIncrRate, L_noincrement); // count only a random sample of the attempts
+    }
+    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
+    load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
+    //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
+    ldx(tmpReg, R0);
+    addi(tmpReg, tmpReg, 1);
+    stdx(tmpReg, R0);
+    bind(L_noincrement);
+  }
+  tbegin_();
+  beq(CCR0, L_on_abort); // taken when the transaction has aborted
+  // We don't reload mark word. Will only be reset at safepoint.
+  ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
+  cmpdi(flag, R0, 0);
+  beq(flag, DONE_LABEL); // owner is null: done, still inside the transaction
+
+  if (UseRTMXendForLockBusy) {
+    tend_(); // lock is busy: end the transaction regularly ...
+    b(L_decrement_retry); // ... and go to the spin-and-retry code
+  } else {
+    tabort_(); // lock is busy: abort the transaction
+  }
+  bind(L_on_abort);
+  const Register abort_status_Reg = tmpReg;
+  mftexasr(abort_status_Reg);
+  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
+    rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
+    // Restore owner_addr_Reg (it was handed to rtm_profiling as temp)
+    ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
+#ifdef ASSERT
+    andi_(R0, mark_word, markOopDesc::monitor_value);
+    asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
+#endif
+    addi(owner_addr_Reg, mark_word, owner_offset);
+  }
+  if (RTMRetryCount > 0) {
+    // Retry on lock abort if abort status is not permanent.
+    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
+  }
+
+  // Appears unlocked - try to swing _owner from null to non-null.
+  cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
+           MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
+           MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true); // a failed exchange branches to L_decrement_retry
+
+  if (RTMRetryCount > 0) {
+    // success done else retry
+    b(DONE_LABEL);
+    bind(L_decrement_retry);
+    // Spin and retry if lock is busy.
+    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
+  } else {
+    bind(L_decrement_retry); // no retries configured: success and failure both continue here
+  }
+}
+
+#endif //  INCLUDE_RTM_OPT
+
 // "The box" is the space on the stack where we copy the object mark.
 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
-                                               Register temp, Register displaced_header, Register current_header) {
+                                               Register temp, Register displaced_header, Register current_header,
+                                               bool try_bias,
+                                               RTMLockingCounters* rtm_counters,
+                                               RTMLockingCounters* stack_rtm_counters,
+                                               Metadata* method_data,
+                                               bool use_rtm, bool profile_rtm) {
   assert_different_registers(oop, box, temp, displaced_header, current_header);
   assert(flag != CCR0, "bad condition register");
   Label cont;
@@ -2006,10 +2365,18 @@
     return;
   }
 
-  if (UseBiasedLocking) {
+  if (try_bias) {
     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
   }
 
+#if INCLUDE_RTM_OPT
+  if (UseRTMForStackLocks && use_rtm) {
+    rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
+                      stack_rtm_counters, method_data, profile_rtm,
+                      cont, object_has_monitor);
+  }
+#endif // INCLUDE_RTM_OPT
+
   // Handle existing monitor.
   if ((EmitSync & 0x02) == 0) {
     // The object has an existing monitor iff (mark & monitor_value) != 0.
@@ -2066,14 +2433,22 @@
     bind(object_has_monitor);
     // The object's monitor m is unlocked iff m->owner == NULL,
     // otherwise m->owner may contain a thread or a stack address.
-    //
+
+#if INCLUDE_RTM_OPT
+    // Use the same RTM locking code in 32- and 64-bit VM.
+ if (use_rtm) { + rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header, + rtm_counters, method_data, profile_rtm, cont); + } else { +#endif // INCLUDE_RTM_OPT + // Try to CAS m->owner from NULL to current thread. addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markOopDesc::monitor_value); li(displaced_header, 0); // CmpxchgX sets flag to cmpX(current, displaced). cmpxchgd(/*flag=*/flag, /*current_value=*/current_header, - /*compare_value=*/displaced_header, + /*compare_value=*/(intptr_t)0, /*exchange_value=*/R16_thread, /*where=*/temp, MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, @@ -2095,6 +2470,10 @@ //asm_assert_mem4_isnot_zero(ObjectMonitor::OwnerIsThread_offset_in_bytes(), temp, // "monitor->OwnerIsThread shouldn't be 0", -1); # endif + +#if INCLUDE_RTM_OPT + } // use_rtm() +#endif } bind(cont); @@ -2103,7 +2482,8 @@ } void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, - Register temp, Register displaced_header, Register current_header) { + Register temp, Register displaced_header, Register current_header, + bool try_bias, bool use_rtm) { assert_different_registers(oop, box, temp, displaced_header, current_header); assert(flag != CCR0, "bad condition register"); Label cont; @@ -2115,10 +2495,24 @@ return; } - if (UseBiasedLocking) { + if (try_bias) { biased_locking_exit(flag, oop, current_header, cont); } +#if INCLUDE_RTM_OPT + if (UseRTMForStackLocks && use_rtm) { + assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); + Label L_regular_unlock; + ld(current_header, oopDesc::mark_offset_in_bytes(), oop); // fetch markword + andi(R0, current_header, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits + cmpwi(flag, R0, markOopDesc::unlocked_value); // bits = 001 unlocked + bne(flag, L_regular_unlock); // else RegularLock + tend_(); // otherwise end... + b(cont); // ... 
and we're done + bind(L_regular_unlock); + } +#endif + // Find the lock address and load the displaced header from the stack. ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); @@ -2129,13 +2523,12 @@ // Handle existing monitor. if ((EmitSync & 0x02) == 0) { // The object has an existing monitor iff (mark & monitor_value) != 0. + RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done ld(current_header, oopDesc::mark_offset_in_bytes(), oop); - andi(temp, current_header, markOopDesc::monitor_value); - cmpdi(flag, temp, 0); - bne(flag, object_has_monitor); + andi_(R0, current_header, markOopDesc::monitor_value); + bne(CCR0, object_has_monitor); } - // Check if it is still a light weight lock, this is is true if we see // the stack address of the basicLock in the markOop of the object. // Cmpxchg sets flag to cmpd(current_header, box). @@ -2158,6 +2551,20 @@ bind(object_has_monitor); addi(current_header, current_header, -markOopDesc::monitor_value); // monitor ld(temp, ObjectMonitor::owner_offset_in_bytes(), current_header); + + // It's inflated. +#if INCLUDE_RTM_OPT + if (use_rtm) { + Label L_regular_inflated_unlock; + // Clean monitor_value bit to get valid pointer + cmpdi(flag, temp, 0); + bne(flag, L_regular_inflated_unlock); + tend_(); + b(cont); + bind(L_regular_inflated_unlock); + } +#endif + ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header); xorr(temp, R16_thread, temp); // Will be 0 if we are the owner. orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions. 
@@ -2441,6 +2848,8 @@ // oop_result // R16_thread->in_bytes(JavaThread::vm_result_offset()) + verify_thread(); + ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); li(R0, 0); std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); @@ -2462,26 +2871,24 @@ std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); } - -void MacroAssembler::encode_klass_not_null(Register dst, Register src) { +Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. if (Universe::narrow_klass_base() != 0) { // Use dst as temp if it is free. - load_const(R0, Universe::narrow_klass_base(), (dst != current && dst != R0) ? dst : noreg); - sub(dst, current, R0); + sub_const_optimized(dst, current, Universe::narrow_klass_base(), R0); current = dst; } if (Universe::narrow_klass_shift() != 0) { srdi(dst, current, Universe::narrow_klass_shift()); current = dst; } - mr_if_needed(dst, current); // Move may be required. + return current; } void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { if (UseCompressedClassPointers) { - encode_klass_not_null(ck, klass); - stw(ck, oopDesc::klass_offset_in_bytes(), dst_oop); + Register compressedKlass = encode_klass_not_null(ck, klass); + stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); } else { std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); } @@ -2514,8 +2921,7 @@ sldi(shifted_src, src, Universe::narrow_klass_shift()); } if (Universe::narrow_klass_base() != 0) { - load_const(R0, Universe::narrow_klass_base()); - add(dst, shifted_src, R0); + add_const_optimized(dst, shifted_src, Universe::narrow_klass_base(), R0); } } diff --git a/src/cpu/ppc/vm/macroAssembler_ppc.hpp b/src/cpu/ppc/vm/macroAssembler_ppc.hpp --- a/src/cpu/ppc/vm/macroAssembler_ppc.hpp +++ b/src/cpu/ppc/vm/macroAssembler_ppc.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2002, 2013, Oracle and/or its affiliates. 
All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -27,6 +27,7 @@ #define CPU_PPC_VM_MACROASSEMBLER_PPC_HPP #include "asm/assembler.hpp" +#include "runtime/rtmLocking.hpp" #include "utilities/macros.hpp" // MacroAssembler extends Assembler by a few frequently used macros. @@ -432,8 +433,8 @@ int semantics, bool cmpxchgx_hint = false, Register int_flag_success = noreg, bool contention_hint = false); void cmpxchgd(ConditionRegister flag, - Register dest_current_value, Register compare_value, Register exchange_value, Register addr_base, - int semantics, bool cmpxchgx_hint = false, + Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, + Register addr_base, int semantics, bool cmpxchgx_hint = false, Register int_flag_success = noreg, Label* failed = NULL, bool contention_hint = false); // interface method calling @@ -506,8 +507,42 @@ // biased locking exit case failed. 
void biased_locking_exit(ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done); - void compiler_fast_lock_object( ConditionRegister flag, Register oop, Register box, Register tmp1, Register tmp2, Register tmp3); - void compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, Register tmp1, Register tmp2, Register tmp3); + void atomic_inc_ptr(Register addr, Register result, int simm16 = 1); + void atomic_ori_int(Register addr, Register result, int uimm16); + +#if INCLUDE_RTM_OPT + void rtm_counters_update(Register abort_status, Register rtm_counters); + void branch_on_random_using_tb(Register tmp, int count, Label& brLabel); + void rtm_abort_ratio_calculation(Register rtm_counters_reg, RTMLockingCounters* rtm_counters, + Metadata* method_data); + void rtm_profiling(Register abort_status_Reg, Register temp_Reg, + RTMLockingCounters* rtm_counters, Metadata* method_data, bool profile_rtm); + void rtm_retry_lock_on_abort(Register retry_count, Register abort_status, + Label& retryLabel, Label* checkRetry = NULL); + void rtm_retry_lock_on_busy(Register retry_count, Register owner_addr, Label& retryLabel); + void rtm_stack_locking(ConditionRegister flag, Register obj, Register mark_word, Register tmp, + Register retry_on_abort_count, + RTMLockingCounters* stack_rtm_counters, + Metadata* method_data, bool profile_rtm, + Label& DONE_LABEL, Label& IsInflated); + void rtm_inflated_locking(ConditionRegister flag, Register obj, Register mark_word, Register box, + Register retry_on_busy_count, Register retry_on_abort_count, + RTMLockingCounters* rtm_counters, + Metadata* method_data, bool profile_rtm, + Label& DONE_LABEL); +#endif + + void compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, + Register tmp1, Register tmp2, Register tmp3, + bool try_bias = UseBiasedLocking, + RTMLockingCounters* rtm_counters = NULL, + RTMLockingCounters* stack_rtm_counters = NULL, + Metadata* method_data = NULL, + bool use_rtm 
= false, bool profile_rtm = false); + + void compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, + Register tmp1, Register tmp2, Register tmp3, + bool try_bias = UseBiasedLocking, bool use_rtm = false); // Support for serializing memory accesses between threads void serialize_memory(Register thread, Register tmp1, Register tmp2); @@ -576,7 +611,7 @@ Register tmp = noreg); // Null allowed. - inline void load_heap_oop(Register d, RegisterOrConstant offs, Register s1 = noreg); + inline void load_heap_oop(Register d, RegisterOrConstant offs, Register s1 = noreg, Label *is_null = NULL); // Encode/decode heap oop. Oop may not be null, else en/decoding goes wrong. // src == d allowed. @@ -593,7 +628,7 @@ void store_klass_gap(Register dst_oop, Register val = noreg); // Will store 0 if val not specified. static int instr_size_for_decode_klass_not_null(); void decode_klass_not_null(Register dst, Register src = noreg); - void encode_klass_not_null(Register dst, Register src = noreg); + Register encode_klass_not_null(Register dst, Register src = noreg); // Load common heap base into register. void reinit_heapbase(Register d, Register tmp = noreg); diff --git a/src/cpu/ppc/vm/macroAssembler_ppc.inline.hpp b/src/cpu/ppc/vm/macroAssembler_ppc.inline.hpp --- a/src/cpu/ppc/vm/macroAssembler_ppc.inline.hpp +++ b/src/cpu/ppc/vm/macroAssembler_ppc.inline.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -333,19 +333,29 @@ } } -inline void MacroAssembler::load_heap_oop(Register d, RegisterOrConstant offs, Register s1) { +inline void MacroAssembler::load_heap_oop(Register d, RegisterOrConstant offs, Register s1, Label *is_null) { if (UseCompressedOops) { lwz(d, offs, s1); - decode_heap_oop(d); + if (is_null != NULL) { + cmpwi(CCR0, d, 0); + beq(CCR0, *is_null); + decode_heap_oop_not_null(d); + } else { + decode_heap_oop(d); + } } else { ld(d, offs, s1); + if (is_null != NULL) { + cmpdi(CCR0, d, 0); + beq(CCR0, *is_null); + } } } inline Register MacroAssembler::encode_heap_oop_not_null(Register d, Register src) { Register current = (src != noreg) ? src : d; // Oop to be compressed is in d if no src provided. if (Universe::narrow_oop_base_overlaps()) { - sub(d, current, R30); + sub_const_optimized(d, current, Universe::narrow_oop_base(), R0); current = d; } if (Universe::narrow_oop_shift() != 0) { @@ -358,7 +368,7 @@ inline Register MacroAssembler::decode_heap_oop_not_null(Register d, Register src) { if (Universe::narrow_oop_base_disjoint() && src != noreg && src != d && Universe::narrow_oop_shift() != 0) { - mr(d, R30); + load_const_optimized(d, Universe::narrow_oop_base(), R0); rldimi(d, src, Universe::narrow_oop_shift(), 32-Universe::narrow_oop_shift()); return d; } @@ -369,7 +379,7 @@ current = d; } if (Universe::narrow_oop_base() != NULL) { - add(d, current, R30); + add_const_optimized(d, current, Universe::narrow_oop_base(), R0); current = d; } return current; // Decoded oop is in this register. 
@@ -377,11 +387,19 @@ inline void MacroAssembler::decode_heap_oop(Register d) { Label isNull; + bool use_isel = false; if (Universe::narrow_oop_base() != NULL) { cmpwi(CCR0, d, 0); - beq(CCR0, isNull); + if (VM_Version::has_isel()) { + use_isel = true; + } else { + beq(CCR0, isNull); + } } decode_heap_oop_not_null(d); + if (use_isel) { + isel_0(d, CCR0, Assembler::equal); + } bind(isNull); } diff --git a/src/cpu/ppc/vm/methodHandles_ppc.hpp b/src/cpu/ppc/vm/methodHandles_ppc.hpp --- a/src/cpu/ppc/vm/methodHandles_ppc.hpp +++ b/src/cpu/ppc/vm/methodHandles_ppc.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2002, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2013 SAP AG. All rights reserved. + * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -27,9 +27,6 @@ // These definitions are inlined into class MethodHandles. // Adapters -//static unsigned int adapter_code_size() { -// return 32*K DEBUG_ONLY(+ 16*K) + (TraceMethodHandles ? 16*K : 0) + (VerifyMethodHandles ? 
32*K : 0); -//} enum /* platform_dependent_constants */ { adapter_code_size = NOT_LP64(16000 DEBUG_ONLY(+ 25000)) LP64_ONLY(32000 DEBUG_ONLY(+ 150000)) }; @@ -45,7 +42,9 @@ static void verify_method_handle(MacroAssembler* _masm, Register mh_reg, Register temp_reg, Register temp2_reg) { - Unimplemented(); + verify_klass(_masm, mh_reg, SystemDictionary::WK_KLASS_ENUM_NAME(java_lang_invoke_MethodHandle), + temp_reg, temp2_reg, + "reference is a MH"); } static void verify_ref_kind(MacroAssembler* _masm, int ref_kind, Register member_reg, Register temp) NOT_DEBUG_RETURN; diff --git a/src/cpu/ppc/vm/ppc.ad b/src/cpu/ppc/vm/ppc.ad --- a/src/cpu/ppc/vm/ppc.ad +++ b/src/cpu/ppc/vm/ppc.ad @@ -447,8 +447,8 @@ R26, R27, R28, -/*R29*/ // global TOC -/*R30*/ // Narrow Oop Base +/*R29,*/ // global TOC + R30, R31 ); @@ -484,58 +484,11 @@ R26, R27, R28, -/*R29*/ -/*R30*/ // Narrow Oop Base +/*R29,*/ + R30, R31 ); -// Complement-required-in-pipeline operands for narrow oops. -reg_class bits32_reg_ro_not_complement ( -/*R0*/ // R0 - R1, // SP - R2, // TOC - R3, - R4, - R5, - R6, - R7, - R8, - R9, - R10, - R11, - R12, -/*R13,*/ // system thread id - R14, - R15, - R16, // R16_thread - R17, - R18, - R19, - R20, - R21, - R22, -/*R23, - R24, - R25, - R26, - R27, - R28,*/ -/*R29,*/ // TODO: let allocator handle TOC!! -/*R30,*/ - R31 -); - -// Complement-required-in-pipeline operands for narrow oops. -// See 64-bit declaration. 
-reg_class bits32_reg_ro_complement ( - R23, - R24, - R25, - R26, - R27, - R28 -); - reg_class rscratch1_bits32_reg(R11); reg_class rscratch2_bits32_reg(R12); reg_class rarg1_bits32_reg(R3); @@ -591,8 +544,8 @@ R26_H, R26, R27_H, R27, R28_H, R28, -/*R29_H, R29*/ -/*R30_H, R30*/ +/*R29_H, R29,*/ + R30_H, R30, R31_H, R31 ); @@ -629,8 +582,8 @@ R26_H, R26, R27_H, R27, R28_H, R28, -/*R29_H, R29*/ -/*R30_H, R30*/ +/*R29_H, R29,*/ + R30_H, R30, R31_H, R31 ); @@ -667,8 +620,8 @@ R26_H, R26, R27_H, R27, R28_H, R28, -/*R29_H, R29*/ -/*R30_H, R30*/ +/*R29_H, R29,*/ + R30_H, R30, R31_H, R31 ); @@ -704,64 +657,11 @@ R26_H, R26, R27_H, R27, R28_H, R28, -/*R29_H, R29*/ // TODO: let allocator handle TOC!! -/*R30_H, R30,*/ +/*R29_H, R29,*/ // TODO: let allocator handle TOC!! + R30_H, R30, R31_H, R31 ); -// Complement-required-in-pipeline operands. -reg_class bits64_reg_ro_not_complement ( -/*R0_H, R0*/ // R0 - R1_H, R1, // SP - R2_H, R2, // TOC - R3_H, R3, - R4_H, R4, - R5_H, R5, - R6_H, R6, - R7_H, R7, - R8_H, R8, - R9_H, R9, - R10_H, R10, - R11_H, R11, - R12_H, R12, -/*R13_H, R13*/ // system thread id - R14_H, R14, - R15_H, R15, - R16_H, R16, // R16_thread - R17_H, R17, - R18_H, R18, - R19_H, R19, - R20_H, R20, - R21_H, R21, - R22_H, R22, -/*R23_H, R23, - R24_H, R24, - R25_H, R25, - R26_H, R26, - R27_H, R27, - R28_H, R28,*/ -/*R29_H, R29*/ // TODO: let allocator handle TOC!! -/*R30_H, R30,*/ - R31_H, R31 -); - -// Complement-required-in-pipeline operands. -// This register mask is used for the trap instructions that implement -// the null checks on AIX. The trap instruction first computes the -// complement of the value it shall trap on. Because of this, the -// instruction can not be scheduled in the same cycle as an other -// instruction reading the normal value of the same register. So we -// force the value to check into 'bits64_reg_ro_not_complement' -// and then copy it to 'bits64_reg_ro_complement' for the trap. 
-reg_class bits64_reg_ro_complement ( - R23_H, R23, - R24_H, R24, - R25_H, R25, - R26_H, R26, - R27_H, R27, - R28_H, R28 -); - // ---------------------------- // Special Class for Condition Code Flags Register @@ -777,6 +677,17 @@ CCR7 ); +reg_class int_flags_ro( + CCR0, + CCR1, + CCR2, + CCR3, + CCR4, + CCR5, + CCR6, + CCR7 +); + reg_class int_flags_CR0(CCR0); reg_class int_flags_CR1(CCR1); reg_class int_flags_CR6(CCR6); @@ -2876,7 +2787,7 @@ // Use release_store for card-marking to ensure that previous // oop-stores are visible before the card-mark change. - enc_class enc_cms_card_mark(memory mem, iRegLdst releaseFieldAddr) %{ + enc_class enc_cms_card_mark(memory mem, iRegLdst releaseFieldAddr, flagsReg crx) %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); // FIXME: Implement this as a cmove and use a fixed condition code // register which is written on every transition to compiled code, @@ -2897,8 +2808,8 @@ // Check CMSCollectorCardTableModRefBSExt::_requires_release and do the // StoreStore barrier conditionally. 
__ lwz(R0, 0, $releaseFieldAddr$$Register); - __ cmpwi(CCR0, R0, 0); - __ beq_predict_taken(CCR0, skip_storestore); + __ cmpwi($crx$$CondRegister, R0, 0); + __ beq_predict_taken($crx$$CondRegister, skip_storestore); #endif __ li(R0, 0); __ membar(Assembler::StoreStore); @@ -3108,7 +3019,7 @@ nodes->push(n2); %} - enc_class enc_cmove_reg(iRegIdst dst, flagsReg crx, iRegIsrc src, cmpOp cmp) %{ + enc_class enc_cmove_reg(iRegIdst dst, flagsRegSrc crx, iRegIsrc src, cmpOp cmp) %{ // TODO: PPC port $archOpcode(ppc64Opcode_cmove); MacroAssembler _masm(&cbuf); @@ -3123,7 +3034,7 @@ __ bind(done); %} - enc_class enc_cmove_imm(iRegIdst dst, flagsReg crx, immI16 src, cmpOp cmp) %{ + enc_class enc_cmove_imm(iRegIdst dst, flagsRegSrc crx, immI16 src, cmpOp cmp) %{ // TODO: PPC port $archOpcode(ppc64Opcode_cmove); MacroAssembler _masm(&cbuf); @@ -3269,7 +3180,7 @@ __ bind(done); %} - enc_class enc_cmove_bso_stackSlotL(iRegLdst dst, flagsReg crx, stackSlotL mem ) %{ + enc_class enc_cmove_bso_stackSlotL(iRegLdst dst, flagsRegSrc crx, stackSlotL mem ) %{ // TODO: PPC port $archOpcode(ppc64Opcode_cmove); MacroAssembler _masm(&cbuf); @@ -3281,7 +3192,7 @@ __ bind(done); %} - enc_class enc_bc(flagsReg crx, cmpOp cmp, Label lbl) %{ + enc_class enc_bc(flagsRegSrc crx, cmpOp cmp, Label lbl) %{ // TODO: PPC port $archOpcode(ppc64Opcode_bc); MacroAssembler _masm(&cbuf); @@ -3309,7 +3220,7 @@ l); %} - enc_class enc_bc_far(flagsReg crx, cmpOp cmp, Label lbl) %{ + enc_class enc_bc_far(flagsRegSrc crx, cmpOp cmp, Label lbl) %{ // The scheduler doesn't know about branch shortening, so we set the opcode // to ppc64Opcode_bc in order to hide this detail from the scheduler. // TODO: PPC port $archOpcode(ppc64Opcode_bc); @@ -3341,7 +3252,7 @@ %} // Branch used with Power6 scheduling (can be shortened without changing the node). 
- enc_class enc_bc_short_far(flagsReg crx, cmpOp cmp, Label lbl) %{ + enc_class enc_bc_short_far(flagsRegSrc crx, cmpOp cmp, Label lbl) %{ // The scheduler doesn't know about branch shortening, so we set the opcode // to ppc64Opcode_bc in order to hide this detail from the scheduler. // TODO: PPC port $archOpcode(ppc64Opcode_bc); @@ -4700,6 +4611,15 @@ interface(REG_INTER); %} +operand flagsRegSrc() %{ + constraint(ALLOC_IN_RC(int_flags_ro)); + match(RegFlags); + match(flagsReg); + match(flagsRegCR0); + format %{ %} + interface(REG_INTER); +%} + // Condition Code Flag Register CR0 operand flagsRegCR0() %{ constraint(ALLOC_IN_RC(int_flags_CR0)); @@ -4783,6 +4703,13 @@ predicate(false /* TODO: PPC port MatchDecodeNodes*/); constraint(ALLOC_IN_RC(bits32_reg_ro)); match(DecodeN reg); + format %{ "$reg" %} + interface(REG_INTER) +%} + +operand iRegN2P_klass(iRegNsrc reg) %{ + predicate(Universe::narrow_klass_base() == NULL && Universe::narrow_klass_shift() == 0); + constraint(ALLOC_IN_RC(bits32_reg_ro)); match(DecodeNKlass reg); format %{ "$reg" %} interface(REG_INTER) @@ -4839,6 +4766,19 @@ predicate(false /* TODO: PPC port MatchDecodeNodes*/); constraint(ALLOC_IN_RC(bits64_reg_ro)); match(DecodeN reg); + op_cost(100); + format %{ "[$reg]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0x0); + scale(0x0); + disp(0x0); + %} +%} + +operand indirectNarrow_klass(iRegNsrc reg) %{ + predicate(Universe::narrow_klass_base() == NULL && Universe::narrow_klass_shift() == 0); + constraint(ALLOC_IN_RC(bits64_reg_ro)); match(DecodeNKlass reg); op_cost(100); format %{ "[$reg]" %} @@ -4855,6 +4795,19 @@ predicate(false /* TODO: PPC port MatchDecodeNodes*/); constraint(ALLOC_IN_RC(bits64_reg_ro)); match(AddP (DecodeN reg) offset); + op_cost(100); + format %{ "[$reg + $offset]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0x0); + scale(0x0); + disp($offset); + %} +%} + +operand indOffset16Narrow_klass(iRegNsrc reg, immL16 offset) %{ + 
predicate(Universe::narrow_klass_base() == NULL && Universe::narrow_klass_shift() == 0); + constraint(ALLOC_IN_RC(bits64_reg_ro)); match(AddP (DecodeNKlass reg) offset); op_cost(100); format %{ "[$reg + $offset]" %} @@ -4871,6 +4824,19 @@ predicate(false /* TODO: PPC port MatchDecodeNodes*/); constraint(ALLOC_IN_RC(bits64_reg_ro)); match(AddP (DecodeN reg) offset); + op_cost(100); + format %{ "[$reg + $offset]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0x0); + scale(0x0); + disp($offset); + %} +%} + +operand indOffset16NarrowAlg4_klass(iRegNsrc reg, immL16Alg4 offset) %{ + predicate(Universe::narrow_klass_base() == NULL && Universe::narrow_klass_shift() == 0); + constraint(ALLOC_IN_RC(bits64_reg_ro)); match(AddP (DecodeNKlass reg) offset); op_cost(100); format %{ "[$reg + $offset]" %} @@ -4998,9 +4964,9 @@ // encoding and format. The classic case of this is memory operands. // Indirect is not included since its use is limited to Compare & Swap. -opclass memory(indirect, indOffset16 /*, indIndex, tlsReference*/, indirectNarrow, indOffset16Narrow); +opclass memory(indirect, indOffset16 /*, indIndex, tlsReference*/, indirectNarrow, indirectNarrow_klass, indOffset16Narrow, indOffset16Narrow_klass); // Memory operand where offsets are 4-aligned. Required for ld, std. -opclass memoryAlg4(indirect, indOffset16Alg4, indirectNarrow, indOffset16NarrowAlg4); +opclass memoryAlg4(indirect, indOffset16Alg4, indirectNarrow, indOffset16NarrowAlg4, indOffset16NarrowAlg4_klass); opclass indirectMemory(indirect, indirectNarrow); // Special opclass for I and ConvL2I. @@ -5009,7 +4975,7 @@ // Operand classes to match encode and decode. iRegN_P2N is only used // for storeN. I have never seen an encode node elsewhere. 
opclass iRegN_P2N(iRegNsrc, iRegP2N); -opclass iRegP_N2P(iRegPsrc, iRegN2P); +opclass iRegP_N2P(iRegPsrc, iRegN2P, iRegN2P_klass); //----------PIPELINE----------------------------------------------------------- @@ -5593,6 +5559,19 @@ ins_pipe(pipe_class_memory); %} +instruct loadN2P_klass_unscaled(iRegPdst dst, memory mem) %{ + match(Set dst (DecodeNKlass (LoadNKlass mem))); + // SAPJVM GL 2014-05-21 Differs. + predicate(Universe::narrow_klass_base() == NULL && Universe::narrow_klass_shift() == 0 && + _kids[0]->_leaf->as_Load()->is_unordered()); + ins_cost(MEMORY_REF_COST); + + format %{ "LWZ $dst, $mem \t// DecodeN (unscaled)" %} + size(4); + ins_encode( enc_lwz(dst, mem) ); + ins_pipe(pipe_class_memory); +%} + // Load Pointer instruct loadP(iRegPdst dst, memoryAlg4 mem) %{ match(Set dst (LoadP mem)); @@ -5669,8 +5648,9 @@ %} // Load Float acquire. -instruct loadF_ac(regF dst, memory mem) %{ +instruct loadF_ac(regF dst, memory mem, flagsRegCR0 cr0) %{ match(Set dst (LoadF mem)); + effect(TEMP cr0); ins_cost(3*MEMORY_REF_COST); format %{ "LFS $dst, $mem \t// acquire\n\t" @@ -5705,8 +5685,9 @@ %} // Load Double - aligned acquire. 
-instruct loadD_ac(regD dst, memory mem) %{ +instruct loadD_ac(regD dst, memory mem, flagsRegCR0 cr0) %{ match(Set dst (LoadD mem)); + effect(TEMP cr0); ins_cost(3*MEMORY_REF_COST); format %{ "LFD $dst, $mem \t// acquire\n\t" @@ -6034,11 +6015,10 @@ instruct loadBase(iRegLdst dst) %{ effect(DEF dst); - format %{ "MR $dst, r30_heapbase" %} - size(4); - ins_encode %{ - // TODO: PPC port $archOpcode(ppc64Opcode_or); - __ mr($dst$$Register, R30); + format %{ "LoadConst $dst, heapbase" %} + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); + __ load_const_optimized($dst$$Register, Universe::narrow_oop_base(), R0); %} ins_pipe(pipe_class_default); %} @@ -6114,7 +6094,7 @@ effect(TEMP src2); ins_cost(DEFAULT_COST); - format %{ "ORI $dst, $src1, $src2 \t// narrow klass lo" %} + format %{ "ORI $dst, $src1, $src2 \t// narrow klass lo" %} size(4); ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_ori); @@ -6563,8 +6543,9 @@ // do a releasing store. For this, it gets the address of // CMSCollectorCardTableModRefBSExt::_requires_release as input. // (Using releaseFieldAddr in the match rule is a hack.) 
-instruct storeCM_CMS(memory mem, iRegLdst releaseFieldAddr) %{ +instruct storeCM_CMS(memory mem, iRegLdst releaseFieldAddr, flagsReg crx) %{ match(Set mem (StoreCM mem releaseFieldAddr)); + effect(TEMP crx); predicate(false); ins_cost(MEMORY_REF_COST); @@ -6572,7 +6553,7 @@ ins_cannot_rematerialize(true); format %{ "STB #0, $mem \t// CMS card-mark byte (must be 0!), checking requires_release in [$releaseFieldAddr]" %} - ins_encode( enc_cms_card_mark(mem, releaseFieldAddr) ); + ins_encode( enc_cms_card_mark(mem, releaseFieldAddr, crx) ); ins_pipe(pipe_class_memory); %} @@ -6589,8 +6570,9 @@ expand %{ immL baseImm %{ 0 /* TODO: PPC port (jlong)CMSCollectorCardTableModRefBSExt::requires_release_address() */ %} iRegLdst releaseFieldAddress; + flagsReg crx; loadConL_Ex(releaseFieldAddress, baseImm); - storeCM_CMS(mem, releaseFieldAddress); + storeCM_CMS(mem, releaseFieldAddress, crx); %} %} @@ -6639,39 +6621,34 @@ predicate(false); format %{ "SUB $dst, $src, oop_base \t// encode" %} - size(4); - ins_encode %{ - // TODO: PPC port $archOpcode(ppc64Opcode_subf); - __ subf($dst$$Register, R30, $src$$Register); + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); + __ sub_const_optimized($dst$$Register, $src$$Register, Universe::narrow_oop_base(), R0); %} ins_pipe(pipe_class_default); %} // Conditional sub base. -instruct cond_sub_base(iRegNdst dst, flagsReg crx, iRegPsrc src1) %{ +instruct cond_sub_base(iRegNdst dst, flagsRegSrc crx, iRegPsrc src1) %{ // The match rule is needed to make it a 'MachTypeNode'! match(Set dst (EncodeP (Binary crx src1))); predicate(false); - ins_variable_size_depending_on_alignment(true); - format %{ "BEQ $crx, done\n\t" - "SUB $dst, $src1, R30 \t// encode: subtract base if != NULL\n" + "SUB $dst, $src1, heapbase \t// encode: subtract base if != NULL\n" "done:" %} - size(false /* TODO: PPC PORT (InsertEndGroupPPC64 && Compile::current()->do_hb_scheduling())*/ ? 
12 : 8); - ins_encode %{ - // TODO: PPC port $archOpcode(ppc64Opcode_cmove); + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); Label done; __ beq($crx$$CondRegister, done); - __ subf($dst$$Register, R30, $src1$$Register); - // TODO PPC port __ endgroup_if_needed(_size == 12); + __ sub_const_optimized($dst$$Register, $src1$$Register, Universe::narrow_oop_base(), R0); __ bind(done); %} ins_pipe(pipe_class_default); %} // Power 7 can use isel instruction -instruct cond_set_0_oop(iRegNdst dst, flagsReg crx, iRegPsrc src1) %{ +instruct cond_set_0_oop(iRegNdst dst, flagsRegSrc crx, iRegPsrc src1) %{ // The match rule is needed to make it a 'MachTypeNode'! match(Set dst (EncodeP (Binary crx src1))); predicate(false); @@ -6777,42 +6754,37 @@ match(Set dst (DecodeN src)); predicate(false); - format %{ "ADD $dst, $src, R30 \t// DecodeN, add oop base" %} - size(4); - ins_encode %{ - // TODO: PPC port $archOpcode(ppc64Opcode_add); - __ add($dst$$Register, $src$$Register, R30); + format %{ "ADD $dst, $src, heapbase \t// DecodeN, add oop base" %} + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); + __ add_const_optimized($dst$$Register, $src$$Register, Universe::narrow_oop_base(), R0); %} ins_pipe(pipe_class_default); %} // conditianal add base for expand -instruct cond_add_base(iRegPdst dst, flagsReg crx, iRegPsrc src1) %{ +instruct cond_add_base(iRegPdst dst, flagsRegSrc crx, iRegPsrc src) %{ // The match rule is needed to make it a 'MachTypeNode'! // NOTICE that the rule is nonsense - we just have to make sure that: // - _matrule->_rChild->_opType == "DecodeN" (see InstructForm::captures_bottom_type() in formssel.cpp) // - we have to match 'crx' to avoid an "illegal USE of non-input: flagsReg crx" error in ADLC. 
- match(Set dst (DecodeN (Binary crx src1))); + match(Set dst (DecodeN (Binary crx src))); predicate(false); - ins_variable_size_depending_on_alignment(true); - format %{ "BEQ $crx, done\n\t" - "ADD $dst, $src1, R30 \t// DecodeN: add oop base if $src1 != NULL\n" + "ADD $dst, $src, heapbase \t// DecodeN: add oop base if $src != NULL\n" "done:" %} - size(false /* TODO: PPC PORT (InsertEndGroupPPC64 && Compile::current()->do_hb_scheduling()) */? 12 : 8); - ins_encode %{ - // TODO: PPC port $archOpcode(ppc64Opcode_cmove); + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); Label done; __ beq($crx$$CondRegister, done); - __ add($dst$$Register, $src1$$Register, R30); - // TODO PPC port __ endgroup_if_needed(_size == 12); + __ add_const_optimized($dst$$Register, $src$$Register, Universe::narrow_oop_base(), R0); __ bind(done); %} ins_pipe(pipe_class_default); %} -instruct cond_set_0_ptr(iRegPdst dst, flagsReg crx, iRegPsrc src1) %{ +instruct cond_set_0_ptr(iRegPdst dst, flagsRegSrc crx, iRegPsrc src1) %{ // The match rule is needed to make it a 'MachTypeNode'! 
// NOTICE that the rule is nonsense - we just have to make sure that: // - _matrule->_rChild->_opType == "DecodeN" (see InstructForm::captures_bottom_type() in formssel.cpp) @@ -6888,7 +6860,7 @@ Universe::narrow_oop_base_disjoint()); ins_cost(DEFAULT_COST); - format %{ "MOV $dst, R30 \t\n" + format %{ "MOV $dst, heapbase \t\n" "RLDIMI $dst, $src, shift, 32-shift \t// decode with disjoint base" %} postalloc_expand %{ loadBaseNode *n1 = new loadBaseNode(); @@ -6946,7 +6918,7 @@ assert(ra_->is_oop(this) == true, "A decodeN node must produce an oop!"); ra_->set_oop(n_cond_set, true); - + ra_->set_pair(n1->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); ra_->set_pair(n_compare->_idx, ra_->get_reg_second(n_crx), ra_->get_reg_first(n_crx)); ra_->set_pair(n2->_idx, ra_->get_reg_second(this), ra_->get_reg_first(this)); @@ -7303,7 +7275,7 @@ //----------Conditional Move--------------------------------------------------- // Cmove using isel. -instruct cmovI_reg_isel(cmpOp cmp, flagsReg crx, iRegIdst dst, iRegIsrc src) %{ +instruct cmovI_reg_isel(cmpOp cmp, flagsRegSrc crx, iRegIdst dst, iRegIsrc src) %{ match(Set dst (CMoveI (Binary cmp crx) (Binary dst src))); predicate(VM_Version::has_isel()); ins_cost(DEFAULT_COST); @@ -7321,7 +7293,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovI_reg(cmpOp cmp, flagsReg crx, iRegIdst dst, iRegIsrc src) %{ +instruct cmovI_reg(cmpOp cmp, flagsRegSrc crx, iRegIdst dst, iRegIsrc src) %{ match(Set dst (CMoveI (Binary cmp crx) (Binary dst src))); predicate(!VM_Version::has_isel()); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7335,7 +7307,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovI_imm(cmpOp cmp, flagsReg crx, iRegIdst dst, immI16 src) %{ +instruct cmovI_imm(cmpOp cmp, flagsRegSrc crx, iRegIdst dst, immI16 src) %{ match(Set dst (CMoveI (Binary cmp crx) (Binary dst src))); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7349,7 +7321,7 @@ %} // Cmove using isel. 
-instruct cmovL_reg_isel(cmpOp cmp, flagsReg crx, iRegLdst dst, iRegLsrc src) %{ +instruct cmovL_reg_isel(cmpOp cmp, flagsRegSrc crx, iRegLdst dst, iRegLsrc src) %{ match(Set dst (CMoveL (Binary cmp crx) (Binary dst src))); predicate(VM_Version::has_isel()); ins_cost(DEFAULT_COST); @@ -7367,7 +7339,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovL_reg(cmpOp cmp, flagsReg crx, iRegLdst dst, iRegLsrc src) %{ +instruct cmovL_reg(cmpOp cmp, flagsRegSrc crx, iRegLdst dst, iRegLsrc src) %{ match(Set dst (CMoveL (Binary cmp crx) (Binary dst src))); predicate(!VM_Version::has_isel()); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7381,7 +7353,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovL_imm(cmpOp cmp, flagsReg crx, iRegLdst dst, immL16 src) %{ +instruct cmovL_imm(cmpOp cmp, flagsRegSrc crx, iRegLdst dst, immL16 src) %{ match(Set dst (CMoveL (Binary cmp crx) (Binary dst src))); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7395,7 +7367,7 @@ %} // Cmove using isel. -instruct cmovN_reg_isel(cmpOp cmp, flagsReg crx, iRegNdst dst, iRegNsrc src) %{ +instruct cmovN_reg_isel(cmpOp cmp, flagsRegSrc crx, iRegNdst dst, iRegNsrc src) %{ match(Set dst (CMoveN (Binary cmp crx) (Binary dst src))); predicate(VM_Version::has_isel()); ins_cost(DEFAULT_COST); @@ -7414,7 +7386,7 @@ %} // Conditional move for RegN. Only cmov(reg, reg). -instruct cmovN_reg(cmpOp cmp, flagsReg crx, iRegNdst dst, iRegNsrc src) %{ +instruct cmovN_reg(cmpOp cmp, flagsRegSrc crx, iRegNdst dst, iRegNsrc src) %{ match(Set dst (CMoveN (Binary cmp crx) (Binary dst src))); predicate(!VM_Version::has_isel()); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7428,7 +7400,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovN_imm(cmpOp cmp, flagsReg crx, iRegNdst dst, immN_0 src) %{ +instruct cmovN_imm(cmpOp cmp, flagsRegSrc crx, iRegNdst dst, immN_0 src) %{ match(Set dst (CMoveN (Binary cmp crx) (Binary dst src))); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7442,7 +7414,7 @@ %} // Cmove using isel. 
-instruct cmovP_reg_isel(cmpOp cmp, flagsReg crx, iRegPdst dst, iRegPsrc src) %{ +instruct cmovP_reg_isel(cmpOp cmp, flagsRegSrc crx, iRegPdst dst, iRegPsrc src) %{ match(Set dst (CMoveP (Binary cmp crx) (Binary dst src))); predicate(VM_Version::has_isel()); ins_cost(DEFAULT_COST); @@ -7460,7 +7432,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovP_reg(cmpOp cmp, flagsReg crx, iRegPdst dst, iRegP_N2P src) %{ +instruct cmovP_reg(cmpOp cmp, flagsRegSrc crx, iRegPdst dst, iRegP_N2P src) %{ match(Set dst (CMoveP (Binary cmp crx) (Binary dst src))); predicate(!VM_Version::has_isel()); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7474,7 +7446,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovP_imm(cmpOp cmp, flagsReg crx, iRegPdst dst, immP_0 src) %{ +instruct cmovP_imm(cmpOp cmp, flagsRegSrc crx, iRegPdst dst, immP_0 src) %{ match(Set dst (CMoveP (Binary cmp crx) (Binary dst src))); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7487,7 +7459,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovF_reg(cmpOp cmp, flagsReg crx, regF dst, regF src) %{ +instruct cmovF_reg(cmpOp cmp, flagsRegSrc crx, regF dst, regF src) %{ match(Set dst (CMoveF (Binary cmp crx) (Binary dst src))); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7509,7 +7481,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovD_reg(cmpOp cmp, flagsReg crx, regD dst, regD src) %{ +instruct cmovD_reg(cmpOp cmp, flagsRegSrc crx, regD dst, regD src) %{ match(Set dst (CMoveD (Binary cmp crx) (Binary dst src))); ins_cost(DEFAULT_COST+BRANCH_COST); @@ -7542,8 +7514,9 @@ // Mem_ptr must be a memory operand, else this node does not get // Flag_needs_anti_dependence_check set by adlc. If this is not set this node // can be rematerialized which leads to errors. 
-instruct storeLConditional_regP_regL_regL(flagsReg crx, indirect mem_ptr, iRegLsrc oldVal, iRegLsrc newVal) %{ +instruct storeLConditional_regP_regL_regL(flagsReg crx, indirect mem_ptr, iRegLsrc oldVal, iRegLsrc newVal, flagsRegCR0 cr0) %{ match(Set crx (StoreLConditional mem_ptr (Binary oldVal newVal))); + effect(TEMP cr0); format %{ "CMPXCHGD if ($crx = ($oldVal == *$mem_ptr)) *mem_ptr = $newVal; as bool" %} ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); @@ -7560,16 +7533,16 @@ // Mem_ptr must be a memory operand, else this node does not get // Flag_needs_anti_dependence_check set by adlc. If this is not set this node // can be rematerialized which leads to errors. -instruct storePConditional_regP_regP_regP(flagsReg crx, indirect mem_ptr, iRegPsrc oldVal, iRegPsrc newVal) %{ - match(Set crx (StorePConditional mem_ptr (Binary oldVal newVal))); - format %{ "CMPXCHGD if ($crx = ($oldVal == *$mem_ptr)) *mem_ptr = $newVal; as bool" %} - ins_encode %{ - // TODO: PPC port $archOpcode(ppc64Opcode_compound); - __ cmpxchgd($crx$$CondRegister, R0, $oldVal$$Register, $newVal$$Register, $mem_ptr$$Register, - MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(), - noreg, NULL, true); - %} - ins_pipe(pipe_class_default); +instruct storePConditional_regP_regP_regP(flagsRegCR0 cr0, indirect mem_ptr, iRegPsrc oldVal, iRegPsrc newVal) %{ + match(Set cr0 (StorePConditional mem_ptr (Binary oldVal newVal))); + ins_cost(2*MEMORY_REF_COST); + + format %{ "STDCX_ if ($cr0 = ($oldVal == *$mem_ptr)) *mem_ptr = $newVal; as bool" %} + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_stdcx_); + __ stdcx_($newVal$$Register, $mem_ptr$$Register); + %} + ins_pipe(pipe_class_memory); %} // Implement LoadPLocked. Must be ordered against changes of the memory location @@ -7577,13 +7550,14 @@ // Don't know whether this is ever used. 
instruct loadPLocked(iRegPdst dst, memory mem) %{ match(Set dst (LoadPLocked mem)); - ins_cost(MEMORY_REF_COST); - - format %{ "LD $dst, $mem \t// loadPLocked\n\t" - "TWI $dst\n\t" - "ISYNC" %} - size(12); - ins_encode( enc_ld_ac(dst, mem) ); + ins_cost(2*MEMORY_REF_COST); + + format %{ "LDARX $dst, $mem \t// loadPLocked\n\t" %} + size(4); + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_ldarx); + __ ldarx($dst$$Register, $mem$$Register, MacroAssembler::cmpxchgx_hint_atomic_update()); + %} ins_pipe(pipe_class_memory); %} @@ -7593,8 +7567,9 @@ // (CompareAndSwap ...)" or "If (CmpI (CompareAndSwap ..))" cannot be // matched. -instruct compareAndSwapI_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src1, iRegIsrc src2) %{ +instruct compareAndSwapI_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src1, iRegIsrc src2, flagsRegCR0 cr0) %{ match(Set res (CompareAndSwapI mem_ptr (Binary src1 src2))); + effect(TEMP cr0); format %{ "CMPXCHGW $res, $mem_ptr, $src1, $src2; as bool" %} // Variable size: instruction count smaller if regs are disjoint. ins_encode %{ @@ -7607,8 +7582,9 @@ ins_pipe(pipe_class_default); %} -instruct compareAndSwapN_regP_regN_regN(iRegIdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2) %{ +instruct compareAndSwapN_regP_regN_regN(iRegIdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{ match(Set res (CompareAndSwapN mem_ptr (Binary src1 src2))); + effect(TEMP cr0); format %{ "CMPXCHGW $res, $mem_ptr, $src1, $src2; as bool" %} // Variable size: instruction count smaller if regs are disjoint. 
ins_encode %{ @@ -7621,8 +7597,9 @@ ins_pipe(pipe_class_default); %} -instruct compareAndSwapL_regP_regL_regL(iRegIdst res, iRegPdst mem_ptr, iRegLsrc src1, iRegLsrc src2) %{ +instruct compareAndSwapL_regP_regL_regL(iRegIdst res, iRegPdst mem_ptr, iRegLsrc src1, iRegLsrc src2, flagsRegCR0 cr0) %{ match(Set res (CompareAndSwapL mem_ptr (Binary src1 src2))); + effect(TEMP cr0); format %{ "CMPXCHGD $res, $mem_ptr, $src1, $src2; as bool" %} // Variable size: instruction count smaller if regs are disjoint. ins_encode %{ @@ -7635,8 +7612,9 @@ ins_pipe(pipe_class_default); %} -instruct compareAndSwapP_regP_regP_regP(iRegIdst res, iRegPdst mem_ptr, iRegPsrc src1, iRegPsrc src2) %{ +instruct compareAndSwapP_regP_regP_regP(iRegIdst res, iRegPdst mem_ptr, iRegPsrc src1, iRegPsrc src2, flagsRegCR0 cr0) %{ match(Set res (CompareAndSwapP mem_ptr (Binary src1 src2))); + effect(TEMP cr0); format %{ "CMPXCHGD $res, $mem_ptr, $src1, $src2; as bool; ptr" %} // Variable size: instruction count smaller if regs are disjoint. ins_encode %{ @@ -7649,48 +7627,54 @@ ins_pipe(pipe_class_default); %} -instruct getAndAddI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src) %{ +instruct getAndAddI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src, flagsRegCR0 cr0) %{ match(Set res (GetAndAddI mem_ptr src)); + effect(TEMP cr0); format %{ "GetAndAddI $res, $mem_ptr, $src" %} // Variable size: instruction count smaller if regs are disjoint. ins_encode( enc_GetAndAddI(res, mem_ptr, src) ); ins_pipe(pipe_class_default); %} -instruct getAndAddL(iRegLdst res, iRegPdst mem_ptr, iRegLsrc src) %{ +instruct getAndAddL(iRegLdst res, iRegPdst mem_ptr, iRegLsrc src, flagsRegCR0 cr0) %{ match(Set res (GetAndAddL mem_ptr src)); + effect(TEMP cr0); format %{ "GetAndAddL $res, $mem_ptr, $src" %} // Variable size: instruction count smaller if regs are disjoint. 
ins_encode( enc_GetAndAddL(res, mem_ptr, src) ); ins_pipe(pipe_class_default); %} -instruct getAndSetI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src) %{ +instruct getAndSetI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src, flagsRegCR0 cr0) %{ match(Set res (GetAndSetI mem_ptr src)); + effect(TEMP cr0); format %{ "GetAndSetI $res, $mem_ptr, $src" %} // Variable size: instruction count smaller if regs are disjoint. ins_encode( enc_GetAndSetI(res, mem_ptr, src) ); ins_pipe(pipe_class_default); %} -instruct getAndSetL(iRegLdst res, iRegPdst mem_ptr, iRegLsrc src) %{ +instruct getAndSetL(iRegLdst res, iRegPdst mem_ptr, iRegLsrc src, flagsRegCR0 cr0) %{ match(Set res (GetAndSetL mem_ptr src)); + effect(TEMP cr0); format %{ "GetAndSetL $res, $mem_ptr, $src" %} // Variable size: instruction count smaller if regs are disjoint. ins_encode( enc_GetAndSetL(res, mem_ptr, src) ); ins_pipe(pipe_class_default); %} -instruct getAndSetP(iRegPdst res, iRegPdst mem_ptr, iRegPsrc src) %{ +instruct getAndSetP(iRegPdst res, iRegPdst mem_ptr, iRegPsrc src, flagsRegCR0 cr0) %{ match(Set res (GetAndSetP mem_ptr src)); + effect(TEMP cr0); format %{ "GetAndSetP $res, $mem_ptr, $src" %} // Variable size: instruction count smaller if regs are disjoint. ins_encode( enc_GetAndSetL(res, mem_ptr, src) ); ins_pipe(pipe_class_default); %} -instruct getAndSetN(iRegNdst res, iRegPdst mem_ptr, iRegNsrc src) %{ +instruct getAndSetN(iRegNdst res, iRegPdst mem_ptr, iRegNsrc src, flagsRegCR0 cr0) %{ match(Set res (GetAndSetN mem_ptr src)); + effect(TEMP cr0); format %{ "GetAndSetN $res, $mem_ptr, $src" %} // Variable size: instruction count smaller if regs are disjoint. ins_encode( enc_GetAndSetI(res, mem_ptr, src) ); @@ -7898,18 +7882,8 @@ %} // Immediate Subtraction -// The compiler converts "x-c0" into "x+ -c0" (see SubINode::Ideal), -// so this rule seems to be unused. 
-instruct subI_reg_imm16(iRegIdst dst, iRegIsrc src1, immI16 src2) %{ - match(Set dst (SubI src1 src2)); - format %{ "SUBI $dst, $src1, $src2" %} - size(4); - ins_encode %{ - // TODO: PPC port $archOpcode(ppc64Opcode_addi); - __ addi($dst$$Register, $src1$$Register, ($src2$$constant) * (-1)); - %} - ins_pipe(pipe_class_default); -%} +// Immediate Subtraction: The compiler converts "x-c0" into "x+ -c0" (see SubLNode::Ideal), +// Don't try to use addi with - $src2$$constant since it can overflow when $src2$$constant == minI16. // SubI from constant (using subfic). instruct subI_imm16_reg(iRegIdst dst, immI16 src1, iRegIsrc src2) %{ @@ -7989,22 +7963,6 @@ ins_pipe(pipe_class_default); %} -// Immediate Subtraction -// The compiler converts "x-c0" into "x+ -c0" (see SubLNode::Ideal), -// so this rule seems to be unused. -// No constant pool entries required. -instruct subL_reg_imm16(iRegLdst dst, iRegLsrc src1, immL16 src2) %{ - match(Set dst (SubL src1 src2)); - - format %{ "SUBI $dst, $src1, $src2 \t// long" %} - size(4); - ins_encode %{ - // TODO: PPC port $archOpcode(ppc64Opcode_addi); - __ addi($dst$$Register, $src1$$Register, ($src2$$constant) * (-1)); - %} - ins_pipe(pipe_class_default); -%} - // Turn the sign-bit of a long into a 64-bit mask, 0x0...0 for // positive longs and 0xF...F for negative ones. 
instruct signmask64I_regL(iRegIdst dst, iRegLsrc src) %{ @@ -8165,7 +8123,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovI_bne_negI_reg(iRegIdst dst, flagsReg crx, iRegIsrc src1) %{ +instruct cmovI_bne_negI_reg(iRegIdst dst, flagsRegSrc crx, iRegIsrc src1) %{ effect(USE_DEF dst, USE src1, USE crx); predicate(false); @@ -8228,7 +8186,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovL_bne_negL_reg(iRegLdst dst, flagsReg crx, iRegLsrc src1) %{ +instruct cmovL_bne_negL_reg(iRegLdst dst, flagsRegSrc crx, iRegLsrc src1) %{ effect(USE_DEF dst, USE src1, USE crx); predicate(false); @@ -8281,7 +8239,7 @@ %} // Long Remainder with registers -instruct modL_reg_reg_Ex(iRegLdst dst, iRegLsrc src1, iRegLsrc src2, flagsRegCR0 cr0) %{ +instruct modL_reg_reg_Ex(iRegLdst dst, iRegLsrc src1, iRegLsrc src2) %{ match(Set dst (ModL src1 src2)); ins_cost(10*DEFAULT_COST); @@ -9011,7 +8969,6 @@ instruct andL_reg_uimm16(iRegLdst dst, iRegLsrc src1, uimmL16 src2, flagsRegCR0 cr0) %{ match(Set dst (AndL src1 src2)); effect(KILL cr0); - ins_cost(DEFAULT_COST); format %{ "ANDI $dst, $src1, $src2 \t// long" %} size(4); @@ -9803,7 +9760,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovI_bso_stackSlotL(iRegIdst dst, flagsReg crx, stackSlotL src) %{ +instruct cmovI_bso_stackSlotL(iRegIdst dst, flagsRegSrc crx, stackSlotL src) %{ // no match-rule, false predicate effect(DEF dst, USE crx, USE src); predicate(false); @@ -9817,7 +9774,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovI_bso_stackSlotL_conLvalue0_Ex(iRegIdst dst, flagsReg crx, stackSlotL mem) %{ +instruct cmovI_bso_stackSlotL_conLvalue0_Ex(iRegIdst dst, flagsRegSrc crx, stackSlotL mem) %{ // no match-rule, false predicate effect(DEF dst, USE crx, USE mem); predicate(false); @@ -9972,7 +9929,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovL_bso_stackSlotL(iRegLdst dst, flagsReg crx, stackSlotL src) %{ +instruct cmovL_bso_stackSlotL(iRegLdst dst, flagsRegSrc crx, stackSlotL src) %{ // no match-rule, false predicate 
effect(DEF dst, USE crx, USE src); predicate(false); @@ -9986,7 +9943,7 @@ ins_pipe(pipe_class_default); %} -instruct cmovL_bso_stackSlotL_conLvalue0_Ex(iRegLdst dst, flagsReg crx, stackSlotL mem) %{ +instruct cmovL_bso_stackSlotL_conLvalue0_Ex(iRegLdst dst, flagsRegSrc crx, stackSlotL mem) %{ // no match-rule, false predicate effect(DEF dst, USE crx, USE mem); predicate(false); @@ -10255,7 +10212,6 @@ size(4); ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_andi_); - // FIXME: avoid andi_ ? __ andi_(R0, $src1$$Register, $src2$$constant); %} ins_pipe(pipe_class_compare); @@ -10302,13 +10258,12 @@ size(4); ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_andi_); - // FIXME: avoid andi_ ? __ andi_(R0, $src1$$Register, $src2$$constant); %} ins_pipe(pipe_class_compare); %} -instruct cmovI_conIvalueMinus1_conIvalue1(iRegIdst dst, flagsReg crx) %{ +instruct cmovI_conIvalueMinus1_conIvalue1(iRegIdst dst, flagsRegSrc crx) %{ // no match-rule, false predicate effect(DEF dst, USE crx); predicate(false); @@ -10332,7 +10287,7 @@ ins_pipe(pipe_class_compare); %} -instruct cmovI_conIvalueMinus1_conIvalue0_conIvalue1_Ex(iRegIdst dst, flagsReg crx) %{ +instruct cmovI_conIvalueMinus1_conIvalue0_conIvalue1_Ex(iRegIdst dst, flagsRegSrc crx) %{ // no match-rule, false predicate effect(DEF dst, USE crx); predicate(false); @@ -10622,8 +10577,9 @@ //----------Float Compares---------------------------------------------------- instruct cmpFUnordered_reg_reg(flagsReg crx, regF src1, regF src2) %{ + // Needs matchrule, see cmpDUnordered. + match(Set crx (CmpF src1 src2)); // no match-rule, false predicate - effect(DEF crx, USE src1, USE src2); predicate(false); format %{ "cmpFUrd $crx, $src1, $src2" %} @@ -10731,8 +10687,14 @@ %} instruct cmpDUnordered_reg_reg(flagsReg crx, regD src1, regD src2) %{ - // no match-rule, false predicate - effect(DEF crx, USE src1, USE src2); + // Needs matchrule so that ideal opcode is Cmp. 
This causes gcm to place the + // node right before the conditional move using it. + // In jck test api/java_awt/geom/QuadCurve2DFloat/index.html#SetCurveTesttestCase7, + // compilation of java.awt.geom.RectangularShape::getBounds()Ljava/awt/Rectangle + // crashed in register allocation where the flags Reg between cmpDUnordered and a + // conditional move was supposed to be spilled. + match(Set crx (CmpD src1 src2)); + // False predicate, shall not be matched. predicate(false); format %{ "cmpFUrd $crx, $src1, $src2" %} @@ -10830,7 +10792,7 @@ %} // Conditional Near Branch -instruct branchCon(cmpOp cmp, flagsReg crx, label lbl) %{ +instruct branchCon(cmpOp cmp, flagsRegSrc crx, label lbl) %{ // Same match rule as `branchConFar'. match(If cmp crx); effect(USE lbl); @@ -10853,7 +10815,7 @@ // expensive. // // Conditional Far Branch -instruct branchConFar(cmpOp cmp, flagsReg crx, label lbl) %{ +instruct branchConFar(cmpOp cmp, flagsRegSrc crx, label lbl) %{ // Same match rule as `branchCon'. match(If cmp crx); effect(USE crx, USE lbl); @@ -10871,7 +10833,7 @@ %} // Conditional Branch used with Power6 scheduler (can be far or short). -instruct branchConSched(cmpOp cmp, flagsReg crx, label lbl) %{ +instruct branchConSched(cmpOp cmp, flagsRegSrc crx, label lbl) %{ // Same match rule as `branchCon'.
match(If cmp crx); effect(USE crx, USE lbl); @@ -10890,7 +10852,7 @@ ins_pipe(pipe_class_default); %} -instruct branchLoopEnd(cmpOp cmp, flagsReg crx, label labl) %{ +instruct branchLoopEnd(cmpOp cmp, flagsRegSrc crx, label labl) %{ match(CountedLoopEnd cmp crx); effect(USE labl); ins_cost(BRANCH_COST); @@ -10904,7 +10866,7 @@ ins_pipe(pipe_class_default); %} -instruct branchLoopEndFar(cmpOp cmp, flagsReg crx, label labl) %{ +instruct branchLoopEndFar(cmpOp cmp, flagsRegSrc crx, label labl) %{ match(CountedLoopEnd cmp crx); effect(USE labl); predicate(!false /* TODO: PPC port HB_Schedule */); @@ -10920,7 +10882,7 @@ %} // Conditional Branch used with Power6 scheduler (can be far or short). -instruct branchLoopEndSched(cmpOp cmp, flagsReg crx, label labl) %{ +instruct branchLoopEndSched(cmpOp cmp, flagsRegSrc crx, label labl) %{ match(CountedLoopEnd cmp crx); effect(USE labl); predicate(false /* TODO: PPC port HB_Schedule */); @@ -10969,13 +10931,14 @@ instruct cmpFastLock(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2, iRegPdst tmp3) %{ match(Set crx (FastLock oop box)); effect(TEMP tmp1, TEMP tmp2, TEMP tmp3); - // TODO PPC port predicate(!UseNewFastLockPPC64 || UseBiasedLocking); + predicate(/*(!UseNewFastLockPPC64 || UseBiasedLocking) &&*/ !Compile::current()->use_rtm()); format %{ "FASTLOCK $oop, $box, $tmp1, $tmp2, $tmp3" %} ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); __ compiler_fast_lock_object($crx$$CondRegister, $oop$$Register, $box$$Register, - $tmp3$$Register, $tmp1$$Register, $tmp2$$Register); + $tmp3$$Register, $tmp1$$Register, $tmp2$$Register, + UseBiasedLocking && !UseOptoBiasInlining); // SAPJVM MD 2014-11-06 UseOptoBiasInlining // If locking was successfull, crx should indicate 'EQ'. // The compiler generates a branch to the runtime call to // _complete_monitor_locking_Java for the case where crx is 'NE'. @@ -10983,15 +10946,58 @@ ins_pipe(pipe_class_compare); %} +// Separate version for TM. 
Use bound register for box to enable USE_KILL. +instruct cmpFastLock_tm(flagsReg crx, iRegPdst oop, rarg2RegP box, iRegPdst tmp1, iRegPdst tmp2, iRegPdst tmp3) %{ + match(Set crx (FastLock oop box)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, USE_KILL box); + predicate(Compile::current()->use_rtm()); + + format %{ "FASTLOCK $oop, $box, $tmp1, $tmp2, $tmp3 (TM)" %} + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); + __ compiler_fast_lock_object($crx$$CondRegister, $oop$$Register, $box$$Register, + $tmp3$$Register, $tmp1$$Register, $tmp2$$Register, + /*Biased Locking*/ false, + _rtm_counters, _stack_rtm_counters, + ((Method*)(ra_->C->method()->constant_encoding()))->method_data(), + /*TM*/ true, ra_->C->profile_rtm()); + // If locking was successfull, crx should indicate 'EQ'. + // The compiler generates a branch to the runtime call to + // _complete_monitor_locking_Java for the case where crx is 'NE'. + %} + ins_pipe(pipe_class_compare); +%} + instruct cmpFastUnlock(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2, iRegPdst tmp3) %{ match(Set crx (FastUnlock oop box)); effect(TEMP tmp1, TEMP tmp2, TEMP tmp3); + predicate(!Compile::current()->use_rtm()); format %{ "FASTUNLOCK $oop, $box, $tmp1, $tmp2" %} ins_encode %{ // TODO: PPC port $archOpcode(ppc64Opcode_compound); __ compiler_fast_unlock_object($crx$$CondRegister, $oop$$Register, $box$$Register, - $tmp3$$Register, $tmp1$$Register, $tmp2$$Register); + $tmp3$$Register, $tmp1$$Register, $tmp2$$Register, + UseBiasedLocking && !UseOptoBiasInlining, + false); + // If unlocking was successfull, crx should indicate 'EQ'. + // The compiler generates a branch to the runtime call to + // _complete_monitor_unlocking_Java for the case where crx is 'NE'. 
+ %} + ins_pipe(pipe_class_compare); +%} + +instruct cmpFastUnlock_tm(flagsReg crx, iRegPdst oop, iRegPdst box, iRegPdst tmp1, iRegPdst tmp2, iRegPdst tmp3) %{ + match(Set crx (FastUnlock oop box)); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3); + predicate(Compile::current()->use_rtm()); + + format %{ "FASTUNLOCK $oop, $box, $tmp1, $tmp2 (TM)" %} + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); + __ compiler_fast_unlock_object($crx$$CondRegister, $oop$$Register, $box$$Register, + $tmp3$$Register, $tmp1$$Register, $tmp2$$Register, + /*Biased Locking*/ false, /*TM*/ true); // If unlocking was successfull, crx should indicate 'EQ'. // The compiler generates a branch to the runtime call to // _complete_monitor_unlocking_Java for the case where crx is 'NE'. @@ -11658,6 +11664,66 @@ ins_pipe(pipe_class_default); %} + +//----------Overflow Math Instructions----------------------------------------- + +// Note that we have to make sure that XER.SO is reset before using overflow instructions. +// Simple Overflow operations can be matched by very few instructions (e.g. addExact: xor, and_, bc). +// Seems like only Long intrinsics have an advantage. (The only expensive one is OverflowMulL.)
+ +instruct overflowAddL_reg_reg(flagsRegCR0 cr0, iRegLsrc op1, iRegLsrc op2) %{ + match(Set cr0 (OverflowAddL op1 op2)); + + format %{ "addo_ R0, $op1, $op2\t# overflow check long" %} + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); + __ li(R0, 0); + __ mtxer(R0); // clear XER.SO + __ addo_(R0, $op1$$Register, $op2$$Register); // value lands in R0; only cr0 is the matched output + %} + ins_pipe(pipe_class_default); +%} + +instruct overflowSubL_reg_reg(flagsRegCR0 cr0, iRegLsrc op1, iRegLsrc op2) %{ + match(Set cr0 (OverflowSubL op1 op2)); + + format %{ "subfo_ R0, $op2, $op1\t# overflow check long" %} + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); + __ li(R0, 0); + __ mtxer(R0); // clear XER.SO + __ subfo_(R0, $op2$$Register, $op1$$Register); // value lands in R0; only cr0 is the matched output + %} + ins_pipe(pipe_class_default); +%} + +instruct overflowNegL_reg(flagsRegCR0 cr0, immL_0 zero, iRegLsrc op2) %{ + match(Set cr0 (OverflowSubL zero op2)); + + format %{ "nego_ R0, $op2\t# overflow check long" %} + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); + __ li(R0, 0); + __ mtxer(R0); // clear XER.SO + __ nego_(R0, $op2$$Register); // value lands in R0; only cr0 is the matched output + %} + ins_pipe(pipe_class_default); +%} + +instruct overflowMulL_reg_reg(flagsRegCR0 cr0, iRegLsrc op1, iRegLsrc op2) %{ + match(Set cr0 (OverflowMulL op1 op2)); + + format %{ "mulldo_ R0, $op1, $op2\t# overflow check long" %} + ins_encode %{ + // TODO: PPC port $archOpcode(ppc64Opcode_compound); + __ li(R0, 0); + __ mtxer(R0); // clear XER.SO + __ mulldo_(R0, $op1$$Register, $op2$$Register); // value lands in R0; only cr0 is the matched output + %} + ins_pipe(pipe_class_default); +%} + + // ============================================================================ // Safepoint Instruction diff --git a/src/cpu/ppc/vm/register_definitions_ppc.cpp b/src/cpu/ppc/vm/register_definitions_ppc.cpp --- a/src/cpu/ppc/vm/register_definitions_ppc.cpp +++ b/src/cpu/ppc/vm/register_definitions_ppc.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2013 SAP AG.
All rights reserved. + * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,19 +23,10 @@ * */ -// make sure the defines don't screw up the declarations later on in this file +// Make sure the defines don't screw up the declarations later on in this file. #define DONT_USE_REGISTER_DEFINES -#include "precompiled.hpp" -#include "asm/macroAssembler.hpp" #include "asm/register.hpp" -#include "register_ppc.hpp" -#ifdef TARGET_ARCH_MODEL_ppc_32 -# include "interp_masm_ppc_32.hpp" -#endif -#ifdef TARGET_ARCH_MODEL_ppc_64 -# include "interp_masm_ppc_64.hpp" -#endif REGISTER_DEFINITION(Register, noreg); diff --git a/src/cpu/ppc/vm/relocInfo_ppc.cpp b/src/cpu/ppc/vm/relocInfo_ppc.cpp --- a/src/cpu/ppc/vm/relocInfo_ppc.cpp +++ b/src/cpu/ppc/vm/relocInfo_ppc.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2013 SAP AG. All rights reserved. + * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -25,14 +25,12 @@ #include "precompiled.hpp" #include "asm/assembler.inline.hpp" -#include "assembler_ppc.inline.hpp" #include "code/relocInfo.hpp" #include "nativeInst_ppc.hpp" #include "oops/oop.inline.hpp" #include "runtime/safepoint.hpp" void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) { - bool copy_back_to_oop_pool = true; // TODO: PPC port // The following comment is from the declaration of DataRelocation: // // "The "o" (displacement) argument is relevant only to split relocations diff --git a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp --- a/src/cpu/ppc/vm/sharedRuntime_ppc.cpp +++ b/src/cpu/ppc/vm/sharedRuntime_ppc.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -28,6 +28,7 @@ #include "code/debugInfoRec.hpp" #include "code/icBuffer.hpp" #include "code/vtableStubs.hpp" +#include "frame_ppc.hpp" #include "interpreter/interpreter.hpp" #include "interpreter/interp_masm.hpp" #include "oops/compiledICHolder.hpp" @@ -194,8 +195,8 @@ RegisterSaver_LiveIntReg( R27 ), RegisterSaver_LiveIntReg( R28 ), RegisterSaver_LiveIntReg( R29 ), - RegisterSaver_LiveIntReg( R31 ), - RegisterSaver_LiveIntReg( R30 ), // r30 must be the last register + RegisterSaver_LiveIntReg( R30 ), + RegisterSaver_LiveIntReg( R31 ), // must be the last register (see save/restore functions below) }; OopMap* RegisterSaver::push_frame_reg_args_and_save_live_registers(MacroAssembler* masm, @@ -229,29 +230,30 @@ BLOCK_COMMENT("push_frame_reg_args_and_save_live_registers {"); - // Save r30 in the last slot of the not yet pushed frame so that we + // Save r31 in the last slot of the not yet pushed frame so that we // can use it as scratch reg. - __ std(R30, -reg_size, R1_SP); + __ std(R31, -reg_size, R1_SP); assert(-reg_size == register_save_offset - frame_size_in_bytes + ((regstosave_num-1)*reg_size), "consistency check"); // save the flags // Do the save_LR_CR by hand and adjust the return pc if requested. 
- __ mfcr(R30); - __ std(R30, _abi(cr), R1_SP); + __ mfcr(R31); + __ std(R31, _abi(cr), R1_SP); switch (return_pc_location) { - case return_pc_is_lr: __ mflr(R30); break; - case return_pc_is_r4: __ mr(R30, R4); break; + case return_pc_is_lr: __ mflr(R31); break; + case return_pc_is_r4: __ mr(R31, R4); break; case return_pc_is_thread_saved_exception_pc: - __ ld(R30, thread_(saved_exception_pc)); break; + __ ld(R31, thread_(saved_exception_pc)); break; default: ShouldNotReachHere(); } - if (return_pc_adjustment != 0) - __ addi(R30, R30, return_pc_adjustment); - __ std(R30, _abi(lr), R1_SP); + if (return_pc_adjustment != 0) { + __ addi(R31, R31, return_pc_adjustment); + } + __ std(R31, _abi(lr), R1_SP); // push a new frame - __ push_frame(frame_size_in_bytes, R30); + __ push_frame(frame_size_in_bytes, R31); // save all registers (ints and floats) offset = register_save_offset; @@ -261,7 +263,7 @@ switch (reg_type) { case RegisterSaver::int_reg: { - if (reg_num != 30) { // We spilled R30 right at the beginning. + if (reg_num != 31) { // We spilled R31 right at the beginning. __ std(as_Register(reg_num), offset, R1_SP); } break; @@ -272,8 +274,8 @@ } case RegisterSaver::special_reg: { if (reg_num == SR_CTR_SpecialRegisterEnumValue) { - __ mfctr(R30); - __ std(R30, offset, R1_SP); + __ mfctr(R31); + __ std(R31, offset, R1_SP); } else { Unimplemented(); } @@ -321,7 +323,7 @@ switch (reg_type) { case RegisterSaver::int_reg: { - if (reg_num != 30) // R30 restored at the end, it's the tmp reg! + if (reg_num != 31) // R31 restored at the end, it's the tmp reg! __ ld(as_Register(reg_num), offset, R1_SP); break; } @@ -332,8 +334,8 @@ case RegisterSaver::special_reg: { if (reg_num == SR_CTR_SpecialRegisterEnumValue) { if (restore_ctr) { // Nothing to do here if ctr already contains the next address. 
- __ ld(R30, offset, R1_SP); - __ mtctr(R30); + __ ld(R31, offset, R1_SP); + __ mtctr(R31); } } else { Unimplemented(); @@ -350,10 +352,10 @@ __ pop_frame(); // restore the flags - __ restore_LR_CR(R30); + __ restore_LR_CR(R31); // restore scratch register's value - __ ld(R30, -reg_size, R1_SP); + __ ld(R31, -reg_size, R1_SP); BLOCK_COMMENT("} restore_live_registers_and_pop_frame"); } @@ -2021,6 +2023,8 @@ __ push_frame(frame_size_in_bytes, r_temp_1); // Push the c2n adapter's frame. frame_done_pc = (intptr_t)__ pc(); + __ verify_thread(); + // Native nmethod wrappers never take possesion of the oop arguments. // So the caller will gc the arguments. // The only thing we need an oopMap for is if the call is static. @@ -2594,7 +2598,7 @@ } uint SharedRuntime::out_preserve_stack_slots() { -#ifdef COMPILER2 +#if defined(COMPILER1) || defined(COMPILER2) return frame::jit_out_preserve_size / VMRegImpl::stack_slot_size; #else return 0; @@ -2868,11 +2872,6 @@ __ std(R0, in_bytes(JavaThread::exception_oop_offset()), R16_thread); __ BIND(skip_restore_excp); - // reload narrro_oop_base - if (UseCompressedOops && Universe::narrow_oop_base() != 0) { - __ load_const_optimized(R30, Universe::narrow_oop_base()); - } - __ pop_frame(); // stack: (deoptee, optional i2c, caller of deoptee, ...). diff --git a/src/cpu/ppc/vm/stubGenerator_ppc.cpp b/src/cpu/ppc/vm/stubGenerator_ppc.cpp --- a/src/cpu/ppc/vm/stubGenerator_ppc.cpp +++ b/src/cpu/ppc/vm/stubGenerator_ppc.cpp @@ -261,9 +261,6 @@ // global toc register __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1); - // Load narrow oop base. - __ reinit_heapbase(R30, R11_scratch1); - // Remember the senderSP so we interpreter can pop c2i arguments off of the stack // when called via a c2i. @@ -418,6 +415,23 @@ // or native call stub. The pending exception in Thread is // converted into a Java-level exception. // + // Read: + // + // LR: The pc the runtime library callee wants to return to. 
+ // Since the exception occurred in the callee, the return pc + // from the point of view of Java is the exception pc. + // thread: Needed for method handles. + // + // Invalidate: + // + // volatile registers (except below). + // + // Update: + // + // R4_ARG2: exception + // + // (LR is unchanged and is live out). + // address generate_forward_exception() { StubCodeMark mark(this, "StubRoutines", "forward_exception"); address start = __ pc(); @@ -1256,9 +1270,9 @@ Register tmp3 = R8_ARG6; #if defined(ABI_ELFv2) - address nooverlap_target = aligned ? - StubRoutines::arrayof_jbyte_disjoint_arraycopy() : - StubRoutines::jbyte_disjoint_arraycopy(); + address nooverlap_target = aligned ? + StubRoutines::arrayof_jbyte_disjoint_arraycopy() : + StubRoutines::jbyte_disjoint_arraycopy(); #else address nooverlap_target = aligned ? ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() : diff --git a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp --- a/src/cpu/ppc/vm/templateInterpreter_ppc.cpp +++ b/src/cpu/ppc/vm/templateInterpreter_ppc.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, 2014 SAP AG. All rights reserved. + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -264,11 +264,11 @@ __ cmpdi(CCR0, Rmdo, 0); __ beq(CCR0, no_mdo); - // Increment invocation counter in the MDO. - const int mdo_ic_offs = in_bytes(MethodData::invocation_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); - __ lwz(Rscratch2, mdo_ic_offs, Rmdo); + // Increment backedge counter in the MDO. 
+ const int mdo_bc_offs = in_bytes(MethodData::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); + __ lwz(Rscratch2, mdo_bc_offs, Rmdo); __ addi(Rscratch2, Rscratch2, increment); - __ stw(Rscratch2, mdo_ic_offs, Rmdo); + __ stw(Rscratch2, mdo_bc_offs, Rmdo); __ load_const_optimized(Rscratch1, mask, R0); __ and_(Rscratch1, Rscratch2, Rscratch1); __ bne(CCR0, done); @@ -276,12 +276,12 @@ } // Increment counter in MethodCounters*. - const int mo_ic_offs = in_bytes(MethodCounters::invocation_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); + const int mo_bc_offs = in_bytes(MethodCounters::backedge_counter_offset()) + in_bytes(InvocationCounter::counter_offset()); __ bind(no_mdo); __ get_method_counters(R19_method, R3_counters, done); - __ lwz(Rscratch2, mo_ic_offs, R3_counters); + __ lwz(Rscratch2, mo_bc_offs, R3_counters); __ addi(Rscratch2, Rscratch2, increment); - __ stw(Rscratch2, mo_ic_offs, R3_counters); + __ stw(Rscratch2, mo_bc_offs, R3_counters); __ load_const_optimized(Rscratch1, mask, R0); __ and_(Rscratch1, Rscratch2, Rscratch1); __ beq(CCR0, *overflow); @@ -611,12 +611,7 @@ // For others we can use a normal (native) entry. inline bool math_entry_available(AbstractInterpreter::MethodKind kind) { - // Provide math entry with debugging on demand. - // Note: Debugging changes which code will get executed: - // Debugging or disabled InlineIntrinsics: java method will get interpreted and performs a native call. - // Not debugging and enabled InlineIntrinics: processor instruction will get used. - // Result might differ slightly due to rounding etc. - if (!InlineIntrinsics && (!FLAG_IS_ERGO(InlineIntrinsics))) return false; // Generate a vanilla entry. 
+ if (!InlineIntrinsics) return false; return ((kind==Interpreter::java_lang_math_sqrt && VM_Version::has_fsqrt()) || (kind==Interpreter::java_lang_math_abs)); @@ -628,15 +623,8 @@ return Interpreter::entry_for_kind(Interpreter::zerolocals); } - Label Lslow_path; - const Register Rjvmti_mode = R11_scratch1; address entry = __ pc(); - // Provide math entry with debugging on demand. - __ lwz(Rjvmti_mode, thread_(interp_only_mode)); - __ cmpwi(CCR0, Rjvmti_mode, 0); - __ bne(CCR0, Lslow_path); // jvmti_mode!=0 - __ lfd(F1_RET, Interpreter::stackElementSize, R15_esp); // Pop c2i arguments (if any) off when we return. @@ -659,9 +647,6 @@ // And we're done. __ blr(); - // Provide slow path for JVMTI case. - __ bind(Lslow_path); - __ branch_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals), R12_scratch2); __ flush(); return entry; diff --git a/src/cpu/ppc/vm/templateInterpreter_ppc.hpp b/src/cpu/ppc/vm/templateInterpreter_ppc.hpp --- a/src/cpu/ppc/vm/templateInterpreter_ppc.hpp +++ b/src/cpu/ppc/vm/templateInterpreter_ppc.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved. - * Copyright 2013, 2014 SAP AG. All rights reserved. + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -34,7 +34,7 @@ // Run with +PrintInterpreter to get the VM to print out the size. // Max size with JVMTI - const static int InterpreterCodeSize = 210*K; + const static int InterpreterCodeSize = 230*K; #endif // CPU_PPC_VM_TEMPLATEINTERPRETER_PPC_HPP diff --git a/src/cpu/ppc/vm/templateTable_ppc_64.cpp b/src/cpu/ppc/vm/templateTable_ppc_64.cpp --- a/src/cpu/ppc/vm/templateTable_ppc_64.cpp +++ b/src/cpu/ppc/vm/templateTable_ppc_64.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved. * Copyright 2013, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -375,23 +375,22 @@ int index_size = wide ? sizeof(u2) : sizeof(u1); const Register Rscratch = R11_scratch1; - Label resolved; + Label is_null; // We are resolved if the resolved reference cache entry contains a // non-null object (CallSite, etc.) __ get_cache_index_at_bcp(Rscratch, 1, index_size); // Load index. - __ load_resolved_reference_at_index(R17_tos, Rscratch); - __ cmpdi(CCR0, R17_tos, 0); - __ bne(CCR0, resolved); + __ load_resolved_reference_at_index(R17_tos, Rscratch, &is_null); + __ verify_oop(R17_tos); + __ dispatch_epilog(atos, Bytecodes::length_for(bytecode())); + + __ bind(is_null); __ load_const_optimized(R3_ARG1, (int)bytecode()); address entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_ldc); // First time invocation - must resolve first. __ call_VM(R17_tos, entry, R3_ARG1); - - __ align(32, 12); - __ bind(resolved); __ verify_oop(R17_tos); } @@ -3795,9 +3794,9 @@ transition(atos, itos); Label Ldone, Lis_null, Lquicked, Lresolved; - Register Roffset = R5_ARG3, + Register Roffset = R6_ARG4, RobjKlass = R4_ARG2, - RspecifiedKlass = R6_ARG4, // Generate_ClassCastException_verbose_handler will expect the value in this register. 
+ RspecifiedKlass = R5_ARG3, Rcpool = R11_scratch1, Rtags = R12_scratch2; diff --git a/src/cpu/ppc/vm/vm_version_ppc.cpp b/src/cpu/ppc/vm/vm_version_ppc.cpp --- a/src/cpu/ppc/vm/vm_version_ppc.cpp +++ b/src/cpu/ppc/vm/vm_version_ppc.cpp @@ -32,12 +32,13 @@ #include "runtime/os.hpp" #include "runtime/stubCodeGenerator.hpp" #include "utilities/defaultStream.hpp" +#include "utilities/globalDefinitions.hpp" #include "vm_version_ppc.hpp" # include int VM_Version::_features = VM_Version::unknown_m; -int VM_Version::_measured_cache_line_size = 128; // default value +int VM_Version::_measured_cache_line_size = 32; // pessimistic init value const char* VM_Version::_features_str = ""; bool VM_Version::_is_determine_features_test_running = false; @@ -55,7 +56,9 @@ // If PowerArchitecturePPC64 hasn't been specified explicitly determine from features. if (FLAG_IS_DEFAULT(PowerArchitecturePPC64)) { - if (VM_Version::has_popcntw()) { + if (VM_Version::has_lqarx()) { + FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 8); + } else if (VM_Version::has_popcntw()) { FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 7); } else if (VM_Version::has_cmpb()) { FLAG_SET_ERGO(uintx, PowerArchitecturePPC64, 6); @@ -66,8 +69,14 @@ } } guarantee(PowerArchitecturePPC64 == 0 || PowerArchitecturePPC64 == 5 || - PowerArchitecturePPC64 == 6 || PowerArchitecturePPC64 == 7, - "PowerArchitecturePPC64 should be 0, 5, 6 or 7"); + PowerArchitecturePPC64 == 6 || PowerArchitecturePPC64 == 7 || + PowerArchitecturePPC64 == 8, + "PowerArchitecturePPC64 should be 0, 5, 6, 7, or 8"); + + // Power 8: Configure Data Stream Control Register. + if (PowerArchitecturePPC64 >= 8) { + config_dscr(); + } if (!UseSIGTRAP) { MSG(TrapBasedICMissChecks); @@ -97,7 +106,7 @@ // Create and print feature-string. char buf[(num_features+1) * 16]; // Max 16 chars per feature. jio_snprintf(buf, sizeof(buf), - "ppc64%s%s%s%s%s%s%s%s", + "ppc64%s%s%s%s%s%s%s%s%s%s%s%s", (has_fsqrt() ? " fsqrt" : ""), (has_isel() ? 
" isel" : ""), (has_lxarxeh() ? " lxarxeh" : ""), @@ -106,11 +115,17 @@ (has_popcntb() ? " popcntb" : ""), (has_popcntw() ? " popcntw" : ""), (has_fcfids() ? " fcfids" : ""), - (has_vand() ? " vand" : "") + (has_vand() ? " vand" : ""), + (has_lqarx() ? " lqarx" : ""), + (has_vcipher() ? " vcipher" : ""), + (has_vpmsumb() ? " vpmsumb" : ""), + (has_tcheck() ? " tcheck" : "") // Make sure number of %s matches num_features! ); _features_str = os::strdup(buf); - NOT_PRODUCT(if (Verbose) print_features();); + if (Verbose) { + print_features(); + } // PPC64 supports 8-byte compare-exchange operations (see // Atomic::cmpxchg and StubGenerator::generate_atomic_cmpxchg_ptr) @@ -171,6 +186,58 @@ FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); } + // Adjust RTM (Restricted Transactional Memory) flags. + if (!has_tcheck() && UseRTMLocking) { + // Can't continue because UseRTMLocking affects UseBiasedLocking flag + // setting during arguments processing. See use_biased_locking(). + // VM_Version_init() is executed after UseBiasedLocking is used + // in Thread::allocate(). + vm_exit_during_initialization("RTM instructions are not available on this CPU"); + } + + if (UseRTMLocking) { +#if INCLUDE_RTM_OPT + if (!UnlockExperimentalVMOptions) { + vm_exit_during_initialization("UseRTMLocking is only available as experimental option on this platform. " + "It must be enabled via -XX:+UnlockExperimentalVMOptions flag."); + } else { + warning("UseRTMLocking is only available as experimental option on this platform."); + } + if (!FLAG_IS_CMDLINE(UseRTMLocking)) { + // RTM locking should be used only for applications with + // high lock contention. For now we do not use it by default. 
+ vm_exit_during_initialization("UseRTMLocking flag should be only set on command line"); + } + if (!is_power_of_2(RTMTotalCountIncrRate)) { + warning("RTMTotalCountIncrRate must be a power of 2, resetting it to 64"); + FLAG_SET_DEFAULT(RTMTotalCountIncrRate, 64); + } + if (RTMAbortRatio < 0 || RTMAbortRatio > 100) { + warning("RTMAbortRatio must be in the range 0 to 100, resetting it to 50"); + FLAG_SET_DEFAULT(RTMAbortRatio, 50); + } + FLAG_SET_ERGO(bool, UseNewFastLockPPC64, false); // Does not implement TM. + guarantee(RTMSpinLoopCount > 0, "unsupported"); +#else + // Only C2 does RTM locking optimization. + // Can't continue because UseRTMLocking affects UseBiasedLocking flag + // setting during arguments processing. See use_biased_locking(). + vm_exit_during_initialization("RTM locking optimization is not supported in this VM"); +#endif + } else { // !UseRTMLocking + if (UseRTMForStackLocks) { + if (!FLAG_IS_DEFAULT(UseRTMForStackLocks)) { + warning("UseRTMForStackLocks flag should be off when UseRTMLocking flag is off"); + } + FLAG_SET_DEFAULT(UseRTMForStackLocks, false); + } + if (UseRTMDeopt) { + FLAG_SET_DEFAULT(UseRTMDeopt, false); + } + if (PrintPreciseRTMLockingStatistics) { + FLAG_SET_DEFAULT(PrintPreciseRTMLockingStatistics, false); + } + } // This machine does not allow unaligned memory accesses if (UseUnalignedAccesses) { @@ -180,6 +247,27 @@ } } +bool VM_Version::use_biased_locking() { +#if INCLUDE_RTM_OPT + // RTM locking is most useful when there is high lock contention and + // low data contention. With high lock contention the lock is usually + // inflated and biased locking is not suitable for that case. + // RTM locking code requires that biased locking is off. + // Note: we can't switch off UseBiasedLocking in get_processor_features() + // because it is used by Thread::allocate() which is called before + // VM_Version::initialize(). 
+ if (UseRTMLocking && UseBiasedLocking) { + if (FLAG_IS_DEFAULT(UseBiasedLocking)) { + FLAG_SET_DEFAULT(UseBiasedLocking, false); + } else { + warning("Biased locking is not supported with RTM locking; ignoring UseBiasedLocking flag." ); + UseBiasedLocking = false; + } + } +#endif + return UseBiasedLocking; +} + void VM_Version::print_features() { tty->print_cr("Version: %s cache_line_size = %d", cpu_features(), (int) get_cache_line_size()); } @@ -443,16 +531,19 @@ // Don't use R0 in ldarx. // Keep R3_ARG1 unmodified, it contains &field (see below). // Keep R4_ARG2 unmodified, it contains offset = 0 (see below). - a->fsqrt(F3, F4); // code[0] -> fsqrt_m - a->fsqrts(F3, F4); // code[1] -> fsqrts_m - a->isel(R7, R5, R6, 0); // code[2] -> isel_m - a->ldarx_unchecked(R7, R3_ARG1, R4_ARG2, 1); // code[3] -> lxarx_m - a->cmpb(R7, R5, R6); // code[4] -> bcmp - //a->mftgpr(R7, F3); // code[5] -> mftgpr - a->popcntb(R7, R5); // code[6] -> popcntb - a->popcntw(R7, R5); // code[7] -> popcntw - a->fcfids(F3, F4); // code[8] -> fcfids - a->vand(VR0, VR0, VR0); // code[9] -> vand + a->fsqrt(F3, F4); // code[0] -> fsqrt_m + a->fsqrts(F3, F4); // code[1] -> fsqrts_m + a->isel(R7, R5, R6, 0); // code[2] -> isel_m + a->ldarx_unchecked(R7, R3_ARG1, R4_ARG2, 1); // code[3] -> lxarx_m + a->cmpb(R7, R5, R6); // code[4] -> cmpb + a->popcntb(R7, R5); // code[5] -> popcntb + a->popcntw(R7, R5); // code[6] -> popcntw + a->fcfids(F3, F4); // code[7] -> fcfids + a->vand(VR0, VR0, VR0); // code[8] -> vand + a->lqarx_unchecked(R7, R3_ARG1, R4_ARG2, 1); // code[9] -> lqarx_m + a->vcipher(VR0, VR1, VR2); // code[10] -> vcipher + a->vpmsumb(VR0, VR1, VR2); // code[11] -> vpmsumb + a->tcheck(0); // code[12] -> tcheck a->blr(); // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it. 
@@ -491,11 +582,14 @@ if (code[feature_cntr++]) features |= isel_m; if (code[feature_cntr++]) features |= lxarxeh_m; if (code[feature_cntr++]) features |= cmpb_m; - //if(code[feature_cntr++])features |= mftgpr_m; if (code[feature_cntr++]) features |= popcntb_m; if (code[feature_cntr++]) features |= popcntw_m; if (code[feature_cntr++]) features |= fcfids_m; if (code[feature_cntr++]) features |= vand_m; + if (code[feature_cntr++]) features |= lqarx_m; + if (code[feature_cntr++]) features |= vcipher_m; + if (code[feature_cntr++]) features |= vpmsumb_m; + if (code[feature_cntr++]) features |= tcheck_m; // Print the detection code. if (PrintAssembly) { @@ -507,6 +601,69 @@ _features = features; } +// Power 8: Configure Data Stream Control Register. +void VM_Version::config_dscr() { + assert(has_tcheck(), "Only execute on Power 8 or later!"); + + // 7 InstWords for each call (function descriptor + blr instruction). + const int code_size = (2+2*7)*BytesPerInstWord; + + // Allocate space for the code. + ResourceMark rm; + CodeBuffer cb("config_dscr", code_size, 0); + MacroAssembler* a = new MacroAssembler(&cb); + + // Emit code. + uint64_t (*get_dscr)() = (uint64_t(*)())(void *)a->emit_fd(); + uint32_t *code = (uint32_t *)a->pc(); + a->mfdscr(R3); + a->blr(); + + void (*set_dscr)(long) = (void(*)(long))(void *)a->emit_fd(); + a->mtdscr(R3); + a->blr(); + + uint32_t *code_end = (uint32_t *)a->pc(); + a->flush(); + + // Print the detection code. + if (PrintAssembly) { + ttyLocker ttyl; + tty->print_cr("Decoding dscr configuration stub at " INTPTR_FORMAT " before execution:", code); + Disassembler::decode((u_char*)code, (u_char*)code_end, tty); + } + + // Apply the configuration if needed. 
+ uint64_t dscr_val = (*get_dscr)(); + if (Verbose) { + tty->print_cr("dscr value was 0x%lx" , dscr_val); + } + bool change_requested = false; + if (DSCR_PPC64 != (uintx)-1) { + dscr_val = DSCR_PPC64; + change_requested = true; + } + if (DSCR_DPFD_PPC64 <= 7) { + uint64_t mask = 0x7; + if ((dscr_val & mask) != DSCR_DPFD_PPC64) { + dscr_val = (dscr_val & ~mask) | (DSCR_DPFD_PPC64); + change_requested = true; + } + } + if (DSCR_URG_PPC64 <= 7) { + uint64_t mask = 0x7 << 6; + if ((dscr_val & mask) != DSCR_URG_PPC64 << 6) { + dscr_val = (dscr_val & ~mask) | (DSCR_URG_PPC64 << 6); + change_requested = true; + } + } + if (change_requested) { + (*set_dscr)(dscr_val); + if (Verbose) { + tty->print_cr("dscr was set to 0x%lx" , (*get_dscr)()); + } + } +} static int saved_features = 0; diff --git a/src/cpu/ppc/vm/vm_version_ppc.hpp b/src/cpu/ppc/vm/vm_version_ppc.hpp --- a/src/cpu/ppc/vm/vm_version_ppc.hpp +++ b/src/cpu/ppc/vm/vm_version_ppc.hpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. - * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* * This code is free software; you can redistribute it and/or modify it @@ -41,7 +41,10 @@ popcntw, fcfids, vand, - dcba, + lqarx, + vcipher, + vpmsumb, + tcheck, num_features // last entry to count features }; enum Feature_Flag_Set { @@ -55,7 +58,10 @@ popcntw_m = (1 << popcntw), fcfids_m = (1 << fcfids ), vand_m = (1 << vand ), - dcba_m = (1 << dcba ), + lqarx_m = (1 << lqarx ), + vcipher_m = (1 << vcipher), + vpmsumb_m = (1 << vpmsumb), + tcheck_m = (1 << tcheck ), all_features_m = -1 }; static int _features; @@ -65,12 +71,16 @@ static void print_features(); static void determine_features(); // also measures cache line size + static void config_dscr(); // Power 8: Configure Data Stream Control Register. static void determine_section_size(); static void power6_micro_bench(); public: // Initialization static void initialize(); + // Override Abstract_VM_Version implementation + static bool use_biased_locking(); + static bool is_determine_features_test_running() { return _is_determine_features_test_running; } // CPU instruction support static bool has_fsqrt() { return (_features & fsqrt_m) != 0; } @@ -82,7 +92,10 @@ static bool has_popcntw() { return (_features & popcntw_m) != 0; } static bool has_fcfids() { return (_features & fcfids_m) != 0; } static bool has_vand() { return (_features & vand_m) != 0; } - static bool has_dcba() { return (_features & dcba_m) != 0; } + static bool has_lqarx() { return (_features & lqarx_m) != 0; } + static bool has_vcipher() { return (_features & vcipher_m) != 0; } + static bool has_vpmsumb() { return (_features & vpmsumb_m) != 0; } + static bool has_tcheck() { return (_features & tcheck_m) != 0; } static const char* cpu_features() { return _features_str; } diff --git a/src/cpu/ppc/vm/vtableStubs_ppc_64.cpp b/src/cpu/ppc/vm/vtableStubs_ppc_64.cpp --- a/src/cpu/ppc/vm/vtableStubs_ppc_64.cpp +++ b/src/cpu/ppc/vm/vtableStubs_ppc_64.cpp @@ -1,6 +1,6 @@ /* - * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. 
- * Copyright 2012, 2014 SAP AG. All rights reserved. + * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012, 2015 SAP AG. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -24,7 +24,6 @@ */ #include "precompiled.hpp" -#include "asm/assembler.hpp" #include "asm/macroAssembler.inline.hpp" #include "code/vtableStubs.hpp" #include "interp_masm_ppc_64.hpp"