diff a/make/hotspot/gensrc/GensrcAdlc.gmk b/make/hotspot/gensrc/GensrcAdlc.gmk
--- a/make/hotspot/gensrc/GensrcAdlc.gmk
+++ b/make/hotspot/gensrc/GensrcAdlc.gmk
@@ -127,10 +127,16 @@
       $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU).ad \
       $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH).ad \
       $d/os_cpu/$(HOTSPOT_TARGET_OS)_$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_OS)_$(HOTSPOT_TARGET_CPU_ARCH).ad \
       )))
 
+  ifeq ($(HOTSPOT_TARGET_CPU_ARCH), aarch64)
+    AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \
+      $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/$(HOTSPOT_TARGET_CPU_ARCH)_sve.ad \
+      )))
+  endif
+
   ifeq ($(call check-jvm-feature, shenandoahgc), true)
     AD_SRC_FILES += $(call uniq, $(wildcard $(foreach d, $(AD_SRC_ROOTS), \
       $d/cpu/$(HOTSPOT_TARGET_CPU_ARCH)/gc/shenandoah/shenandoah_$(HOTSPOT_TARGET_CPU).ad \
       )))
   endif
diff a/src/hotspot/cpu/aarch64/aarch64-asmtest.py b/src/hotspot/cpu/aarch64/aarch64-asmtest.py
--- a/src/hotspot/cpu/aarch64/aarch64-asmtest.py
+++ b/src/hotspot/cpu/aarch64/aarch64-asmtest.py
@@ -66,10 +66,53 @@
         if (self.number == 31):
             return self.astr()
         else:
             return self.astr("r")
 
+class SVEVectorRegister(FloatRegister):
+    def __str__(self):
+        return self.astr("z")
+
+class SVEPRegister(Register):
+    def __str__(self):
+        return self.astr("p")
+
+    def generate(self):
+        self.number = random.randint(0, 15)
+        return self
+
+class SVEGoverningPRegister(Register):
+    def __str__(self):
+        return self.astr("p")
+    def generate(self):
+        self.number = random.randint(0, 7)
+        return self
+
+class RegVariant(object):
+    def __init__(self, low, high):
+        self.number = random.randint(low, high)
+
+    def astr(self):
+        nameMap = {
+             0: ".b",
+             1: ".h",
+             2: ".s",
+             3: ".d",
+             4: ".q"
+        }
+        return nameMap.get(self.number)
+
+    def cstr(self):
+        nameMap = {
+             0: "__ B",
+             1: "__ H",
+             2: "__ S",
+             3: "__ D",
+             4: "__ Q"
+        }
+        return nameMap.get(self.number)
+
 class FloatZero(Operand):
 
     def __str__(self):
         return "0.0"
@@ -80,11 +123,14 @@
 _modes = {'x' : GeneralRegister,
           'w' : GeneralRegister,
           's' : FloatRegister,
           'd' : FloatRegister,
-          'z' : FloatZero}
+          'z' : FloatZero,
+          'p' : SVEPRegister,
+          'P' : SVEGoverningPRegister,
+          'Z' : SVEVectorRegister}
 
     @classmethod
     def create(cls, mode):
         return OperandFactory._modes[mode]()
@@ -837,10 +883,104 @@
         formatStr = "%s%s" + ''.join([", %s" for i in range(1, self.numRegs)])
         return (formatStr
                 % tuple([Instruction.astr(self)] +
                     [(self.reg[i].astr(self.modes[i])) for i in range(self.numRegs)]))
 
+class SVEVectorOp(Instruction):
+    def __init__(self, args):
+        name = args[0]
+        regTypes = args[1]
+        regs = []
+        for c in regTypes:
+            regs.append(OperandFactory.create(c).generate())
+        self.reg = regs
+        self.numRegs = len(regs)
+        if regTypes[0] != "p" and regTypes[1] == 'P':
+            self._isPredicated = True
+            self._merge = "/m"
+        else:
+            self._isPredicated = False
+            self._merge =""
+
+        self._bitwiseop = False
+        if name[0] == 'f':
+            self._width = RegVariant(2, 3)
+        elif not self._isPredicated and (name == "and" or name == "eor" or name == "orr"):
+            self._width = RegVariant(3, 3)
+            self._bitwiseop = True
+        else:
+            self._width = RegVariant(0, 3)
+        if len(args) > 2:
+            self._dnm = args[2]
+        else:
+            self._dnm = None
+        Instruction.__init__(self, name)
+
+    def cstr(self):
+        formatStr = "%s%s" + ''.join([", %s" for i in range(0, self.numRegs)] + [");"])
+        if self._bitwiseop:
+            width = []
+            formatStr = "%s%s" + ''.join([", %s" for i in range(1, self.numRegs)] + [");"])
+        else:
+            width = [self._width.cstr()]
+        return (formatStr
+                % tuple(["__ sve_" + self._name +
"("] + + [str(self.reg[0])] + + width + + [str(self.reg[i]) for i in range(1, self.numRegs)])) + def astr(self): + formatStr = "%s%s" + ''.join([", %s" for i in range(1, self.numRegs)]) + if self._dnm == 'dn': + formatStr += ", %s" + dnReg = [str(self.reg[0]) + self._width.astr()] + else: + dnReg = [] + + if self._isPredicated: + restRegs = [str(self.reg[1]) + self._merge] + dnReg + [str(self.reg[i]) + self._width.astr() for i in range(2, self.numRegs)] + else: + restRegs = dnReg + [str(self.reg[i]) + self._width.astr() for i in range(1, self.numRegs)] + return (formatStr + % tuple([Instruction.astr(self)] + + [str(self.reg[0]) + self._width.astr()] + + restRegs)) + def generate(self): + return self + +class SVEReductionOp(Instruction): + def __init__(self, args): + name = args[0] + lowRegType = args[1] + self.reg = [] + Instruction.__init__(self, name) + self.reg.append(OperandFactory.create('s').generate()) + self.reg.append(OperandFactory.create('P').generate()) + self.reg.append(OperandFactory.create('Z').generate()) + self._width = RegVariant(lowRegType, 3) + def cstr(self): + return "__ sve_%s(%s, %s, %s, %s);" % (self.name(), + str(self.reg[0]), + self._width.cstr(), + str(self.reg[1]), + str(self.reg[2])) + def astr(self): + if self.name() == "uaddv": + dstRegName = "d" + str(self.reg[0].number) + else: + dstRegName = self._width.astr()[1] + str(self.reg[0].number) + formatStr = "%s %s, %s, %s" + if self.name() == "fadda": + formatStr += ", %s" + moreReg = [dstRegName] + else: + moreReg = [] + return formatStr % tuple([self.name()] + + [dstRegName] + + [str(self.reg[1])] + + moreReg + + [str(self.reg[2]) + self._width.astr()]) + class LdStSIMDOp(Instruction): def __init__(self, args): self._name, self.regnum, self.arrangement, self.addresskind = args def generate(self): @@ -1158,11 +1298,46 @@ ["dup", "__ dup(v0, __ T16B, zr);", "dup\tv0.16b, wzr"], ["mov", "__ mov(v1, __ T1D, 0, zr);", "mov\tv1.d[0], xzr"], ["mov", "__ mov(v1, __ T2S, 1, zr);", "mov\tv1.s[1], wzr"], ["mov", "__ mov(v1, __ T4H, 2, zr);", "mov\tv1.h[2], wzr"], ["mov", "__ mov(v1, __ T8B, 3, zr);", "mov\tv1.b[3], wzr"], - ["ld1", "__ ld1(v31, v0, __ T2D, Address(__ post(r1, r0)));", "ld1\t{v31.2d, v0.2d}, [x1], x0"]]) + ["ld1", "__ ld1(v31, v0, __ T2D, Address(__ post(r1, r0)));", "ld1\t{v31.2d, v0.2d}, [x1], x0"], + # SVE instructions + ["cpy", "__ sve_cpy(z0, __ S, p0, v1);", "mov\tz0.s, p0/m, s1"], + ["inc", "__ sve_inc(r0, __ S);", "incw\tx0"], + ["dec", "__ sve_dec(r1, __ H);", "dech\tx1"], + ["lsl", "__ sve_lsl(z0, __ B, z1, 7);", "lsl\tz0.b, z1.b, #7"], + ["lsl", "__ sve_lsl(z21, __ H, z1, 15);", "lsl\tz21.h, z1.h, #15"], + ["lsl", "__ sve_lsl(z0, __ S, z1, 31);", "lsl\tz0.s, z1.s, #31"], + ["lsl", "__ sve_lsl(z0, __ D, z1, 63);", "lsl\tz0.d, z1.d, #63"], + ["lsr", "__ sve_lsr(z0, __ B, z1, 7);", "lsr\tz0.b, z1.b, #7"], + ["asr", "__ sve_asr(z0, __ H, z11, 15);", "asr\tz0.h, z11.h, #15"], + ["lsr", "__ sve_lsr(z30, __ S, z1, 31);", "lsr\tz30.s, z1.s, #31"], + ["asr", "__ sve_asr(z0, __ D, z1, 63);", "asr\tz0.d, z1.d, #63"], + ["addvl", "__ sve_addvl(sp, r0, 31);", "addvl\tsp, x0, #31"], + ["addpl", "__ sve_addpl(r1, sp, -32);", "addpl\tx1, sp, -32"], + ["cntp", "__ sve_cntp(r8, __ B, p0, p1);", "cntp\tx8, p0, p1.b"], + ["dup", "__ sve_dup(z0, __ B, 127);", "dup\tz0.b, 127"], + ["dup", "__ sve_dup(z1, __ H, -128);", "dup\tz1.h, -128"], + ["dup", "__ sve_dup(z2, __ S, 32512);", "dup\tz2.s, 32512"], + ["dup", "__ sve_dup(z7, __ D, -32768);", "dup\tz7.d, -32768"], + ["ld1b", "__ sve_ld1b(z0, __ B, p0, 
Address(sp));", "ld1b\t{z0.b}, p0/z, [sp]"], + ["ld1h", "__ sve_ld1h(z10, __ H, p1, Address(sp, -8));", "ld1h\t{z10.h}, p1/z, [sp, #-8, MUL VL]"], + ["ld1w", "__ sve_ld1w(z20, __ S, p2, Address(r0, 7));", "ld1w\t{z20.s}, p2/z, [x0, #7, MUL VL]"], + ["ld1b", "__ sve_ld1b(z30, __ B, p3, Address(sp, r8));", "ld1b\t{z30.b}, p3/z, [sp, x8]"], + ["ld1w", "__ sve_ld1w(z0, __ S, p4, Address(sp, r28));", "ld1w\t{z0.s}, p4/z, [sp, x28, LSL #2]"], + ["ld1d", "__ sve_ld1d(z11, __ D, p5, Address(r0, r1));", "ld1d\t{z11.d}, p5/z, [x0, x1, LSL #3]"], + ["st1b", "__ sve_st1b(z22, __ B, p6, Address(sp));", "st1b\t{z22.b}, p6, [sp]"], + ["st1b", "__ sve_st1b(z31, __ B, p7, Address(sp, -8));", "st1b\t{z31.b}, p7, [sp, #-8, MUL VL]"], + ["st1w", "__ sve_st1w(z0, __ S, p1, Address(r0, 7));", "st1w\t{z0.s}, p1, [x0, #7, MUL VL]"], + ["st1b", "__ sve_st1b(z0, __ B, p2, Address(sp, r1));", "st1b\t{z0.b}, p2, [sp, x1]"], + ["st1h", "__ sve_st1h(z0, __ H, p3, Address(sp, r8));", "st1h\t{z0.h}, p3, [sp, x8, LSL #1]"], + ["st1d", "__ sve_st1d(z0, __ D, p4, Address(r0, r18));", "st1d\t{z0.d}, p4, [x0, x18, LSL #3]"], + ["ldr", "__ sve_ldr(z0, Address(sp));", "ldr\tz0, [sp]"], + ["ldr", "__ sve_ldr(z31, Address(sp, -256));", "ldr\tz31, [sp, #-256, MUL VL]"], + ["str", "__ sve_str(z8, Address(r8, 255));", "str\tz8, [x8, #255, MUL VL]"], +]) print "\n// FloatImmediateOp" for float in ("2.0", "2.125", "4.0", "4.25", "8.0", "8.5", "16.0", "17.0", "0.125", "0.1328125", "0.25", "0.265625", "0.5", "0.53125", "1.0", "1.0625", "-2.0", "-2.125", "-4.0", "-4.25", "-8.0", "-8.5", "-16.0", "-17.0", @@ -1183,20 +1358,63 @@ ["ldsmin", "ldsmin", size, suffix], ["ldsmax", "ldsmax", size, suffix], ["ldumin", "ldumin", size, suffix], ["ldumax", "ldumax", size, suffix]]); +generate(SVEVectorOp, [["add", "ZZZ"], + ["sub", "ZZZ"], + ["fadd", "ZZZ"], + ["fmul", "ZZZ"], + ["fsub", "ZZZ"], + ["abs", "ZPZ"], + ["add", "ZPZ", "dn"], + ["asr", "ZPZ", "dn"], + ["cnt", "ZPZ"], + ["lsl", "ZPZ", "dn"], + ["lsr", "ZPZ", "dn"], + ["mul", "ZPZ", "dn"], + ["neg", "ZPZ"], + ["not", "ZPZ"], + ["smax", "ZPZ", "dn"], + ["smin", "ZPZ", "dn"], + ["sub", "ZPZ", "dn"], + ["fabs", "ZPZ"], + ["fadd", "ZPZ", "dn"], + ["fdiv", "ZPZ", "dn"], + ["fmax", "ZPZ", "dn"], + ["fmin", "ZPZ", "dn"], + ["fmul", "ZPZ", "dn"], + ["fneg", "ZPZ"], + ["frintm", "ZPZ"], + ["frintn", "ZPZ"], + ["frintp", "ZPZ"], + ["fsqrt", "ZPZ"], + ["fsub", "ZPZ", "dn"], + ["fmla", "ZPZZ"], + ["fmls", "ZPZZ"], + ["fnmla", "ZPZZ"], + ["fnmls", "ZPZZ"], + ["mla", "ZPZZ"], + ["mls", "ZPZZ"], + ["and", "ZZZ"], + ["eor", "ZZZ"], + ["orr", "ZZZ"], + ]) + +generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0], + ["fminv", 2], ["fmaxv", 2], ["fadda", 2], ["uaddv", 0]]) + print "\n __ bind(forth);" outfile.write("forth:\n") outfile.close() import subprocess import sys -# compile for 8.1 and sha2 because of lse atomics and sha512 crypto extension. -subprocess.check_call([AARCH64_AS, "-march=armv8.1-a+sha2", "aarch64ops.s", "-o", "aarch64ops.o"]) +# compile for sve with 8.1 and sha2 because of lse atomics and sha512 crypto extension. 
+subprocess.check_call([AARCH64_AS, "-march=armv8.1-a+sha2+sve", "aarch64ops.s", "-o", "aarch64ops.o"]) print print "/*", sys.stdout.flush() subprocess.check_call([AARCH64_OBJDUMP, "-d", "aarch64ops.o"]) diff a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -1168,11 +1168,11 @@ P2, P3, P4, P5, P6, - P7, + // P7, non-allocatable, preserved with all elements preset to TRUE. P8, P9, P10, P11, P12, @@ -1189,11 +1189,11 @@ P2, P3, P4, P5, P6, - P7 + // P7, non-allocatable, preserved with all elements preset to TRUE. ); // Singleton class for condition codes reg_class int_flags(RFLAGS); @@ -1896,10 +1896,14 @@ __ clinit_barrier(rscratch2, rscratch1, &L_skip_barrier); __ far_jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); __ bind(L_skip_barrier); } + if (UseSVE > 0 && C->max_vector_size() >= 16) { + __ reinitialize_ptrue(); + } + int bangsize = C->output()->bang_size_in_bytes(); if (C->output()->need_stack_bang(bangsize) && UseStackBanging) __ generate_stack_overflow_check(bangsize); __ build_frame(framesize); @@ -2060,11 +2064,29 @@ int dst_offset = ra_->reg2offset(dst_lo); if (bottom_type()->isa_vect() != NULL) { uint ireg = ideal_reg(); if (ireg == Op_VecA && cbuf) { - Unimplemented(); + C2_MacroAssembler _masm(cbuf); + int sve_vector_reg_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); + if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { + // stack->stack + __ spill_copy_sve_vector_stack_to_stack(src_offset, dst_offset, + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) { + __ spill_sve_vector(as_FloatRegister(Matcher::_regEncode[src_lo]), ra_->reg2offset(dst_lo), + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) { + __ unspill_sve_vector(as_FloatRegister(Matcher::_regEncode[dst_lo]), ra_->reg2offset(src_lo), + sve_vector_reg_size_in_bytes); + } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) { + __ sve_orr(as_FloatRegister(Matcher::_regEncode[dst_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo]), + as_FloatRegister(Matcher::_regEncode[src_lo])); + } else { + ShouldNotReachHere(); + } } else if (cbuf) { assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector"); C2_MacroAssembler _masm(cbuf); assert((src_lo_rc != rc_int && dst_lo_rc != rc_int), "sanity"); if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) { @@ -2353,24 +2375,33 @@ } // Identify extra cases that we might want to provide match rules for vector nodes and // other intrinsics guarded with vector length (vlen) and element type (bt). const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { - if (!match_rule_supported(opcode)) { + if (!match_rule_supported(opcode) || !vector_size_supported(bt, vlen)) { return false; } - - // Special cases which require vector length - switch (opcode) { - case Op_MulAddVS2VI: { - if (vlen != 4) { + int bit_size = vlen * type2aelembytes(bt) * 8; + if (UseSVE == 0 && bit_size > 128) { + return false; + } + if (UseSVE > 0) { + return op_sve_supported(opcode); + } else { // NEON + // Special cases + switch (opcode) { + case Op_MulAddVS2VI: + if (bit_size < 128) { return false; } break; + case Op_MulVL: + return false; + default: + break; } } - return true; // Per default match rules are supported. 
} const bool Matcher::has_predicated_vectors(void) { return UseSVE > 0; @@ -2407,11 +2438,12 @@ return true; } // Vector width in bytes. const int Matcher::vector_width_in_bytes(BasicType bt) { - int size = MIN2(16, (int)MaxVectorSize); + // The MaxVectorSize should have been set by detecting SVE max vector register size. + int size = MIN2((UseSVE > 0) ? 256 : 16, (int)MaxVectorSize); // Minimum 2 values in vector if (size < 2*type2aelembytes(bt)) size = 0; // But never < 4 if (size < 4) size = 0; return size; @@ -3714,20 +3746,27 @@ } } if (call == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; + } else if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + // Only non uncommon_trap calls need to reinitialize ptrue. + if (uncommon_trap_request() == 0) { + __ reinitialize_ptrue(); + } } %} enc_class aarch64_enc_java_dynamic_call(method meth) %{ C2_MacroAssembler _masm(&cbuf); int method_index = resolved_method_index(cbuf); address call = __ ic_call((address)$meth$$method, method_index); if (call == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; + } else if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ reinitialize_ptrue(); } %} enc_class aarch64_enc_call_epilog() %{ C2_MacroAssembler _masm(&cbuf); @@ -3760,19 +3799,27 @@ __ stp(zr, rscratch2, Address(__ pre(sp, -2 * wordSize))); __ blr(rscratch1); __ bind(retaddr); __ add(sp, sp, 2 * wordSize); } + if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ reinitialize_ptrue(); + } %} enc_class aarch64_enc_rethrow() %{ C2_MacroAssembler _masm(&cbuf); __ far_jump(RuntimeAddress(OptoRuntime::rethrow_stub())); %} enc_class aarch64_enc_ret() %{ C2_MacroAssembler _masm(&cbuf); +#ifdef ASSERT + if (UseSVE > 0 && Compile::current()->max_vector_size() >= 16) { + __ verify_ptrue(); + } +#endif __ ret(lr); %} enc_class aarch64_enc_tail_call(iRegP jump_target) %{ C2_MacroAssembler _masm(&cbuf); @@ -4532,10 +4579,45 @@ op_cost(0); format %{ %} interface(CONST_INTER); %} +// 8 bit signed value. +operand immI8() +%{ + predicate(n->get_int() <= 127 && n->get_int() >= -128); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// 8 bit signed value (simm8), or #simm8 LSL 8. +operand immI8_shift8() +%{ + predicate((n->get_int() <= 127 && n->get_int() >= -128) || + (n->get_int() <= 32512 && n->get_int() >= -32768 && (n->get_int() & 0xff) == 0)); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// 8 bit signed value (simm8), or #simm8 LSL 8. 
+operand immL8_shift8() +%{ + predicate((n->get_long() <= 127 && n->get_long() >= -128) || + (n->get_long() <= 32512 && n->get_long() >= -32768 && (n->get_long() & 0xff) == 0)); + match(ConL); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + // 32 bit integer valid for add sub immediate operand immIAddSub() %{ predicate(Assembler::operand_valid_for_add_sub_immediate((int64_t)n->get_int())); match(ConI); @@ -16400,11 +16482,11 @@ %} // Load Vector (128 bits) instruct loadV16(vecX dst, vmem16 mem) %{ - predicate(n->as_LoadVector()->memory_size() == 16); + predicate(UseSVE == 0 && n->as_LoadVector()->memory_size() == 16); match(Set dst (LoadVector mem)); ins_cost(4 * INSN_COST); format %{ "ldrq $dst,$mem\t# vector (128 bits)" %} ins_encode( aarch64_enc_ldrvQ(dst, mem) ); ins_pipe(vload_reg_mem128); @@ -16456,11 +16538,11 @@ ins_pipe(vdup_reg_reg64); %} instruct replicate16B(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 16); + predicate(UseSVE == 0 && n->as_Vector()->length() == 16); match(Set dst (ReplicateB src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (16B)" %} ins_encode %{ __ dup(as_FloatRegister($dst$$reg), __ T16B, as_Register($src$$reg)); @@ -16481,11 +16563,11 @@ ins_pipe(vmovi_reg_imm64); %} instruct replicate16B_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 16); + predicate(UseSVE == 0 && n->as_Vector()->length() == 16); match(Set dst (ReplicateB con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(16B)" %} ins_encode %{ __ mov(as_FloatRegister($dst$$reg), __ T16B, $con$$constant & 0xff); @@ -16506,11 +16588,11 @@ ins_pipe(vdup_reg_reg64); %} instruct replicate8S(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 8); + predicate(UseSVE == 0 && n->as_Vector()->length() == 8); match(Set dst (ReplicateS src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (8S)" %} ins_encode %{ __ dup(as_FloatRegister($dst$$reg), __ T8H, as_Register($src$$reg)); @@ -16531,11 +16613,11 @@ ins_pipe(vmovi_reg_imm64); %} instruct replicate8S_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 8); + predicate(UseSVE == 0 && n->as_Vector()->length() == 8); match(Set dst (ReplicateS con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(8H)" %} ins_encode %{ __ mov(as_FloatRegister($dst$$reg), __ T8H, $con$$constant & 0xffff); @@ -16555,11 +16637,11 @@ ins_pipe(vdup_reg_reg64); %} instruct replicate4I(vecX dst, iRegIorL2I src) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateI src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (4I)" %} ins_encode %{ __ dup(as_FloatRegister($dst$$reg), __ T4S, as_Register($src$$reg)); @@ -16579,11 +16661,11 @@ ins_pipe(vmovi_reg_imm64); %} instruct replicate4I_imm(vecX dst, immI con) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateI con)); ins_cost(INSN_COST); format %{ "movi $dst, $con\t# vector(4I)" %} ins_encode %{ __ mov(as_FloatRegister($dst$$reg), __ T4S, $con$$constant); @@ -16591,11 +16673,11 @@ ins_pipe(vmovi_reg_imm128); %} instruct replicate2L(vecX dst, iRegL src) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateL src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (2L)" %} ins_encode %{ __ dup(as_FloatRegister($dst$$reg), __ T2D, as_Register($src$$reg)); @@ 
-16603,11 +16685,11 @@ ins_pipe(vdup_reg_reg128); %} instruct replicate2L_zero(vecX dst, immI0 zero) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateI zero)); ins_cost(INSN_COST); format %{ "movi $dst, $zero\t# vector(4I)" %} ins_encode %{ __ eor(as_FloatRegister($dst$$reg), __ T16B, @@ -16630,11 +16712,11 @@ ins_pipe(vdup_reg_freg64); %} instruct replicate4F(vecX dst, vRegF src) %{ - predicate(n->as_Vector()->length() == 4); + predicate(UseSVE == 0 && n->as_Vector()->length() == 4); match(Set dst (ReplicateF src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (4F)" %} ins_encode %{ __ dup(as_FloatRegister($dst$$reg), __ T4S, @@ -16643,11 +16725,11 @@ ins_pipe(vdup_reg_freg128); %} instruct replicate2D(vecX dst, vRegD src) %{ - predicate(n->as_Vector()->length() == 2); + predicate(UseSVE == 0 && n->as_Vector()->length() == 2); match(Set dst (ReplicateD src)); ins_cost(INSN_COST); format %{ "dup $dst, $src\t# vector (2D)" %} ins_encode %{ __ dup(as_FloatRegister($dst$$reg), __ T2D, diff a/src/hotspot/cpu/aarch64/aarch64_sve.ad b/src/hotspot/cpu/aarch64/aarch64_sve.ad --- /dev/null +++ b/src/hotspot/cpu/aarch64/aarch64_sve.ad @@ -0,0 +1,1637 @@ +// +// Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, Arm Ltd. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// +// + +// This file is automatically generated by running "m4 aarch64_sve_ad.m4". 
Do not edit ---- + +// AArch64 SVE Architecture Description File + + +// 4 bit signed offset -- for predicated load/store + +operand vmemA_immIOffset4() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_int(), 4, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +operand vmemA_immLOffset4() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_long(), 4, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(ConL); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + + +operand vmemA_indOffI4(iRegP reg, vmemA_immIOffset4 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xffffffff); + scale(0x0); + disp($off); + %} +%} + +operand vmemA_indOffL4(iRegP reg, vmemA_immLOffset4 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xffffffff); + scale(0x0); + disp($off); + %} +%} + +opclass vmemA(indirect, vmemA_indOffI4, vmemA_indOffL4); + +source_hpp %{ + bool op_sve_supported(int opcode); +%} + +source %{ + + static inline BasicType vector_element_basic_type(const MachNode* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) { + int def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + const TypeVect* vt = def->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + typedef void (C2_MacroAssembler::* sve_mem_insn_predicate)(FloatRegister Rt, Assembler::SIMD_RegVariant T, + PRegister Pg, const Address &adr); + + // Predicated load/store, with optional ptrue to all elements of given predicate register. + static void loadStoreA_predicate(C2_MacroAssembler masm, bool is_store, + FloatRegister reg, PRegister pg, BasicType bt, + int opcode, Register base, int index, int size, int disp) { + sve_mem_insn_predicate insn; + Assembler::SIMD_RegVariant type; + int esize = type2aelembytes(bt); + if (index == -1) { + assert(size == 0, "unsupported address mode: scale size = %d", size); + switch(esize) { + case 1: + insn = is_store ? &C2_MacroAssembler::sve_st1b : &C2_MacroAssembler::sve_ld1b; + type = Assembler::B; + break; + case 2: + insn = is_store ? &C2_MacroAssembler::sve_st1h : &C2_MacroAssembler::sve_ld1h; + type = Assembler::H; + break; + case 4: + insn = is_store ? &C2_MacroAssembler::sve_st1w : &C2_MacroAssembler::sve_ld1w; + type = Assembler::S; + break; + case 8: + insn = is_store ? 
&C2_MacroAssembler::sve_st1d : &C2_MacroAssembler::sve_ld1d; + type = Assembler::D; + break; + default: + assert(false, "unsupported"); + ShouldNotReachHere(); + } + (masm.*insn)(reg, type, pg, Address(base, disp / Matcher::scalable_vector_reg_size(T_BYTE))); + } else { + assert(false, "unimplemented"); + ShouldNotReachHere(); + } + } + + bool op_sve_supported(int opcode) { + switch (opcode) { + case Op_MulAddVS2VI: + // No multiply reduction instructions + case Op_MulReductionVD: + case Op_MulReductionVF: + case Op_MulReductionVI: + case Op_MulReductionVL: + // Others + case Op_Extract: + case Op_ExtractB: + case Op_ExtractC: + case Op_ExtractD: + case Op_ExtractF: + case Op_ExtractI: + case Op_ExtractL: + case Op_ExtractS: + case Op_ExtractUB: + return false; + default: + return true; + } + } + +%} + +definitions %{ + int_def SVE_COST (200, 200); +%} + + + + +// All SVE instructions + +// sve vector load/store + +// Use predicated vector load/store +instruct loadVA(vecA dst, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_LoadVector()->memory_size() >= 16); + match(Set dst (LoadVector mem)); + ins_cost(SVE_COST); + format %{ "sve_ldr $dst, $mem\t # vector (sve)" %} + ins_encode %{ + FloatRegister dst_reg = as_FloatRegister($dst$$reg); + loadStoreA_predicate(C2_MacroAssembler(&cbuf), false, dst_reg, ptrue, + vector_element_basic_type(this), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +instruct storeVA(vecA src, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_StoreVector()->memory_size() >= 16); + match(Set mem (StoreVector mem src)); + ins_cost(SVE_COST); + format %{ "sve_str $mem, $src\t # vector (sve)" %} + ins_encode %{ + FloatRegister src_reg = as_FloatRegister($src$$reg); + loadStoreA_predicate(C2_MacroAssembler(&cbuf), true, src_reg, ptrue, + vector_element_basic_type(this, $src), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + + +// sve abs + +instruct vabsAB(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16 && + n->bottom_type()->is_vect()->element_basic_type() == T_BYTE); + match(Set dst (AbsVB src)); + ins_cost(SVE_COST); + format %{ "sve_abs $dst, $src\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_abs(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vabsAS(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8 && + n->bottom_type()->is_vect()->element_basic_type() == T_SHORT); + match(Set dst (AbsVS src)); + ins_cost(SVE_COST); + format %{ "sve_abs $dst, $src\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_abs(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vabsAI(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4 && + n->bottom_type()->is_vect()->element_basic_type() == T_INT); + match(Set dst (AbsVI src)); + ins_cost(SVE_COST); + format %{ "sve_abs $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_abs(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vabsAL(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + n->bottom_type()->is_vect()->element_basic_type() == T_LONG); + match(Set dst (AbsVL src)); + ins_cost(SVE_COST); + format %{ "sve_abs $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ 
sve_abs(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vabsAF(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4 && + n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT); + match(Set dst (AbsVF src)); + ins_cost(SVE_COST); + format %{ "sve_fabs $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fabs(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vabsAD(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE); + match(Set dst (AbsVD src)); + ins_cost(SVE_COST); + format %{ "sve_fabs $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fabs(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve add + +instruct vaddAB(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (AddVB src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddAS(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (AddVS src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddAI(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (AddVI src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddAL(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (AddVL src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_add $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_add(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddAF(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (AddVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fadd $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fadd(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vaddAD(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (AddVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fadd $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fadd(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve and + +instruct vandA(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (AndV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_and $dst, 
$src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_and(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve or + +instruct vorA(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (OrV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_orr $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_orr(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve xor + +instruct vxorA(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (XorV src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_eor $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ sve_eor(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve float div + +instruct vdivAF(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (DivVF dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vdivAD(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (DivVD dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve max + +instruct vmaxAF(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4 && + n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT); + match(Set dst_src1 (MaxV dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmax $dst_src1, $dst_src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmax(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmaxAD(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE); + match(Set dst_src1 (MaxV dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmax $dst_src1, $dst_src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmax(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vminAF(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4 && + n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT); + match(Set dst_src1 (MinV dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmin $dst_src1, $dst_src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmin(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vminAD(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE); + match(Set dst_src1 (MinV dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmin $dst_src1, $dst_src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmin(as_FloatRegister($dst_src1$$reg), 
__ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve fmla + +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmlaAF(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF dst_src1 (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmlaAD(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD dst_src1 (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve fmls + +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmlsAF(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF dst_src1 (Binary (NegVF src2) src3))); + match(Set dst_src1 (FmaVF dst_src1 (Binary src2 (NegVF src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmlsAD(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD dst_src1 (Binary (NegVD src2) src3))); + match(Set dst_src1 (FmaVD dst_src1 (Binary src2 (NegVD src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve fnmla + +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmlaAF(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary (NegVF src2) src3))); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 (NegVF src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmlaAD(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary (NegVD src2) src3))); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 (NegVD src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + 
ins_pipe(pipe_slow); +%} + +// sve fnmls + +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmlsAF(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (FmaVF (NegVF dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmlsAD(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (FmaVD (NegVD dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve mla + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaAB(vecA dst_src1, vecA src2, vecA src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst_src1 (AddVB dst_src1 (MulVB src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ B, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaAS(vecA dst_src1, vecA src2, vecA src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (AddVS dst_src1 (MulVS src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaAI(vecA dst_src1, vecA src2, vecA src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (AddVI dst_src1 (MulVI src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaAL(vecA dst_src1, vecA src2, vecA src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (AddVL dst_src1 (MulVL src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve mls + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsAB(vecA dst_src1, vecA src2, vecA src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst_src1 (SubVB dst_src1 (MulVB src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ B, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsAS(vecA dst_src1, vecA src2, vecA src3) +%{ + 
predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (SubVS dst_src1 (MulVS src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsAI(vecA dst_src1, vecA src2, vecA src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (SubVI dst_src1 (MulVI src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsAL(vecA dst_src1, vecA src2, vecA src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (SubVL dst_src1 (MulVL src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%} + + +// sve mul + +instruct vmulAB(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst_src1 (MulVB dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ B, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulAS(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst_src1 (MulVS dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ H, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulAI(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst_src1 (MulVI dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulAL(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst_src1 (MulVL dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_mul $dst_src1, $dst_src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_mul(as_FloatRegister($dst_src1$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulAF(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (MulVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmul $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fmul(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vmulAD(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (MulVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fmul $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ 
sve_fmul(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve fneg + +instruct vnegAF(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (NegVF src)); + ins_cost(SVE_COST); + format %{ "sve_fneg $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fneg(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vnegAD(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (NegVD src)); + ins_cost(SVE_COST); + format %{ "sve_fneg $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fneg(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve popcount vector + +instruct vpopcountAI(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (PopCountVI src)); + format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %} + ins_encode %{ + __ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve add reduction + +instruct reduce_addAI(iRegINoSp dst, iRegIorL2I src1, vecA src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + (n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_INT)); + match(Set dst (AddReductionVI src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) (S)\n\t" + "umov $dst, $tmp, S, 0\n\t" + "addw $dst, $dst, $src1\t # add reduction S" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ S, 0); + __ addw($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addAL(iRegLNoSp dst, iRegL src1, vecA src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + (n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_LONG)); + match(Set dst (AddReductionVL src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) (D)\n\t" + "umov $dst, $tmp, D, 0\n\t" + "add $dst, $dst, $src1\t # add reduction D" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ D, 0); + __ add($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addAF(vRegF src1_dst, vecA src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst (AddReductionVF src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_addAD(vRegD src1_dst, vecA src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst (AddReductionVD src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ D, + ptrue, 
as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve max reduction + +instruct reduce_maxAF(vRegF dst, vRegF src1, vecA src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT && + n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set dst (MaxReductionV src1 src2)); + ins_cost(INSN_COST); + effect(TEMP_DEF dst); + format %{ "sve_fmaxv $dst, $src2 # vector (sve) (S)\n\t" + "fmaxs $dst, $dst, $src1\t # max reduction F" %} + ins_encode %{ + __ sve_fmaxv(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + __ fmaxs(as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg), as_FloatRegister($src1$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_maxAD(vRegD dst, vRegD src1, vecA src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE && + n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set dst (MaxReductionV src1 src2)); + ins_cost(INSN_COST); + effect(TEMP_DEF dst); + format %{ "sve_fmaxv $dst, $src2 # vector (sve) (S)\n\t" + "fmaxs $dst, $dst, $src1\t # max reduction D" %} + ins_encode %{ + __ sve_fmaxv(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + __ fmaxd(as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg), as_FloatRegister($src1$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve min reduction + +instruct reduce_minAF(vRegF dst, vRegF src1, vecA src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_FLOAT && + n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set dst (MinReductionV src1 src2)); + ins_cost(INSN_COST); + effect(TEMP_DEF dst); + format %{ "sve_fminv $dst, $src2 # vector (sve) (S)\n\t" + "fmins $dst, $dst, $src1\t # min reduction F" %} + ins_encode %{ + __ sve_fminv(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src2$$reg)); + __ fmins(as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg), as_FloatRegister($src1$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct reduce_minAD(vRegD dst, vRegD src1, vecA src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE && + n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set dst (MinReductionV src1 src2)); + ins_cost(INSN_COST); + effect(TEMP_DEF dst); + format %{ "sve_fminv $dst, $src2 # vector (sve) (S)\n\t" + "fmins $dst, $dst, $src1\t # min reduction D" %} + ins_encode %{ + __ sve_fminv(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src2$$reg)); + __ fmind(as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg), as_FloatRegister($src1$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve vector Math.rint, floor, ceil + +instruct vroundAD(vecA dst, vecA src, immI rmode) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE); + match(Set dst (RoundDoubleModeV src rmode)); + format %{ "sve_frint $dst, $src, $rmode\t# vector (sve) (D)" %} + ins_encode %{ + switch ($rmode$$constant) { + case RoundDoubleModeNode::rmode_rint: + __ sve_frintn(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + break; + case RoundDoubleModeNode::rmode_floor: + __ sve_frintm(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + break; + case RoundDoubleModeNode::rmode_ceil: + __ sve_frintp(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + 
break; + } + %} + ins_pipe(pipe_slow); +%} + +// sve replicate + +instruct replicateAB(vecA dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (ReplicateB src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateAS(vecA dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (ReplicateS src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateAI(vecA dst, iRegIorL2I src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateI src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateAL(vecA dst, iRegL src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateL src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + + +instruct replicateAB_imm8(vecA dst, immI8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (ReplicateB con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateAS_imm8(vecA dst, immI8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (ReplicateS con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateAI_imm8(vecA dst, immI8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateI con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateAL_imm8(vecA dst, immL8_shift8 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateL con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, $con$$constant); + %} + ins_pipe(pipe_slow); +%} + + +instruct replicateAF(vecA dst, vRegF src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (ReplicateF src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct replicateAD(vecA dst, vRegD src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (ReplicateD src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve shift + +instruct 
vasrAB(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (RShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrAS(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (RShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrAI(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (RShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrAL(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (RShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_asr(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslAB(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (LShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslAS(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (LShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslAI(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (LShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslAL(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (LShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_lsl(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrAB(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (URShiftVB dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (B)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ B, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrAS(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (URShiftVS dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (H)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), 
__ H, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrAI(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (URShiftVI dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrAL(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (URShiftVL dst shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $dst, $shift\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_lsr(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrAB_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (RShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) con = 7; + __ sve_asr(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrAS_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (RShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 16) con = 15; + __ sve_asr(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrAI_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (RShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_asr(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vasrAL_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (RShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_asr $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_asr(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrAB_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (URShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), 
as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrAS_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (URShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrAI_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (URShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlsrAL_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (URShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsr $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsr(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslAB_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (LShiftVB src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (B)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsl(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslAS_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (LShiftVS src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (H)" %} + ins_encode %{ + int con = (int)$shift$$constant; + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + } + __ sve_lsl(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslAI_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (LShiftVI src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (S)" %} + ins_encode %{ + int con = (int)$shift$$constant; + __ sve_lsl(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vlslAL_imm(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && 
n->as_Vector()->length() >= 2); + match(Set dst (LShiftVL src shift)); + ins_cost(SVE_COST); + format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (D)" %} + ins_encode %{ + int con = (int)$shift$$constant; + __ sve_lsl(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntAB(vecA dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16 && + (n->bottom_type()->is_vect()->element_basic_type() == T_BYTE)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (B)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ B, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntAS(vecA dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8 && + (n->bottom_type()->is_vect()->element_basic_type() == T_SHORT || + (n->bottom_type()->is_vect()->element_basic_type() == T_CHAR))); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (H)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ H, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntAI(vecA dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4 && + (n->bottom_type()->is_vect()->element_basic_type() == T_INT)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (S)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ S, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vshiftcntAL(vecA dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + (n->bottom_type()->is_vect()->element_basic_type() == T_LONG)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) (D)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ D, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve sqrt + +instruct vsqrtAF(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (SqrtVF src)); + ins_cost(SVE_COST); + format %{ "sve_fsqrt $dst, $src\t# vector (sve) (S)" %} + ins_encode %{ + __ sve_fsqrt(as_FloatRegister($dst$$reg), __ S, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsqrtAD(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= 16); + match(Set dst (SqrtVD src)); + ins_cost(SVE_COST); + format %{ "sve_fsqrt $dst, $src\t# vector (sve) (D)" %} + ins_encode %{ + __ sve_fsqrt(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +// sve sub + +instruct vsubAB(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 16); + match(Set dst (SubVB src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (B)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ B, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubAS(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 8); + match(Set dst (SubVS src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (H)" %} + ins_encode %{ + __ 
sve_sub(as_FloatRegister($dst$$reg), __ H, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubAI(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (SubVI src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubAL(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (SubVL src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_sub $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_sub(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubAF(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (SubVF src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fsub $dst, $src1, $src2\t # vector (sve) (S)" %} + ins_encode %{ + __ sve_fsub(as_FloatRegister($dst$$reg), __ S, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + +instruct vsubAD(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2); + match(Set dst (SubVD src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fsub $dst, $src1, $src2\t # vector (sve) (D)" %} + ins_encode %{ + __ sve_fsub(as_FloatRegister($dst$$reg), __ D, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%} + diff a/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 --- /dev/null +++ b/src/hotspot/cpu/aarch64/aarch64_sve_ad.m4 @@ -0,0 +1,767 @@ +// +// Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, Arm Ltd. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// +// + +dnl Generate the warning +// This file is automatically generated by running "m4 aarch64_sve_ad.m4". 
Do not edit ---- +dnl + +// AArch64 SVE Architecture Description File + +dnl +dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET($1, $2, $3 ) +dnl OPERAND_VMEMORYA_IMMEDIATE_OFFSET(imm_type_abbr, imm_type, imm_len) +define(`OPERAND_VMEMORYA_IMMEDIATE_OFFSET', ` +operand vmemA_imm$1Offset$3() +%{ + predicate(Address::offset_ok_for_sve_immed(n->get_$2(), $3, + Matcher::scalable_vector_reg_size(T_BYTE))); + match(Con$1); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%}') +dnl +// 4 bit signed offset -- for predicated load/store +OPERAND_VMEMORYA_IMMEDIATE_OFFSET(I, int, 4) +OPERAND_VMEMORYA_IMMEDIATE_OFFSET(L, long, 4) +dnl +dnl OPERAND_VMEMORYA_INDIRECT_OFFSET($1, $2 ) +dnl OPERAND_VMEMORYA_INDIRECT_OFFSET(imm_type_abbr, imm_len) +define(`OPERAND_VMEMORYA_INDIRECT_OFFSET', ` +operand vmemA_indOff$1$2(iRegP reg, vmemA_imm$1Offset$2 off) +%{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP reg off); + op_cost(0); + format %{ "[$reg, $off, MUL VL]" %} + interface(MEMORY_INTER) %{ + base($reg); + `index'(0xffffffff); + scale(0x0); + disp($off); + %} +%}') +dnl +OPERAND_VMEMORYA_INDIRECT_OFFSET(I, 4) +OPERAND_VMEMORYA_INDIRECT_OFFSET(L, 4) + +opclass vmemA(indirect, vmemA_indOffI4, vmemA_indOffL4); + +source_hpp %{ + bool op_sve_supported(int opcode); +%} + +source %{ + + static inline BasicType vector_element_basic_type(const MachNode* n) { + const TypeVect* vt = n->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) { + int def_idx = use->operand_index(opnd); + Node* def = use->in(def_idx); + const TypeVect* vt = def->bottom_type()->is_vect(); + return vt->element_basic_type(); + } + + typedef void (C2_MacroAssembler::* sve_mem_insn_predicate)(FloatRegister Rt, Assembler::SIMD_RegVariant T, + PRegister Pg, const Address &adr); + + // Predicated load/store, with optional ptrue to all elements of given predicate register. + static void loadStoreA_predicate(C2_MacroAssembler masm, bool is_store, + FloatRegister reg, PRegister pg, BasicType bt, + int opcode, Register base, int index, int size, int disp) { + sve_mem_insn_predicate insn; + Assembler::SIMD_RegVariant type; + int esize = type2aelembytes(bt); + if (index == -1) { + assert(size == 0, "unsupported address mode: scale size = %d", size); + switch(esize) { + case 1: + insn = is_store ? &C2_MacroAssembler::sve_st1b : &C2_MacroAssembler::sve_ld1b; + type = Assembler::B; + break; + case 2: + insn = is_store ? &C2_MacroAssembler::sve_st1h : &C2_MacroAssembler::sve_ld1h; + type = Assembler::H; + break; + case 4: + insn = is_store ? &C2_MacroAssembler::sve_st1w : &C2_MacroAssembler::sve_ld1w; + type = Assembler::S; + break; + case 8: + insn = is_store ? 
&C2_MacroAssembler::sve_st1d : &C2_MacroAssembler::sve_ld1d; + type = Assembler::D; + break; + default: + assert(false, "unsupported"); + ShouldNotReachHere(); + } + (masm.*insn)(reg, type, pg, Address(base, disp / Matcher::scalable_vector_reg_size(T_BYTE))); + } else { + assert(false, "unimplemented"); + ShouldNotReachHere(); + } + } + + bool op_sve_supported(int opcode) { + switch (opcode) { + case Op_MulAddVS2VI: + // No multiply reduction instructions + case Op_MulReductionVD: + case Op_MulReductionVF: + case Op_MulReductionVI: + case Op_MulReductionVL: + // Others + case Op_Extract: + case Op_ExtractB: + case Op_ExtractC: + case Op_ExtractD: + case Op_ExtractF: + case Op_ExtractI: + case Op_ExtractL: + case Op_ExtractS: + case Op_ExtractUB: + return false; + default: + return true; + } + } + +%} + +definitions %{ + int_def SVE_COST (200, 200); +%} + + +dnl +dnl ELEMENT_SHORT_CHART($1, $2) +dnl ELEMENT_SHORT_CHART(etype, node) +define(`ELEMENT_SHORT_CHAR',`ifelse(`$1', `T_SHORT', + `($2->bottom_type()->is_vect()->element_basic_type() == T_SHORT || + ($2->bottom_type()->is_vect()->element_basic_type() == T_CHAR))', + `($2->bottom_type()->is_vect()->element_basic_type() == $1)')') +dnl + +// All SVE instructions + +// sve vector load/store + +// Use predicated vector load/store +instruct loadVA(vecA dst, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_LoadVector()->memory_size() >= 16); + match(Set dst (LoadVector mem)); + ins_cost(SVE_COST); + format %{ "sve_ldr $dst, $mem\t # vector (sve)" %} + ins_encode %{ + FloatRegister dst_reg = as_FloatRegister($dst$$reg); + loadStoreA_predicate(C2_MacroAssembler(&cbuf), false, dst_reg, ptrue, + vector_element_basic_type(this), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +instruct storeVA(vecA src, vmemA mem) %{ + predicate(UseSVE > 0 && n->as_StoreVector()->memory_size() >= 16); + match(Set mem (StoreVector mem src)); + ins_cost(SVE_COST); + format %{ "sve_str $mem, $src\t # vector (sve)" %} + ins_encode %{ + FloatRegister src_reg = as_FloatRegister($src$$reg); + loadStoreA_predicate(C2_MacroAssembler(&cbuf), true, src_reg, ptrue, + vector_element_basic_type(this, $src), $mem->opcode(), + as_Register($mem$$base), $mem$$index, $mem$$scale, $mem$$disp); + %} + ins_pipe(pipe_slow); +%} + +dnl +dnl UNARY_OP_TRUE_PREDICATE_ETYPE($1, $2, $3, $4, $5, %6 ) +dnl UNARY_OP_TRUE_PREDICATE_ETYPE(insn_name, op_name, element_type, size, min_vec_len, insn) +define(`UNARY_OP_TRUE_PREDICATE_ETYPE', ` +instruct $1(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5 && + n->bottom_type()->is_vect()->element_basic_type() == $3); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "$6 $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ $6(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// sve abs +UNARY_OP_TRUE_PREDICATE_ETYPE(vabsAB, AbsVB, T_BYTE, B, 16, sve_abs) +UNARY_OP_TRUE_PREDICATE_ETYPE(vabsAS, AbsVS, T_SHORT, H, 8, sve_abs) +UNARY_OP_TRUE_PREDICATE_ETYPE(vabsAI, AbsVI, T_INT, S, 4, sve_abs) +UNARY_OP_TRUE_PREDICATE_ETYPE(vabsAL, AbsVL, T_LONG, D, 2, sve_abs) +UNARY_OP_TRUE_PREDICATE_ETYPE(vabsAF, AbsVF, T_FLOAT, S, 4, sve_fabs) +UNARY_OP_TRUE_PREDICATE_ETYPE(vabsAD, AbsVD, T_DOUBLE, D, 2, sve_fabs) +dnl +dnl BINARY_OP_UNPREDICATED($1, $2 $3, $4 $5 ) +dnl BINARY_OP_UNPREDICATED(insn_name, op_name, size, min_vec_len, insn) +define(`BINARY_OP_UNPREDICATED', ` +instruct $1(vecA dst, 
vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 src1 src2)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src1, $src2\t # vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst$$reg), __ $3, + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// sve add +BINARY_OP_UNPREDICATED(vaddAB, AddVB, B, 16, sve_add) +BINARY_OP_UNPREDICATED(vaddAS, AddVS, H, 8, sve_add) +BINARY_OP_UNPREDICATED(vaddAI, AddVI, S, 4, sve_add) +BINARY_OP_UNPREDICATED(vaddAL, AddVL, D, 2, sve_add) +BINARY_OP_UNPREDICATED(vaddAF, AddVF, S, 4, sve_fadd) +BINARY_OP_UNPREDICATED(vaddAD, AddVD, D, 2, sve_fadd) +dnl +dnl BINARY_OP_UNSIZED($1, $2, $3, $4 ) +dnl BINARY_OP_UNSIZED(insn_name, op_name, min_vec_len, insn) +define(`BINARY_OP_UNSIZED', ` +instruct $1(vecA dst, vecA src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= $3); + match(Set dst ($2 src1 src2)); + ins_cost(SVE_COST); + format %{ "$4 $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + __ $4(as_FloatRegister($dst$$reg), + as_FloatRegister($src1$$reg), + as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// sve and +BINARY_OP_UNSIZED(vandA, AndV, 16, sve_and) + +// sve or +BINARY_OP_UNSIZED(vorA, OrV, 16, sve_orr) + +// sve xor +BINARY_OP_UNSIZED(vxorA, XorV, 16, sve_eor) +dnl +dnl VDIVF($1, $2 , $3 ) +dnl VDIVF(name_suffix, size, min_vec_len) +define(`VDIVF', ` +instruct vdivA$1(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (DivV$1 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "sve_fdiv $dst_src1, $dst_src1, $src2\t# vector (sve) ($2)" %} + ins_encode %{ + __ sve_fdiv(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// sve float div +VDIVF(F, S, 4) +VDIVF(D, D, 2) + +dnl +dnl BINARY_OP_TRUE_PREDICATE_ETYPE($1, $2, $3, $4, $5, $6 ) +dnl BINARY_OP_TRUE_PREDICATE_ETYPE(insn_name, op_name, element_type, size, min_vec_len, insn) +define(`BINARY_OP_TRUE_PREDICATE_ETYPE', ` +instruct $1(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5 && + n->bottom_type()->is_vect()->element_basic_type() == $3); + match(Set dst_src1 ($2 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "$6 $dst_src1, $dst_src1, $src2\t # vector (sve) ($4)" %} + ins_encode %{ + __ $6(as_FloatRegister($dst_src1$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// sve max +BINARY_OP_TRUE_PREDICATE_ETYPE(vmaxAF, MaxV, T_FLOAT, S, 4, sve_fmax) +BINARY_OP_TRUE_PREDICATE_ETYPE(vmaxAD, MaxV, T_DOUBLE, D, 2, sve_fmax) +BINARY_OP_TRUE_PREDICATE_ETYPE(vminAF, MinV, T_FLOAT, S, 4, sve_fmin) +BINARY_OP_TRUE_PREDICATE_ETYPE(vminAD, MinV, T_DOUBLE, D, 2, sve_fmin) + +dnl +dnl VFMLA($1 $2 $3 ) +dnl VFMLA(name_suffix, size, min_vec_len) +define(`VFMLA', ` +// dst_src1 = dst_src1 + src2 * src3 +instruct vfmlaA$1(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fmla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// sve fmla +VFMLA(F, S, 4) +VFMLA(D, D, 2) + +dnl +dnl VFMLS($1 $2 $3 ) 
+dnl VFMLS(name_suffix, size, min_vec_len) +define(`VFMLS', ` +// dst_src1 = dst_src1 + -src2 * src3 +// dst_src1 = dst_src1 + src2 * -src3 +instruct vfmlsA$1(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary (NegV$1 src2) src3))); + match(Set dst_src1 (FmaV$1 dst_src1 (Binary src2 (NegV$1 src3)))); + ins_cost(SVE_COST); + format %{ "sve_fmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fmls(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// sve fmls +VFMLS(F, S, 4) +VFMLS(D, D, 2) + +dnl +dnl VFNMLA($1 $2 $3 ) +dnl VFNMLA(name_suffix, size, min_vec_len) +define(`VFNMLA', ` +// dst_src1 = -dst_src1 + -src2 * src3 +// dst_src1 = -dst_src1 + src2 * -src3 +instruct vfnmlaA$1(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary (NegV$1 src2) src3))); + match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 (NegV$1 src3)))); + ins_cost(SVE_COST); + format %{ "sve_fnmla $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fnmla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// sve fnmla +VFNMLA(F, S, 4) +VFNMLA(D, D, 2) + +dnl +dnl VFNMLS($1 $2 $3 ) +dnl VFNMLS(name_suffix, size, min_vec_len) +define(`VFNMLS', ` +// dst_src1 = -dst_src1 + src2 * src3 +instruct vfnmlsA$1(vecA dst_src1, vecA src2, vecA src3) %{ + predicate(UseFMA && UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (FmaV$1 (NegV$1 dst_src1) (Binary src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_fnmls $dst_src1, $src2, $src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_fnmls(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// sve fnmls +VFNMLS(F, S, 4) +VFNMLS(D, D, 2) + +dnl +dnl VMLA($1 $2 $3 ) +dnl VMLA(name_suffix, size, min_vec_len) +define(`VMLA', ` +// dst_src1 = dst_src1 + src2 * src3 +instruct vmlaA$1(vecA dst_src1, vecA src2, vecA src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (AddV$1 dst_src1 (MulV$1 src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mla $dst_src1, src2, src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_mla(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// sve mla +VMLA(B, B, 16) +VMLA(S, H, 8) +VMLA(I, S, 4) +VMLA(L, D, 2) + +dnl +dnl VMLS($1 $2 $3 ) +dnl VMLS(name_suffix, size, min_vec_len) +define(`VMLS', ` +// dst_src1 = dst_src1 - src2 * src3 +instruct vmlsA$1(vecA dst_src1, vecA src2, vecA src3) +%{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3); + match(Set dst_src1 (SubV$1 dst_src1 (MulV$1 src2 src3))); + ins_cost(SVE_COST); + format %{ "sve_mls $dst_src1, src2, src3\t # vector (sve) ($2)" %} + ins_encode %{ + __ sve_mls(as_FloatRegister($dst_src1$$reg), __ $2, + ptrue, as_FloatRegister($src2$$reg), as_FloatRegister($src3$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// sve mls +VMLS(B, B, 16) +VMLS(S, H, 8) +VMLS(I, S, 4) +VMLS(L, D, 2) + +dnl +dnl BINARY_OP_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl 
BINARY_OP_TRUE_PREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`BINARY_OP_TRUE_PREDICATE', ` +instruct $1(vecA dst_src1, vecA src2) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst_src1 ($2 dst_src1 src2)); + ins_cost(SVE_COST); + format %{ "$5 $dst_src1, $dst_src1, $src2\t # vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst_src1$$reg), __ $3, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// sve mul +BINARY_OP_TRUE_PREDICATE(vmulAB, MulVB, B, 16, sve_mul) +BINARY_OP_TRUE_PREDICATE(vmulAS, MulVS, H, 8, sve_mul) +BINARY_OP_TRUE_PREDICATE(vmulAI, MulVI, S, 4, sve_mul) +BINARY_OP_TRUE_PREDICATE(vmulAL, MulVL, D, 2, sve_mul) +BINARY_OP_UNPREDICATED(vmulAF, MulVF, S, 4, sve_fmul) +BINARY_OP_UNPREDICATED(vmulAD, MulVD, D, 2, sve_fmul) + +dnl +dnl UNARY_OP_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl UNARY_OP_TRUE_PREDICATE(insn_name, op_name, size, min_vec_bytes, insn) +define(`UNARY_OP_TRUE_PREDICATE', ` +instruct $1(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length_in_bytes() >= $4); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src\t# vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst$$reg), __ $3, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// sve fneg +UNARY_OP_TRUE_PREDICATE(vnegAF, NegVF, S, 16, sve_fneg) +UNARY_OP_TRUE_PREDICATE(vnegAD, NegVD, D, 16, sve_fneg) + +// sve popcount vector + +instruct vpopcountAI(vecA dst, vecA src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 4); + match(Set dst (PopCountVI src)); + format %{ "sve_cnt $dst, $src\t# vector (sve) (S)\n\t" %} + ins_encode %{ + __ sve_cnt(as_FloatRegister($dst$$reg), __ S, ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%} + +dnl +dnl REDUCE_ADD($1, $2, $3, $4, $5, $6, $7 ) +dnl REDUCE_ADD(insn_name, op_name, reg_dst, reg_src, size, elem_type, insn1) +define(`REDUCE_ADD', ` +instruct $1($3 dst, $4 src1, vecA src2, vRegD tmp) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16 && + ELEMENT_SHORT_CHAR($6, n->in(2))); + match(Set dst ($2 src1 src2)); + effect(TEMP_DEF dst, TEMP tmp); + ins_cost(SVE_COST); + format %{ "sve_uaddv $tmp, $src2\t# vector (sve) ($5)\n\t" + "umov $dst, $tmp, $5, 0\n\t" + "$7 $dst, $dst, $src1\t # add reduction $5" %} + ins_encode %{ + __ sve_uaddv(as_FloatRegister($tmp$$reg), __ $5, + ptrue, as_FloatRegister($src2$$reg)); + __ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ $5, 0); + __ $7($dst$$Register, $dst$$Register, $src1$$Register); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl REDUCE_ADDF($1, $2, $3, $4 ) +dnl REDUCE_ADDF(insn_name, op_name, reg_dst, size) +define(`REDUCE_ADDF', ` +instruct $1($3 src1_dst, vecA src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set src1_dst ($2 src1_dst src2)); + ins_cost(SVE_COST); + format %{ "sve_fadda $src1_dst, $src1_dst, $src2\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_fadda(as_FloatRegister($src1_dst$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// sve add reduction +REDUCE_ADD(reduce_addAI, AddReductionVI, iRegINoSp, iRegIorL2I, S, T_INT, addw) +REDUCE_ADD(reduce_addAL, AddReductionVL, iRegLNoSp, iRegL, D, T_LONG, add) +REDUCE_ADDF(reduce_addAF, AddReductionVF, vRegF, S) +REDUCE_ADDF(reduce_addAD, AddReductionVD, vRegD, D) + +dnl +dnl REDUCE_FMINMAX($1, $2, $3, $4, $5 ) +dnl 
REDUCE_FMINMAX(min_max, name_suffix, element_type, size, reg_src_dst) +define(`REDUCE_FMINMAX', ` +instruct reduce_$1A$2($5 dst, $5 src1, vecA src2) %{ + predicate(UseSVE > 0 && n->in(2)->bottom_type()->is_vect()->element_basic_type() == $3 && + n->in(2)->bottom_type()->is_vect()->length_in_bytes() >= 16); + match(Set dst (translit($1, `m', `M')ReductionV src1 src2)); + ins_cost(INSN_COST); + effect(TEMP_DEF dst); + format %{ "sve_f$1v $dst, $src2 # vector (sve) (S)\n\t" + "f$1s $dst, $dst, $src1\t # $1 reduction $2" %} + ins_encode %{ + __ sve_f$1v(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src2$$reg)); + __ f`$1'translit($4, `SD', `sd')(as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg), as_FloatRegister($src1$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +// sve max reduction +REDUCE_FMINMAX(max, F, T_FLOAT, S, vRegF) +REDUCE_FMINMAX(max, D, T_DOUBLE, D, vRegD) + +// sve min reduction +REDUCE_FMINMAX(min, F, T_FLOAT, S, vRegF) +REDUCE_FMINMAX(min, D, T_DOUBLE, D, vRegD) + +// sve vector Math.rint, floor, ceil + +instruct vroundAD(vecA dst, vecA src, immI rmode) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= 2 && + n->bottom_type()->is_vect()->element_basic_type() == T_DOUBLE); + match(Set dst (RoundDoubleModeV src rmode)); + format %{ "sve_frint $dst, $src, $rmode\t# vector (sve) (D)" %} + ins_encode %{ + switch ($rmode$$constant) { + case RoundDoubleModeNode::rmode_rint: + __ sve_frintn(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + break; + case RoundDoubleModeNode::rmode_floor: + __ sve_frintm(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + break; + case RoundDoubleModeNode::rmode_ceil: + __ sve_frintp(as_FloatRegister($dst$$reg), __ D, + ptrue, as_FloatRegister($src$$reg)); + break; + } + %} + ins_pipe(pipe_slow); +%} +dnl +dnl REPLICATE($1, $2, $3, $4, $5 ) +dnl REPLICATE(insn_name, op_name, reg_src, size, min_vec_len) +define(`REPLICATE', ` +instruct $1(vecA dst, $3 src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $4, as_Register($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl REPLICATE_IMM8($1, $2, $3, $4, $5 ) +dnl REPLICATE_IMM8(insn_name, op_name, imm_type, size, min_vec_len) +define(`REPLICATE_IMM8', ` +instruct $1(vecA dst, $3 con) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 con)); + ins_cost(SVE_COST); + format %{ "sve_dup $dst, $con\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $4, $con$$constant); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl FREPLICATE($1, $2, $3, $4, $5 ) +dnl FREPLICATE(insn_name, op_name, reg_src, size, min_vec_len) +define(`FREPLICATE', ` +instruct $1(vecA dst, $3 src) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $5); + match(Set dst ($2 src)); + ins_cost(SVE_COST); + format %{ "sve_cpy $dst, $src\t# vector (sve) ($4)" %} + ins_encode %{ + __ sve_cpy(as_FloatRegister($dst$$reg), __ $4, + ptrue, as_FloatRegister($src$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// sve replicate +REPLICATE(replicateAB, ReplicateB, iRegIorL2I, B, 16) +REPLICATE(replicateAS, ReplicateS, iRegIorL2I, H, 8) +REPLICATE(replicateAI, ReplicateI, iRegIorL2I, S, 4) +REPLICATE(replicateAL, ReplicateL, iRegL, D, 2) + +REPLICATE_IMM8(replicateAB_imm8, ReplicateB, immI8, B, 16) 
+REPLICATE_IMM8(replicateAS_imm8, ReplicateS, immI8_shift8, H, 8) +REPLICATE_IMM8(replicateAI_imm8, ReplicateI, immI8_shift8, S, 4) +REPLICATE_IMM8(replicateAL_imm8, ReplicateL, immL8_shift8, D, 2) + +FREPLICATE(replicateAF, ReplicateF, vRegF, S, 4) +FREPLICATE(replicateAD, ReplicateD, vRegD, D, 2) +dnl +dnl VSHIFT_TRUE_PREDICATE($1, $2, $3, $4, $5 ) +dnl VSHIFT_TRUE_PREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`VSHIFT_TRUE_PREDICATE', ` +instruct $1(vecA dst, vecA shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 dst shift)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $dst, $shift\t# vector (sve) ($3)" %} + ins_encode %{ + __ $5(as_FloatRegister($dst$$reg), __ $3, + ptrue, as_FloatRegister($shift$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VSHIFT_IMM_UNPREDICATE($1, $2, $3, $4, $5 ) +dnl VSHIFT_IMM_UNPREDICATE(insn_name, op_name, size, min_vec_len, insn) +define(`VSHIFT_IMM_UNPREDICATE', ` +instruct $1(vecA dst, vecA src, immI shift) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $4); + match(Set dst ($2 src shift)); + ins_cost(SVE_COST); + format %{ "$5 $dst, $src, $shift\t# vector (sve) ($3)" %} + ins_encode %{ + int con = (int)$shift$$constant;dnl +ifelse(eval(index(`$1', `vasr') == 0 || index(`$1', `vlsr') == 0), 1, ` + if (con == 0) { + __ sve_orr(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + }')dnl +ifelse(eval(index(`$1', `vasr') == 0), 1, `ifelse(eval(index(`$3', `B') == 0), 1, ` + if (con >= 8) con = 7;')ifelse(eval(index(`$3', `H') == 0), 1, ` + if (con >= 16) con = 15;')')dnl +ifelse(eval((index(`$1', `vlsl') == 0 || index(`$1', `vlsr') == 0) && (index(`$3', `B') == 0 || index(`$3', `H') == 0)), 1, ` + if (con >= 8) { + __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), + as_FloatRegister($src$$reg)); + return; + }') + __ $5(as_FloatRegister($dst$$reg), __ $3, + as_FloatRegister($src$$reg), con); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +dnl VSHIFT_COUNT($1, $2, $3, $4 ) +dnl VSHIFT_COUNT(insn_name, size, min_vec_len, type) +define(`VSHIFT_COUNT', ` +instruct $1(vecA dst, iRegIorL2I cnt) %{ + predicate(UseSVE > 0 && n->as_Vector()->length() >= $3 && + ELEMENT_SHORT_CHAR($4, n)); + match(Set dst (LShiftCntV cnt)); + match(Set dst (RShiftCntV cnt)); + format %{ "sve_dup $dst, $cnt\t# vector shift count (sve) ($2)" %} + ins_encode %{ + __ sve_dup(as_FloatRegister($dst$$reg), __ $2, as_Register($cnt$$reg)); + %} + ins_pipe(pipe_slow); +%}')dnl + +// sve shift +VSHIFT_TRUE_PREDICATE(vasrAB, RShiftVB, B, 16, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrAS, RShiftVS, H, 8, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrAI, RShiftVI, S, 4, sve_asr) +VSHIFT_TRUE_PREDICATE(vasrAL, RShiftVL, D, 2, sve_asr) +VSHIFT_TRUE_PREDICATE(vlslAB, LShiftVB, B, 16, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslAS, LShiftVS, H, 8, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslAI, LShiftVI, S, 4, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlslAL, LShiftVL, D, 2, sve_lsl) +VSHIFT_TRUE_PREDICATE(vlsrAB, URShiftVB, B, 16, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrAS, URShiftVS, H, 8, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrAI, URShiftVI, S, 4, sve_lsr) +VSHIFT_TRUE_PREDICATE(vlsrAL, URShiftVL, D, 2, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vasrAB_imm, RShiftVB, B, 16, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrAS_imm, RShiftVS, H, 8, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrAI_imm, RShiftVI, S, 4, sve_asr) +VSHIFT_IMM_UNPREDICATE(vasrAL_imm, RShiftVL, D, 2, sve_asr) +VSHIFT_IMM_UNPREDICATE(vlsrAB_imm, URShiftVB, B, 16, sve_lsr) 
+VSHIFT_IMM_UNPREDICATE(vlsrAS_imm, URShiftVS, H, 8, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrAI_imm, URShiftVI, S, 4, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlsrAL_imm, URShiftVL, D, 2, sve_lsr) +VSHIFT_IMM_UNPREDICATE(vlslAB_imm, LShiftVB, B, 16, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslAS_imm, LShiftVS, H, 8, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslAI_imm, LShiftVI, S, 4, sve_lsl) +VSHIFT_IMM_UNPREDICATE(vlslAL_imm, LShiftVL, D, 2, sve_lsl) +VSHIFT_COUNT(vshiftcntAB, B, 16, T_BYTE) +VSHIFT_COUNT(vshiftcntAS, H, 8, T_SHORT) +VSHIFT_COUNT(vshiftcntAI, S, 4, T_INT) +VSHIFT_COUNT(vshiftcntAL, D, 2, T_LONG) + +// sve sqrt +UNARY_OP_TRUE_PREDICATE(vsqrtAF, SqrtVF, S, 16, sve_fsqrt) +UNARY_OP_TRUE_PREDICATE(vsqrtAD, SqrtVD, D, 16, sve_fsqrt) + +// sve sub +BINARY_OP_UNPREDICATED(vsubAB, SubVB, B, 16, sve_sub) +BINARY_OP_UNPREDICATED(vsubAS, SubVS, H, 8, sve_sub) +BINARY_OP_UNPREDICATED(vsubAI, SubVI, S, 4, sve_sub) +BINARY_OP_UNPREDICATED(vsubAL, SubVL, D, 2, sve_sub) +BINARY_OP_UNPREDICATED(vsubAF, SubVF, S, 4, sve_fsub) +BINARY_OP_UNPREDICATED(vsubAD, SubVD, D, 2, sve_fsub) + diff a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp --- a/src/hotspot/cpu/aarch64/assembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.cpp @@ -634,10 +634,43 @@ __ mov(v1, __ T1D, 0, zr); // mov v1.d[0], xzr __ mov(v1, __ T2S, 1, zr); // mov v1.s[1], wzr __ mov(v1, __ T4H, 2, zr); // mov v1.h[2], wzr __ mov(v1, __ T8B, 3, zr); // mov v1.b[3], wzr __ ld1(v31, v0, __ T2D, Address(__ post(r1, r0))); // ld1 {v31.2d, v0.2d}, [x1], x0 + __ sve_cpy(z0, __ S, p0, v1); // mov z0.s, p0/m, s1 + __ sve_inc(r0, __ S); // incw x0 + __ sve_dec(r1, __ H); // dech x1 + __ sve_lsl(z0, __ B, z1, 7); // lsl z0.b, z1.b, #7 + __ sve_lsl(z21, __ H, z1, 15); // lsl z21.h, z1.h, #15 + __ sve_lsl(z0, __ S, z1, 31); // lsl z0.s, z1.s, #31 + __ sve_lsl(z0, __ D, z1, 63); // lsl z0.d, z1.d, #63 + __ sve_lsr(z0, __ B, z1, 7); // lsr z0.b, z1.b, #7 + __ sve_asr(z0, __ H, z11, 15); // asr z0.h, z11.h, #15 + __ sve_lsr(z30, __ S, z1, 31); // lsr z30.s, z1.s, #31 + __ sve_asr(z0, __ D, z1, 63); // asr z0.d, z1.d, #63 + __ sve_addvl(sp, r0, 31); // addvl sp, x0, #31 + __ sve_addpl(r1, sp, -32); // addpl x1, sp, -32 + __ sve_cntp(r8, __ B, p0, p1); // cntp x8, p0, p1.b + __ sve_dup(z0, __ B, 127); // dup z0.b, 127 + __ sve_dup(z1, __ H, -128); // dup z1.h, -128 + __ sve_dup(z2, __ S, 32512); // dup z2.s, 32512 + __ sve_dup(z7, __ D, -32768); // dup z7.d, -32768 + __ sve_ld1b(z0, __ B, p0, Address(sp)); // ld1b {z0.b}, p0/z, [sp] + __ sve_ld1h(z10, __ H, p1, Address(sp, -8)); // ld1h {z10.h}, p1/z, [sp, #-8, MUL VL] + __ sve_ld1w(z20, __ S, p2, Address(r0, 7)); // ld1w {z20.s}, p2/z, [x0, #7, MUL VL] + __ sve_ld1b(z30, __ B, p3, Address(sp, r8)); // ld1b {z30.b}, p3/z, [sp, x8] + __ sve_ld1w(z0, __ S, p4, Address(sp, r28)); // ld1w {z0.s}, p4/z, [sp, x28, LSL #2] + __ sve_ld1d(z11, __ D, p5, Address(r0, r1)); // ld1d {z11.d}, p5/z, [x0, x1, LSL #3] + __ sve_st1b(z22, __ B, p6, Address(sp)); // st1b {z22.b}, p6, [sp] + __ sve_st1b(z31, __ B, p7, Address(sp, -8)); // st1b {z31.b}, p7, [sp, #-8, MUL VL] + __ sve_st1w(z0, __ S, p1, Address(r0, 7)); // st1w {z0.s}, p1, [x0, #7, MUL VL] + __ sve_st1b(z0, __ B, p2, Address(sp, r1)); // st1b {z0.b}, p2, [sp, x1] + __ sve_st1h(z0, __ H, p3, Address(sp, r8)); // st1h {z0.h}, p3, [sp, x8, LSL #1] + __ sve_st1d(z0, __ D, p4, Address(r0, r18)); // st1d {z0.d}, p4, [x0, x18, LSL #3] + __ sve_ldr(z0, Address(sp)); // ldr z0, [sp] + __ sve_ldr(z31, Address(sp, -256)); // 
ldr z31, [sp, #-256, MUL VL] + __ sve_str(z8, Address(r8, 255)); // str z8, [x8, #255, MUL VL] // FloatImmediateOp __ fmovd(v0, 2.0); // fmov d0, #2.0 __ fmovd(v0, 2.125); // fmov d0, #2.125 __ fmovd(v0, 4.0); // fmov d0, #4.0 @@ -757,10 +790,61 @@ __ ldsminl(Assembler::word, r10, r15, r17); // ldsminl w10, w15, [x17] __ ldsmaxl(Assembler::word, r2, r10, r12); // ldsmaxl w2, w10, [x12] __ lduminl(Assembler::word, r12, r15, r13); // lduminl w12, w15, [x13] __ ldumaxl(Assembler::word, r2, r7, r20); // ldumaxl w2, w7, [x20] +// SVEVectorOp + __ sve_add(z25, __ B, z15, z4); // add z25.b, z15.b, z4.b + __ sve_sub(z4, __ S, z11, z17); // sub z4.s, z11.s, z17.s + __ sve_fadd(z16, __ D, z17, z10); // fadd z16.d, z17.d, z10.d + __ sve_fmul(z22, __ D, z12, z25); // fmul z22.d, z12.d, z25.d + __ sve_fsub(z28, __ D, z14, z10); // fsub z28.d, z14.d, z10.d + __ sve_abs(z1, __ H, p3, z30); // abs z1.h, p3/m, z30.h + __ sve_add(z15, __ B, p1, z2); // add z15.b, p1/m, z15.b, z2.b + __ sve_asr(z13, __ S, p4, z16); // asr z13.s, p4/m, z13.s, z16.s + __ sve_cnt(z3, __ D, p0, z11); // cnt z3.d, p0/m, z11.d + __ sve_lsl(z5, __ D, p2, z14); // lsl z5.d, p2/m, z5.d, z14.d + __ sve_lsr(z29, __ B, p0, z20); // lsr z29.b, p0/m, z29.b, z20.b + __ sve_mul(z20, __ S, p5, z27); // mul z20.s, p5/m, z20.s, z27.s + __ sve_neg(z26, __ B, p6, z4); // neg z26.b, p6/m, z4.b + __ sve_not(z22, __ B, p4, z30); // not z22.b, p4/m, z30.b + __ sve_smax(z11, __ H, p2, z27); // smax z11.h, p2/m, z11.h, z27.h + __ sve_smin(z28, __ S, p5, z30); // smin z28.s, p5/m, z28.s, z30.s + __ sve_sub(z30, __ S, p1, z13); // sub z30.s, p1/m, z30.s, z13.s + __ sve_fabs(z30, __ D, p4, z26); // fabs z30.d, p4/m, z26.d + __ sve_fadd(z15, __ S, p3, z11); // fadd z15.s, p3/m, z15.s, z11.s + __ sve_fdiv(z6, __ D, p7, z16); // fdiv z6.d, p7/m, z6.d, z16.d + __ sve_fmax(z27, __ S, p7, z7); // fmax z27.s, p7/m, z27.s, z7.s + __ sve_fmin(z19, __ D, p2, z4); // fmin z19.d, p2/m, z19.d, z4.d + __ sve_fmul(z17, __ S, p4, z22); // fmul z17.s, p4/m, z17.s, z22.s + __ sve_fneg(z28, __ D, p3, z21); // fneg z28.d, p3/m, z21.d + __ sve_frintm(z18, __ S, p5, z2); // frintm z18.s, p5/m, z2.s + __ sve_frintn(z6, __ S, p3, z15); // frintn z6.s, p3/m, z15.s + __ sve_frintp(z12, __ D, p5, z1); // frintp z12.d, p5/m, z1.d + __ sve_fsqrt(z18, __ S, p1, z17); // fsqrt z18.s, p1/m, z17.s + __ sve_fsub(z15, __ S, p5, z13); // fsub z15.s, p5/m, z15.s, z13.s + __ sve_fmla(z20, __ D, p7, z27, z11); // fmla z20.d, p7/m, z27.d, z11.d + __ sve_fmls(z3, __ D, p0, z30, z23); // fmls z3.d, p0/m, z30.d, z23.d + __ sve_fnmla(z17, __ S, p2, z27, z26); // fnmla z17.s, p2/m, z27.s, z26.s + __ sve_fnmls(z6, __ D, p5, z22, z30); // fnmls z6.d, p5/m, z22.d, z30.d + __ sve_mla(z2, __ H, p7, z26, z18); // mla z2.h, p7/m, z26.h, z18.h + __ sve_mls(z22, __ B, p4, z2, z17); // mls z22.b, p4/m, z2.b, z17.b + __ sve_and(z24, z25, z22); // and z24.d, z25.d, z22.d + __ sve_eor(z18, z12, z3); // eor z18.d, z12.d, z3.d + __ sve_orr(z29, z28, z16); // orr z29.d, z28.d, z16.d + +// SVEReductionOp + __ sve_andv(v6, __ S, p2, z28); // andv s6, p2, z28.s + __ sve_orv(v7, __ H, p1, z7); // orv h7, p1, z7.h + __ sve_eorv(v9, __ B, p5, z8); // eorv b9, p5, z8.b + __ sve_smaxv(v27, __ B, p5, z30); // smaxv b27, p5, z30.b + __ sve_sminv(v26, __ H, p0, z16); // sminv h26, p0, z16.h + __ sve_fminv(v3, __ D, p6, z8); // fminv d3, p6, z8.d + __ sve_fmaxv(v21, __ D, p6, z26); // fmaxv d21, p6, z26.d + __ sve_fadda(v22, __ S, p0, z4); // fadda s22, p0, s22, z4.s + __ sve_uaddv(v17, __ H, p0, z3); // uaddv d17, p0, z3.h + 
__ bind(forth); /* aarch64ops.o: file format elf64-littleaarch64 @@ -808,36 +892,36 @@ 94: b25a2969 orr x9, x11, #0x1ffc000000000 98: d278b411 eor x17, x0, #0x3fffffffffff00 9c: f26aad01 ands x1, x8, #0xffffffffffc00003 a0: 14000000 b a0 a4: 17ffffd7 b 0 - a8: 140001f2 b 870 + a8: 14000242 b 9b0 ac: 94000000 bl ac b0: 97ffffd4 bl 0 - b4: 940001ef bl 870 + b4: 9400023f bl 9b0 b8: 3400000a cbz w10, b8 bc: 34fffa2a cbz w10, 0 - c0: 34003d8a cbz w10, 870 + c0: 3400478a cbz w10, 9b0 c4: 35000008 cbnz w8, c4 c8: 35fff9c8 cbnz w8, 0 - cc: 35003d28 cbnz w8, 870 + cc: 35004728 cbnz w8, 9b0 d0: b400000b cbz x11, d0 d4: b4fff96b cbz x11, 0 - d8: b4003ccb cbz x11, 870 + d8: b40046cb cbz x11, 9b0 dc: b500001d cbnz x29, dc e0: b5fff91d cbnz x29, 0 - e4: b5003c7d cbnz x29, 870 + e4: b500467d cbnz x29, 9b0 e8: 10000013 adr x19, e8 ec: 10fff8b3 adr x19, 0 - f0: 10003c13 adr x19, 870 + f0: 10004613 adr x19, 9b0 f4: 90000013 adrp x19, 0 f8: 36300016 tbz w22, #6, f8 fc: 3637f836 tbz w22, #6, 0 - 100: 36303b96 tbz w22, #6, 870 + 100: 36304596 tbz w22, #6, 9b0 104: 3758000c tbnz w12, #11, 104 108: 375ff7cc tbnz w12, #11, 0 - 10c: 37583b2c tbnz w12, #11, 870 + 10c: 3758452c tbnz w12, #11, 9b0 110: 128313a0 mov w0, #0xffffe762 // #-6302 114: 528a32c7 mov w7, #0x5196 // #20886 118: 7289173b movk w27, #0x48b9 11c: 92ab3acc mov x12, #0xffffffffa629ffff // #-1507196929 120: d2a0bf94 mov x20, #0x5fc0000 // #100401152 @@ -850,62 +934,62 @@ 13c: d35a4016 ubfiz x22, x0, #38, #17 140: 13946c63 extr w3, w3, w20, #27 144: 93c3dbc8 extr x8, x30, x3, #54 148: 54000000 b.eq 148 // b.none 14c: 54fff5a0 b.eq 0 // b.none - 150: 54003900 b.eq 870 // b.none + 150: 54004300 b.eq 9b0 // b.none 154: 54000001 b.ne 154 // b.any 158: 54fff541 b.ne 0 // b.any - 15c: 540038a1 b.ne 870 // b.any + 15c: 540042a1 b.ne 9b0 // b.any 160: 54000002 b.cs 160 // b.hs, b.nlast 164: 54fff4e2 b.cs 0 // b.hs, b.nlast - 168: 54003842 b.cs 870 // b.hs, b.nlast + 168: 54004242 b.cs 9b0 // b.hs, b.nlast 16c: 54000002 b.cs 16c // b.hs, b.nlast 170: 54fff482 b.cs 0 // b.hs, b.nlast - 174: 540037e2 b.cs 870 // b.hs, b.nlast + 174: 540041e2 b.cs 9b0 // b.hs, b.nlast 178: 54000003 b.cc 178 // b.lo, b.ul, b.last 17c: 54fff423 b.cc 0 // b.lo, b.ul, b.last - 180: 54003783 b.cc 870 // b.lo, b.ul, b.last + 180: 54004183 b.cc 9b0 // b.lo, b.ul, b.last 184: 54000003 b.cc 184 // b.lo, b.ul, b.last 188: 54fff3c3 b.cc 0 // b.lo, b.ul, b.last - 18c: 54003723 b.cc 870 // b.lo, b.ul, b.last + 18c: 54004123 b.cc 9b0 // b.lo, b.ul, b.last 190: 54000004 b.mi 190 // b.first 194: 54fff364 b.mi 0 // b.first - 198: 540036c4 b.mi 870 // b.first + 198: 540040c4 b.mi 9b0 // b.first 19c: 54000005 b.pl 19c // b.nfrst 1a0: 54fff305 b.pl 0 // b.nfrst - 1a4: 54003665 b.pl 870 // b.nfrst + 1a4: 54004065 b.pl 9b0 // b.nfrst 1a8: 54000006 b.vs 1a8 1ac: 54fff2a6 b.vs 0 - 1b0: 54003606 b.vs 870 + 1b0: 54004006 b.vs 9b0 1b4: 54000007 b.vc 1b4 1b8: 54fff247 b.vc 0 - 1bc: 540035a7 b.vc 870 + 1bc: 54003fa7 b.vc 9b0 1c0: 54000008 b.hi 1c0 // b.pmore 1c4: 54fff1e8 b.hi 0 // b.pmore - 1c8: 54003548 b.hi 870 // b.pmore + 1c8: 54003f48 b.hi 9b0 // b.pmore 1cc: 54000009 b.ls 1cc // b.plast 1d0: 54fff189 b.ls 0 // b.plast - 1d4: 540034e9 b.ls 870 // b.plast + 1d4: 54003ee9 b.ls 9b0 // b.plast 1d8: 5400000a b.ge 1d8 // b.tcont 1dc: 54fff12a b.ge 0 // b.tcont - 1e0: 5400348a b.ge 870 // b.tcont + 1e0: 54003e8a b.ge 9b0 // b.tcont 1e4: 5400000b b.lt 1e4 // b.tstop 1e8: 54fff0cb b.lt 0 // b.tstop - 1ec: 5400342b b.lt 870 // b.tstop + 1ec: 54003e2b b.lt 9b0 // b.tstop 1f0: 5400000c b.gt 1f0 1f4: 54fff06c b.gt 
0 - 1f8: 540033cc b.gt 870 + 1f8: 54003dcc b.gt 9b0 1fc: 5400000d b.le 1fc 200: 54fff00d b.le 0 - 204: 5400336d b.le 870 + 204: 54003d6d b.le 9b0 208: 5400000e b.al 208 20c: 54ffefae b.al 0 - 210: 5400330e b.al 870 + 210: 54003d0e b.al 9b0 214: 5400000f b.nv 214 218: 54ffef4f b.nv 0 - 21c: 540032af b.nv 870 + 21c: 54003caf b.nv 9b0 220: d40658e1 svc #0x32c7 224: d4014d22 hvc #0xa69 228: d4046543 smc #0x232a 22c: d4273f60 brk #0x39fb 230: d44cad80 hlt #0x656c @@ -1027,11 +1111,11 @@ 400: b99c2624 ldrsw x4, [x17, #7204] 404: fd5c2374 ldr d20, [x27, #14400] 408: bd5fa1d9 ldr s25, [x14, #8096] 40c: fd1d595a str d26, [x10, #15024] 410: bd1b1869 str s9, [x3, #6936] - 414: 580022fb ldr x27, 870 + 414: 58002cfb ldr x27, 9b0 418: 1800000b ldr w11, 418 41c: f8945060 prfum pldl1keep, [x3, #-187] 420: d8000000 prfm pldl1keep, 420 424: f8ae6ba0 prfm pldl1keep, [x29, x14] 428: f99a0080 prfm pldl1keep, [x4, #13312] @@ -1202,114 +1286,194 @@ 6bc: 4e081fe1 mov v1.d[0], xzr 6c0: 4e0c1fe1 mov v1.s[1], wzr 6c4: 4e0a1fe1 mov v1.h[2], wzr 6c8: 4e071fe1 mov v1.b[3], wzr 6cc: 4cc0ac3f ld1 {v31.2d, v0.2d}, [x1], x0 - 6d0: 1e601000 fmov d0, #2.000000000000000000e+00 - 6d4: 1e603000 fmov d0, #2.125000000000000000e+00 - 6d8: 1e621000 fmov d0, #4.000000000000000000e+00 - 6dc: 1e623000 fmov d0, #4.250000000000000000e+00 - 6e0: 1e641000 fmov d0, #8.000000000000000000e+00 - 6e4: 1e643000 fmov d0, #8.500000000000000000e+00 - 6e8: 1e661000 fmov d0, #1.600000000000000000e+01 - 6ec: 1e663000 fmov d0, #1.700000000000000000e+01 - 6f0: 1e681000 fmov d0, #1.250000000000000000e-01 - 6f4: 1e683000 fmov d0, #1.328125000000000000e-01 - 6f8: 1e6a1000 fmov d0, #2.500000000000000000e-01 - 6fc: 1e6a3000 fmov d0, #2.656250000000000000e-01 - 700: 1e6c1000 fmov d0, #5.000000000000000000e-01 - 704: 1e6c3000 fmov d0, #5.312500000000000000e-01 - 708: 1e6e1000 fmov d0, #1.000000000000000000e+00 - 70c: 1e6e3000 fmov d0, #1.062500000000000000e+00 - 710: 1e701000 fmov d0, #-2.000000000000000000e+00 - 714: 1e703000 fmov d0, #-2.125000000000000000e+00 - 718: 1e721000 fmov d0, #-4.000000000000000000e+00 - 71c: 1e723000 fmov d0, #-4.250000000000000000e+00 - 720: 1e741000 fmov d0, #-8.000000000000000000e+00 - 724: 1e743000 fmov d0, #-8.500000000000000000e+00 - 728: 1e761000 fmov d0, #-1.600000000000000000e+01 - 72c: 1e763000 fmov d0, #-1.700000000000000000e+01 - 730: 1e781000 fmov d0, #-1.250000000000000000e-01 - 734: 1e783000 fmov d0, #-1.328125000000000000e-01 - 738: 1e7a1000 fmov d0, #-2.500000000000000000e-01 - 73c: 1e7a3000 fmov d0, #-2.656250000000000000e-01 - 740: 1e7c1000 fmov d0, #-5.000000000000000000e-01 - 744: 1e7c3000 fmov d0, #-5.312500000000000000e-01 - 748: 1e7e1000 fmov d0, #-1.000000000000000000e+00 - 74c: 1e7e3000 fmov d0, #-1.062500000000000000e+00 - 750: f8388098 swp x24, x24, [x4] - 754: f8340010 ldadd x20, x16, [x0] - 758: f8241175 ldclr x4, x21, [x11] - 75c: f83e22d0 ldeor x30, x16, [x22] - 760: f82432ef ldset x4, x15, [x23] - 764: f83a5186 ldsmin x26, x6, [x12] - 768: f82f41ee ldsmax x15, x14, [x15] - 76c: f82973b9 ldumin x9, x25, [x29] - 770: f82b6194 ldumax x11, x20, [x12] - 774: f8b28216 swpa x18, x22, [x16] - 778: f8b50358 ldadda x21, x24, [x26] - 77c: f8a61206 ldclra x6, x6, [x16] - 780: f8b02219 ldeora x16, x25, [x16] - 784: f8bc3218 ldseta x28, x24, [x16] - 788: f8ba514f ldsmina x26, x15, [x10] - 78c: f8ad428e ldsmaxa x13, x14, [x20] - 790: f8a173d7 ldumina x1, x23, [x30] - 794: f8ae60c2 ldumaxa x14, x2, [x6] - 798: f8e38328 swpal x3, x8, [x25] - 79c: f8e003db ldaddal x0, x27, [x30] - 7a0: f8e513c5 ldclral x5, x5, [x30] 
- 7a4: f8eb2019 ldeoral x11, x25, [x0] - 7a8: f8ff3260 ldsetal xzr, x0, [x19] - 7ac: f8fd513a ldsminal x29, x26, [x9] - 7b0: f8fa41ec ldsmaxal x26, x12, [x15] - 7b4: f8eb724b lduminal x11, x11, [x18] - 7b8: f8f96316 ldumaxal x25, x22, [x24] - 7bc: f8608171 swpl x0, x17, [x11] - 7c0: f86600dd ldaddl x6, x29, [x6] - 7c4: f86512a5 ldclrl x5, x5, [x21] - 7c8: f8732250 ldeorl x19, x16, [x18] - 7cc: f87e339b ldsetl x30, x27, [x28] - 7d0: f861503c ldsminl x1, x28, [x1] - 7d4: f874421d ldsmaxl x20, x29, [x16] - 7d8: f86d73aa lduminl x13, x10, [x29] - 7dc: f87d62d3 ldumaxl x29, x19, [x22] - 7e0: b82a83e4 swp w10, w4, [sp] - 7e4: b83503e8 ldadd w21, w8, [sp] - 7e8: b833138a ldclr w19, w10, [x28] - 7ec: b82220b9 ldeor w2, w25, [x5] - 7f0: b82332c8 ldset w3, w8, [x22] - 7f4: b83350ad ldsmin w19, w13, [x5] - 7f8: b83d42b8 ldsmax w29, w24, [x21] - 7fc: b83a7078 ldumin w26, w24, [x3] - 800: b83862fa ldumax w24, w26, [x23] - 804: b8af8075 swpa w15, w21, [x3] - 808: b8b80328 ldadda w24, w8, [x25] - 80c: b8b41230 ldclra w20, w16, [x17] - 810: b8a22001 ldeora w2, w1, [x0] - 814: b8b83064 ldseta w24, w4, [x3] - 818: b8ac539f ldsmina w12, wzr, [x28] - 81c: b8aa405a ldsmaxa w10, w26, [x2] - 820: b8ac73f2 ldumina w12, w18, [sp] - 824: b8a163ad ldumaxa w1, w13, [x29] - 828: b8e08193 swpal w0, w19, [x12] - 82c: b8f101b6 ldaddal w17, w22, [x13] - 830: b8fc13fe ldclral w28, w30, [sp] - 834: b8e1239a ldeoral w1, w26, [x28] - 838: b8e4309e ldsetal w4, w30, [x4] - 83c: b8e6535e ldsminal w6, w30, [x26] - 840: b8f24109 ldsmaxal w18, w9, [x8] - 844: b8ec7280 lduminal w12, w0, [x20] - 848: b8e16058 ldumaxal w1, w24, [x2] - 84c: b8608309 swpl w0, w9, [x24] - 850: b87a03d0 ldaddl w26, w16, [x30] - 854: b86312ea ldclrl w3, w10, [x23] - 858: b86a2244 ldeorl w10, w4, [x18] - 85c: b862310b ldsetl w2, w11, [x8] - 860: b86a522f ldsminl w10, w15, [x17] - 864: b862418a ldsmaxl w2, w10, [x12] - 868: b86c71af lduminl w12, w15, [x13] - 86c: b8626287 ldumaxl w2, w7, [x20] + 6d0: 05a08020 mov z0.s, p0/m, s1 + 6d4: 04b0e3e0 incw x0 + 6d8: 0470e7e1 dech x1 + 6dc: 042f9c20 lsl z0.b, z1.b, #7 + 6e0: 043f9c35 lsl z21.h, z1.h, #15 + 6e4: 047f9c20 lsl z0.s, z1.s, #31 + 6e8: 04ff9c20 lsl z0.d, z1.d, #63 + 6ec: 04299420 lsr z0.b, z1.b, #7 + 6f0: 04319160 asr z0.h, z11.h, #15 + 6f4: 0461943e lsr z30.s, z1.s, #31 + 6f8: 04a19020 asr z0.d, z1.d, #63 + 6fc: 042053ff addvl sp, x0, #31 + 700: 047f5401 addpl x1, sp, #-32 + 704: 25208028 cntp x8, p0, p1.b + 708: 2538cfe0 mov z0.b, #127 + 70c: 2578d001 mov z1.h, #-128 + 710: 25b8efe2 mov z2.s, #32512 + 714: 25f8f007 mov z7.d, #-32768 + 718: a400a3e0 ld1b {z0.b}, p0/z, [sp] + 71c: a4a8a7ea ld1h {z10.h}, p1/z, [sp, #-8, mul vl] + 720: a547a814 ld1w {z20.s}, p2/z, [x0, #7, mul vl] + 724: a4084ffe ld1b {z30.b}, p3/z, [sp, x8] + 728: a55c53e0 ld1w {z0.s}, p4/z, [sp, x28, lsl #2] + 72c: a5e1540b ld1d {z11.d}, p5/z, [x0, x1, lsl #3] + 730: e400fbf6 st1b {z22.b}, p6, [sp] + 734: e408ffff st1b {z31.b}, p7, [sp, #-8, mul vl] + 738: e547e400 st1w {z0.s}, p1, [x0, #7, mul vl] + 73c: e4014be0 st1b {z0.b}, p2, [sp, x1] + 740: e4a84fe0 st1h {z0.h}, p3, [sp, x8, lsl #1] + 744: e5f25000 st1d {z0.d}, p4, [x0, x18, lsl #3] + 748: 858043e0 ldr z0, [sp] + 74c: 85a043ff ldr z31, [sp, #-256, mul vl] + 750: e59f5d08 str z8, [x8, #255, mul vl] + 754: 1e601000 fmov d0, #2.000000000000000000e+00 + 758: 1e603000 fmov d0, #2.125000000000000000e+00 + 75c: 1e621000 fmov d0, #4.000000000000000000e+00 + 760: 1e623000 fmov d0, #4.250000000000000000e+00 + 764: 1e641000 fmov d0, #8.000000000000000000e+00 + 768: 1e643000 fmov d0, 
#8.500000000000000000e+00 + 76c: 1e661000 fmov d0, #1.600000000000000000e+01 + 770: 1e663000 fmov d0, #1.700000000000000000e+01 + 774: 1e681000 fmov d0, #1.250000000000000000e-01 + 778: 1e683000 fmov d0, #1.328125000000000000e-01 + 77c: 1e6a1000 fmov d0, #2.500000000000000000e-01 + 780: 1e6a3000 fmov d0, #2.656250000000000000e-01 + 784: 1e6c1000 fmov d0, #5.000000000000000000e-01 + 788: 1e6c3000 fmov d0, #5.312500000000000000e-01 + 78c: 1e6e1000 fmov d0, #1.000000000000000000e+00 + 790: 1e6e3000 fmov d0, #1.062500000000000000e+00 + 794: 1e701000 fmov d0, #-2.000000000000000000e+00 + 798: 1e703000 fmov d0, #-2.125000000000000000e+00 + 79c: 1e721000 fmov d0, #-4.000000000000000000e+00 + 7a0: 1e723000 fmov d0, #-4.250000000000000000e+00 + 7a4: 1e741000 fmov d0, #-8.000000000000000000e+00 + 7a8: 1e743000 fmov d0, #-8.500000000000000000e+00 + 7ac: 1e761000 fmov d0, #-1.600000000000000000e+01 + 7b0: 1e763000 fmov d0, #-1.700000000000000000e+01 + 7b4: 1e781000 fmov d0, #-1.250000000000000000e-01 + 7b8: 1e783000 fmov d0, #-1.328125000000000000e-01 + 7bc: 1e7a1000 fmov d0, #-2.500000000000000000e-01 + 7c0: 1e7a3000 fmov d0, #-2.656250000000000000e-01 + 7c4: 1e7c1000 fmov d0, #-5.000000000000000000e-01 + 7c8: 1e7c3000 fmov d0, #-5.312500000000000000e-01 + 7cc: 1e7e1000 fmov d0, #-1.000000000000000000e+00 + 7d0: 1e7e3000 fmov d0, #-1.062500000000000000e+00 + 7d4: f8388098 swp x24, x24, [x4] + 7d8: f8340010 ldadd x20, x16, [x0] + 7dc: f8241175 ldclr x4, x21, [x11] + 7e0: f83e22d0 ldeor x30, x16, [x22] + 7e4: f82432ef ldset x4, x15, [x23] + 7e8: f83a5186 ldsmin x26, x6, [x12] + 7ec: f82f41ee ldsmax x15, x14, [x15] + 7f0: f82973b9 ldumin x9, x25, [x29] + 7f4: f82b6194 ldumax x11, x20, [x12] + 7f8: f8b28216 swpa x18, x22, [x16] + 7fc: f8b50358 ldadda x21, x24, [x26] + 800: f8a61206 ldclra x6, x6, [x16] + 804: f8b02219 ldeora x16, x25, [x16] + 808: f8bc3218 ldseta x28, x24, [x16] + 80c: f8ba514f ldsmina x26, x15, [x10] + 810: f8ad428e ldsmaxa x13, x14, [x20] + 814: f8a173d7 ldumina x1, x23, [x30] + 818: f8ae60c2 ldumaxa x14, x2, [x6] + 81c: f8e38328 swpal x3, x8, [x25] + 820: f8e003db ldaddal x0, x27, [x30] + 824: f8e513c5 ldclral x5, x5, [x30] + 828: f8eb2019 ldeoral x11, x25, [x0] + 82c: f8ff3260 ldsetal xzr, x0, [x19] + 830: f8fd513a ldsminal x29, x26, [x9] + 834: f8fa41ec ldsmaxal x26, x12, [x15] + 838: f8eb724b lduminal x11, x11, [x18] + 83c: f8f96316 ldumaxal x25, x22, [x24] + 840: f8608171 swpl x0, x17, [x11] + 844: f86600dd ldaddl x6, x29, [x6] + 848: f86512a5 ldclrl x5, x5, [x21] + 84c: f8732250 ldeorl x19, x16, [x18] + 850: f87e339b ldsetl x30, x27, [x28] + 854: f861503c ldsminl x1, x28, [x1] + 858: f874421d ldsmaxl x20, x29, [x16] + 85c: f86d73aa lduminl x13, x10, [x29] + 860: f87d62d3 ldumaxl x29, x19, [x22] + 864: b82a83e4 swp w10, w4, [sp] + 868: b83503e8 ldadd w21, w8, [sp] + 86c: b833138a ldclr w19, w10, [x28] + 870: b82220b9 ldeor w2, w25, [x5] + 874: b82332c8 ldset w3, w8, [x22] + 878: b83350ad ldsmin w19, w13, [x5] + 87c: b83d42b8 ldsmax w29, w24, [x21] + 880: b83a7078 ldumin w26, w24, [x3] + 884: b83862fa ldumax w24, w26, [x23] + 888: b8af8075 swpa w15, w21, [x3] + 88c: b8b80328 ldadda w24, w8, [x25] + 890: b8b41230 ldclra w20, w16, [x17] + 894: b8a22001 ldeora w2, w1, [x0] + 898: b8b83064 ldseta w24, w4, [x3] + 89c: b8ac539f ldsmina w12, wzr, [x28] + 8a0: b8aa405a ldsmaxa w10, w26, [x2] + 8a4: b8ac73f2 ldumina w12, w18, [sp] + 8a8: b8a163ad ldumaxa w1, w13, [x29] + 8ac: b8e08193 swpal w0, w19, [x12] + 8b0: b8f101b6 ldaddal w17, w22, [x13] + 8b4: b8fc13fe ldclral w28, w30, [sp] + 8b8: 
b8e1239a ldeoral w1, w26, [x28] + 8bc: b8e4309e ldsetal w4, w30, [x4] + 8c0: b8e6535e ldsminal w6, w30, [x26] + 8c4: b8f24109 ldsmaxal w18, w9, [x8] + 8c8: b8ec7280 lduminal w12, w0, [x20] + 8cc: b8e16058 ldumaxal w1, w24, [x2] + 8d0: b8608309 swpl w0, w9, [x24] + 8d4: b87a03d0 ldaddl w26, w16, [x30] + 8d8: b86312ea ldclrl w3, w10, [x23] + 8dc: b86a2244 ldeorl w10, w4, [x18] + 8e0: b862310b ldsetl w2, w11, [x8] + 8e4: b86a522f ldsminl w10, w15, [x17] + 8e8: b862418a ldsmaxl w2, w10, [x12] + 8ec: b86c71af lduminl w12, w15, [x13] + 8f0: b8626287 ldumaxl w2, w7, [x20] + 8f4: 042401f9 add z25.b, z15.b, z4.b + 8f8: 04b10564 sub z4.s, z11.s, z17.s + 8fc: 65ca0230 fadd z16.d, z17.d, z10.d + 900: 65d90996 fmul z22.d, z12.d, z25.d + 904: 65ca05dc fsub z28.d, z14.d, z10.d + 908: 0456afc1 abs z1.h, p3/m, z30.h + 90c: 0400044f add z15.b, p1/m, z15.b, z2.b + 910: 0490920d asr z13.s, p4/m, z13.s, z16.s + 914: 04daa163 cnt z3.d, p0/m, z11.d + 918: 04d389c5 lsl z5.d, p2/m, z5.d, z14.d + 91c: 0411829d lsr z29.b, p0/m, z29.b, z20.b + 920: 04901774 mul z20.s, p5/m, z20.s, z27.s + 924: 0417b89a neg z26.b, p6/m, z4.b + 928: 041eb3d6 not z22.b, p4/m, z30.b + 92c: 04480b6b smax z11.h, p2/m, z11.h, z27.h + 930: 048a17dc smin z28.s, p5/m, z28.s, z30.s + 934: 048105be sub z30.s, p1/m, z30.s, z13.s + 938: 04dcb35e fabs z30.d, p4/m, z26.d + 93c: 65808d6f fadd z15.s, p3/m, z15.s, z11.s + 940: 65cd9e06 fdiv z6.d, p7/m, z6.d, z16.d + 944: 65869cfb fmax z27.s, p7/m, z27.s, z7.s + 948: 65c78893 fmin z19.d, p2/m, z19.d, z4.d + 94c: 658292d1 fmul z17.s, p4/m, z17.s, z22.s + 950: 04ddaebc fneg z28.d, p3/m, z21.d + 954: 6582b452 frintm z18.s, p5/m, z2.s + 958: 6580ade6 frintn z6.s, p3/m, z15.s + 95c: 65c1b42c frintp z12.d, p5/m, z1.d + 960: 658da632 fsqrt z18.s, p1/m, z17.s + 964: 658195af fsub z15.s, p5/m, z15.s, z13.s + 968: 65eb1f74 fmla z20.d, p7/m, z27.d, z11.d + 96c: 65f723c3 fmls z3.d, p0/m, z30.d, z23.d + 970: 65ba4b71 fnmla z17.s, p2/m, z27.s, z26.s + 974: 65fe76c6 fnmls z6.d, p5/m, z22.d, z30.d + 978: 04525f42 mla z2.h, p7/m, z26.h, z18.h + 97c: 04117056 mls z22.b, p4/m, z2.b, z17.b + 980: 04363338 and z24.d, z25.d, z22.d + 984: 04a33192 eor z18.d, z12.d, z3.d + 988: 0470339d orr z29.d, z28.d, z16.d + 98c: 049a2b86 andv s6, p2, z28.s + 990: 045824e7 orv h7, p1, z7.h + 994: 04193509 eorv b9, p5, z8.b + 998: 040837db smaxv b27, p5, z30.b + 99c: 044a221a sminv h26, p0, z16.h + 9a0: 65c73903 fminv d3, p6, z8.d + 9a4: 65c63b55 fmaxv d21, p6, z26.d + 9a8: 65982096 fadda s22, p0, s22, z4.s + 9ac: 04412071 uaddv d17, p0, z3.h */ static const unsigned int insns[] = { 0x8b0d82fa, 0xcb49970c, 0xab889dfc, 0xeb9ee787, @@ -1320,34 +1484,34 @@ 0x0aa70f53, 0x2aaa0f06, 0x4a6176a4, 0x6a604eb0, 0x1105ed91, 0x3100583e, 0x5101f8bd, 0x710f0306, 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x120cb166, 0x321764bc, 0x52174681, 0x720c0247, 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, - 0x14000000, 0x17ffffd7, 0x140001f2, 0x94000000, - 0x97ffffd4, 0x940001ef, 0x3400000a, 0x34fffa2a, - 0x34003d8a, 0x35000008, 0x35fff9c8, 0x35003d28, - 0xb400000b, 0xb4fff96b, 0xb4003ccb, 0xb500001d, - 0xb5fff91d, 0xb5003c7d, 0x10000013, 0x10fff8b3, - 0x10003c13, 0x90000013, 0x36300016, 0x3637f836, - 0x36303b96, 0x3758000c, 0x375ff7cc, 0x37583b2c, + 0x14000000, 0x17ffffd7, 0x14000242, 0x94000000, + 0x97ffffd4, 0x9400023f, 0x3400000a, 0x34fffa2a, + 0x3400478a, 0x35000008, 0x35fff9c8, 0x35004728, + 0xb400000b, 0xb4fff96b, 0xb40046cb, 0xb500001d, + 0xb5fff91d, 0xb500467d, 0x10000013, 0x10fff8b3, + 0x10004613, 0x90000013, 0x36300016, 0x3637f836, + 0x36304596, 
0x3758000c, 0x375ff7cc, 0x3758452c, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, - 0x54003900, 0x54000001, 0x54fff541, 0x540038a1, - 0x54000002, 0x54fff4e2, 0x54003842, 0x54000002, - 0x54fff482, 0x540037e2, 0x54000003, 0x54fff423, - 0x54003783, 0x54000003, 0x54fff3c3, 0x54003723, - 0x54000004, 0x54fff364, 0x540036c4, 0x54000005, - 0x54fff305, 0x54003665, 0x54000006, 0x54fff2a6, - 0x54003606, 0x54000007, 0x54fff247, 0x540035a7, - 0x54000008, 0x54fff1e8, 0x54003548, 0x54000009, - 0x54fff189, 0x540034e9, 0x5400000a, 0x54fff12a, - 0x5400348a, 0x5400000b, 0x54fff0cb, 0x5400342b, - 0x5400000c, 0x54fff06c, 0x540033cc, 0x5400000d, - 0x54fff00d, 0x5400336d, 0x5400000e, 0x54ffefae, - 0x5400330e, 0x5400000f, 0x54ffef4f, 0x540032af, + 0x54004300, 0x54000001, 0x54fff541, 0x540042a1, + 0x54000002, 0x54fff4e2, 0x54004242, 0x54000002, + 0x54fff482, 0x540041e2, 0x54000003, 0x54fff423, + 0x54004183, 0x54000003, 0x54fff3c3, 0x54004123, + 0x54000004, 0x54fff364, 0x540040c4, 0x54000005, + 0x54fff305, 0x54004065, 0x54000006, 0x54fff2a6, + 0x54004006, 0x54000007, 0x54fff247, 0x54003fa7, + 0x54000008, 0x54fff1e8, 0x54003f48, 0x54000009, + 0x54fff189, 0x54003ee9, 0x5400000a, 0x54fff12a, + 0x54003e8a, 0x5400000b, 0x54fff0cb, 0x54003e2b, + 0x5400000c, 0x54fff06c, 0x54003dcc, 0x5400000d, + 0x54fff00d, 0x54003d6d, 0x5400000e, 0x54ffefae, + 0x54003d0e, 0x5400000f, 0x54ffef4f, 0x54003caf, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd44cad80, 0xd503201f, 0xd69f03e0, 0xd6bf03e0, 0xd5033fdf, 0xd5033e9f, 0xd50332bf, 0xd61f0200, 0xd63f0280, 0xc80a7d1b, 0xc800fea1, 0xc85f7fb1, 0xc85fff9d, 0xc89ffee1, 0xc8dffe95, 0x88167e7b, @@ -1375,11 +1539,11 @@ 0xb8bef956, 0xfc6afabd, 0xbc734963, 0xfc3d5b8d, 0xbc25fbb7, 0xf9189d05, 0xb91ecb1d, 0x39187a33, 0x791f226d, 0xf95aa2f3, 0xb9587bb7, 0x395f7176, 0x795d9143, 0x399e7e08, 0x799a2697, 0x79df3422, 0xb99c2624, 0xfd5c2374, 0xbd5fa1d9, 0xfd1d595a, - 0xbd1b1869, 0x580022fb, 0x1800000b, 0xf8945060, + 0xbd1b1869, 0x58002cfb, 0x1800000b, 0xf8945060, 0xd8000000, 0xf8ae6ba0, 0xf99a0080, 0x1a070035, 0x3a0700a8, 0x5a0e0367, 0x7a11009b, 0x9a000380, 0xba1e030c, 0xda0f0320, 0xfa030301, 0x0b340b12, 0x2b2a278d, 0xcb22aa0f, 0x6b2d29bd, 0x8b2cce8c, 0xab2b877e, 0xcb21c8ee, 0xeb3ba47d, 0x3a4d400e, @@ -1419,36 +1583,56 @@ 0x0de2eb2c, 0xce648376, 0xce6184c7, 0xcec081fa, 0xce6d89a2, 0xba5fd3e3, 0x3a5f03e5, 0xfa411be4, 0x7a42cbe2, 0x93df03ff, 0xc820ffff, 0x8822fc7f, 0xc8247cbf, 0x88267fff, 0x4e010fe0, 0x4e081fe1, 0x4e0c1fe1, 0x4e0a1fe1, 0x4e071fe1, 0x4cc0ac3f, - 0x1e601000, 0x1e603000, 0x1e621000, 0x1e623000, - 0x1e641000, 0x1e643000, 0x1e661000, 0x1e663000, - 0x1e681000, 0x1e683000, 0x1e6a1000, 0x1e6a3000, - 0x1e6c1000, 0x1e6c3000, 0x1e6e1000, 0x1e6e3000, - 0x1e701000, 0x1e703000, 0x1e721000, 0x1e723000, - 0x1e741000, 0x1e743000, 0x1e761000, 0x1e763000, - 0x1e781000, 0x1e783000, 0x1e7a1000, 0x1e7a3000, - 0x1e7c1000, 0x1e7c3000, 0x1e7e1000, 0x1e7e3000, - 0xf8388098, 0xf8340010, 0xf8241175, 0xf83e22d0, - 0xf82432ef, 0xf83a5186, 0xf82f41ee, 0xf82973b9, - 0xf82b6194, 0xf8b28216, 0xf8b50358, 0xf8a61206, - 0xf8b02219, 0xf8bc3218, 0xf8ba514f, 0xf8ad428e, - 0xf8a173d7, 0xf8ae60c2, 0xf8e38328, 0xf8e003db, - 0xf8e513c5, 0xf8eb2019, 0xf8ff3260, 0xf8fd513a, - 0xf8fa41ec, 0xf8eb724b, 0xf8f96316, 0xf8608171, - 0xf86600dd, 0xf86512a5, 0xf8732250, 0xf87e339b, - 0xf861503c, 0xf874421d, 0xf86d73aa, 0xf87d62d3, - 0xb82a83e4, 0xb83503e8, 0xb833138a, 0xb82220b9, - 0xb82332c8, 
0xb83350ad, 0xb83d42b8, 0xb83a7078, - 0xb83862fa, 0xb8af8075, 0xb8b80328, 0xb8b41230, - 0xb8a22001, 0xb8b83064, 0xb8ac539f, 0xb8aa405a, - 0xb8ac73f2, 0xb8a163ad, 0xb8e08193, 0xb8f101b6, - 0xb8fc13fe, 0xb8e1239a, 0xb8e4309e, 0xb8e6535e, - 0xb8f24109, 0xb8ec7280, 0xb8e16058, 0xb8608309, - 0xb87a03d0, 0xb86312ea, 0xb86a2244, 0xb862310b, - 0xb86a522f, 0xb862418a, 0xb86c71af, 0xb8626287, + 0x05a08020, 0x04b0e3e0, 0x0470e7e1, 0x042f9c20, + 0x043f9c35, 0x047f9c20, 0x04ff9c20, 0x04299420, + 0x04319160, 0x0461943e, 0x04a19020, 0x042053ff, + 0x047f5401, 0x25208028, 0x2538cfe0, 0x2578d001, + 0x25b8efe2, 0x25f8f007, 0xa400a3e0, 0xa4a8a7ea, + 0xa547a814, 0xa4084ffe, 0xa55c53e0, 0xa5e1540b, + 0xe400fbf6, 0xe408ffff, 0xe547e400, 0xe4014be0, + 0xe4a84fe0, 0xe5f25000, 0x858043e0, 0x85a043ff, + 0xe59f5d08, 0x1e601000, 0x1e603000, 0x1e621000, + 0x1e623000, 0x1e641000, 0x1e643000, 0x1e661000, + 0x1e663000, 0x1e681000, 0x1e683000, 0x1e6a1000, + 0x1e6a3000, 0x1e6c1000, 0x1e6c3000, 0x1e6e1000, + 0x1e6e3000, 0x1e701000, 0x1e703000, 0x1e721000, + 0x1e723000, 0x1e741000, 0x1e743000, 0x1e761000, + 0x1e763000, 0x1e781000, 0x1e783000, 0x1e7a1000, + 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, 0x1e7e1000, + 0x1e7e3000, 0xf8388098, 0xf8340010, 0xf8241175, + 0xf83e22d0, 0xf82432ef, 0xf83a5186, 0xf82f41ee, + 0xf82973b9, 0xf82b6194, 0xf8b28216, 0xf8b50358, + 0xf8a61206, 0xf8b02219, 0xf8bc3218, 0xf8ba514f, + 0xf8ad428e, 0xf8a173d7, 0xf8ae60c2, 0xf8e38328, + 0xf8e003db, 0xf8e513c5, 0xf8eb2019, 0xf8ff3260, + 0xf8fd513a, 0xf8fa41ec, 0xf8eb724b, 0xf8f96316, + 0xf8608171, 0xf86600dd, 0xf86512a5, 0xf8732250, + 0xf87e339b, 0xf861503c, 0xf874421d, 0xf86d73aa, + 0xf87d62d3, 0xb82a83e4, 0xb83503e8, 0xb833138a, + 0xb82220b9, 0xb82332c8, 0xb83350ad, 0xb83d42b8, + 0xb83a7078, 0xb83862fa, 0xb8af8075, 0xb8b80328, + 0xb8b41230, 0xb8a22001, 0xb8b83064, 0xb8ac539f, + 0xb8aa405a, 0xb8ac73f2, 0xb8a163ad, 0xb8e08193, + 0xb8f101b6, 0xb8fc13fe, 0xb8e1239a, 0xb8e4309e, + 0xb8e6535e, 0xb8f24109, 0xb8ec7280, 0xb8e16058, + 0xb8608309, 0xb87a03d0, 0xb86312ea, 0xb86a2244, + 0xb862310b, 0xb86a522f, 0xb862418a, 0xb86c71af, + 0xb8626287, 0x042401f9, 0x04b10564, 0x65ca0230, + 0x65d90996, 0x65ca05dc, 0x0456afc1, 0x0400044f, + 0x0490920d, 0x04daa163, 0x04d389c5, 0x0411829d, + 0x04901774, 0x0417b89a, 0x041eb3d6, 0x04480b6b, + 0x048a17dc, 0x048105be, 0x04dcb35e, 0x65808d6f, + 0x65cd9e06, 0x65869cfb, 0x65c78893, 0x658292d1, + 0x04ddaebc, 0x6582b452, 0x6580ade6, 0x65c1b42c, + 0x658da632, 0x658195af, 0x65eb1f74, 0x65f723c3, + 0x65ba4b71, 0x65fe76c6, 0x04525f42, 0x04117056, + 0x04363338, 0x04a33192, 0x0470339d, 0x049a2b86, + 0x045824e7, 0x04193509, 0x040837db, 0x044a221a, + 0x65c73903, 0x65c63b55, 0x65982096, 0x04412071, }; // END Generated code -- do not edit asm_check((unsigned int *)entry, insns, sizeof insns / sizeof insns[0]); diff a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -137,10 +137,13 @@ // Dispatch table base REGISTER_DECLARATION(Register, rdispatch, r21); // Java stack pointer REGISTER_DECLARATION(Register, esp, r20); +// Preserved predicate register with all elements set TRUE. 
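+// The generated SVE code assumes p7 holds this all-true value whenever it
+// runs; after calls into the VM runtime it is restored with
+// MacroAssembler::reinitialize_ptrue(), and verify_ptrue() can be used to
+// check the invariant (see the macroAssembler_aarch64 changes below).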
+REGISTER_DECLARATION(PRegister, ptrue, p7); + #define assert_cond(ARG1) assert(ARG1, #ARG1) namespace asm_util { uint32_t encode_logical_immediate(bool is32, uint64_t imm); }; @@ -561,10 +564,22 @@ } void lea(MacroAssembler *, Register) const; static bool offset_ok_for_immed(int64_t offset, uint shift); + + static bool offset_ok_for_sve_immed(long offset, int shift, int vl /* sve vector length */) { + if (offset % vl == 0) { + // Convert address offset into sve imm offset (MUL VL). + int sve_offset = offset / vl; + if (((-(1 << (shift - 1))) <= sve_offset) && (sve_offset < (1 << (shift - 1)))) { + // sve_offset can be encoded + return true; + } + } + return false; + } }; // Convience classes class RuntimeAddress: public Address { @@ -2464,17 +2479,22 @@ assert(T != Q, "invalid register variant"); f(0b01101110000, 31, 21), f(((didx<<1)|1)<<(int)T, 20, 16), f(0, 15); f(sidx<<(int)T, 14, 11), f(1, 10), rf(Vn, 5), rf(Vd, 0); } - void umov(Register Rd, FloatRegister Vn, SIMD_RegVariant T, int idx) { - starti; - f(0, 31), f(T==D ? 1:0, 30), f(0b001110000, 29, 21); - f(((idx<<1)|1)<<(int)T, 20, 16), f(0b001111, 15, 10); - rf(Vn, 5), rf(Rd, 0); +#define INSN(NAME, op) \ + void NAME(Register Rd, FloatRegister Vn, SIMD_RegVariant T, int idx) { \ + starti; \ + f(0, 31), f(T==D ? 1:0, 30), f(0b001110000, 29, 21); \ + f(((idx<<1)|1)<<(int)T, 20, 16), f(op, 15, 10); \ + rf(Vn, 5), rf(Rd, 0); \ } + INSN(umov, 0b001111); + INSN(smov, 0b001011); +#undef INSN + #define INSN(NAME, opc, opc2, isSHR) \ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, int shift){ \ starti; \ /* The encodings for the immh:immb fields (bits 22:16) in *SHR are \ * 0001 xxx 8B/16B, shift = 16 - UInt(immh:immb) \ @@ -2698,15 +2718,294 @@ f(0, 31), f((int)T & 1, 30), f(0b101110000, 29, 21); rf(Vm, 16), f(0, 15), f(index, 14, 11); f(0, 10), rf(Vn, 5), rf(Vd, 0); } - void sve_inc(Register Xdn, SIMD_RegVariant T, unsigned imm4 = 1, int pattern = 0b11111) { +// SVE arithmetics - unpredicated +#define INSN(NAME, opcode) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T != Q, "invalid register variant"); \ + f(0b00000100, 31, 24), f(T, 23, 22), f(1, 21), \ + rf(Zm, 16), f(0, 15, 13), f(opcode, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + INSN(sve_add, 0b000); + INSN(sve_sub, 0b001); +#undef INSN + +// SVE floating-point arithmetic - unpredicated +#define INSN(NAME, opcode) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T == S || T == D, "invalid register variant"); \ + f(0b01100101, 31, 24), f(T, 23, 22), f(0, 21), \ + rf(Zm, 16), f(0, 15, 13), f(opcode, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + + INSN(sve_fadd, 0b000); + INSN(sve_fmul, 0b010); + INSN(sve_fsub, 0b001); +#undef INSN + +private: + void sve_predicate_reg_insn(unsigned op24, unsigned op13, + FloatRegister Zd_or_Vd, SIMD_RegVariant T, + PRegister Pg, FloatRegister Zn_or_Vn) { + starti; + f(op24, 31, 24), f(T, 23, 22), f(op13, 21, 13); + pgrf(Pg, 10), rf(Zn_or_Vn, 5), rf(Zd_or_Vd, 0); + } + +public: + +// SVE integer arithmetics - predicate +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zdn_or_Zd_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Znm_or_Vn) { \ + assert(T != Q, "invalid register variant"); \ + sve_predicate_reg_insn(op1, op2, Zdn_or_Zd_or_Vd, T, Pg, Znm_or_Vn); \ + } + + INSN(sve_abs, 0b00000100, 0b010110101); // vector abs, unary + INSN(sve_add, 0b00000100, 0b000000000); // vector add + INSN(sve_andv, 
0b00000100, 0b011010001); // bitwise and reduction to scalar + INSN(sve_asr, 0b00000100, 0b010000100); // vector arithmetic shift right + INSN(sve_cnt, 0b00000100, 0b011010101) // count non-zero bits + INSN(sve_cpy, 0b00000101, 0b100000100); // copy scalar to each active vector element + INSN(sve_eorv, 0b00000100, 0b011001001); // bitwise xor reduction to scalar + INSN(sve_lsl, 0b00000100, 0b010011100); // vector logical shift left + INSN(sve_lsr, 0b00000100, 0b010001100); // vector logical shift right + INSN(sve_mul, 0b00000100, 0b010000000); // vector mul + INSN(sve_neg, 0b00000100, 0b010111101); // vector neg, unary + INSN(sve_not, 0b00000100, 0b011110101); // bitwise invert vector, unary + INSN(sve_orv, 0b00000100, 0b011000001); // bitwise or reduction to scalar + INSN(sve_smax, 0b00000100, 0b001000000); // signed maximum vectors + INSN(sve_smaxv, 0b00000100, 0b001000001); // signed maximum reduction to scalar + INSN(sve_smin, 0b00000100, 0b001010000); // signed minimum vectors + INSN(sve_sminv, 0b00000100, 0b001010001); // signed minimum reduction to scalar + INSN(sve_sub, 0b00000100, 0b000001000); // vector sub + INSN(sve_uaddv, 0b00000100, 0b000001001); // unsigned add reduction to scalar +#undef INSN + +// SVE floating-point arithmetics - predicate +#define INSN(NAME, op1, op2) \ + void NAME(FloatRegister Zd_or_Zdn_or_Vd, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn_or_Zm) { \ + assert(T == S || T == D, "invalid register variant"); \ + sve_predicate_reg_insn(op1, op2, Zd_or_Zdn_or_Vd, T, Pg, Zn_or_Zm); \ + } + + INSN(sve_fabs, 0b00000100, 0b011100101); + INSN(sve_fadd, 0b01100101, 0b000000100); + INSN(sve_fadda, 0b01100101, 0b011000001); // add strictly-ordered reduction to scalar Vd + INSN(sve_fdiv, 0b01100101, 0b001101100); + INSN(sve_fmax, 0b01100101, 0b000110100); // floating-point maximum + INSN(sve_fmaxv, 0b01100101, 0b000110001); // floating-point maximum recursive reduction to scalar + INSN(sve_fmin, 0b01100101, 0b000111100); // floating-point minimum + INSN(sve_fminv, 0b01100101, 0b000111001); // floating-point minimum recursive reduction to scalar + INSN(sve_fmul, 0b01100101, 0b000010100); + INSN(sve_fneg, 0b00000100, 0b011101101); + INSN(sve_frintm, 0b01100101, 0b000010101); // floating-point round to integral value, toward minus infinity + INSN(sve_frintn, 0b01100101, 0b000000101); // floating-point round to integral value, nearest with ties to even + INSN(sve_frintp, 0b01100101, 0b000001101); // floating-point round to integral value, toward plus infinity + INSN(sve_fsqrt, 0b01100101, 0b001101101); + INSN(sve_fsub, 0b01100101, 0b000001100); +#undef INSN + + // SVE multiple-add/sub - predicated +#define INSN(NAME, op0, op1, op2) \ + void NAME(FloatRegister Zda, SIMD_RegVariant T, PRegister Pg, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + assert(T != Q, "invalid size"); \ + f(op0, 31, 24), f(T, 23, 22), f(op1, 21), rf(Zm, 16); \ + f(op2, 15, 13), pgrf(Pg, 10), rf(Zn, 5), rf(Zda, 0); \ + } + + INSN(sve_fmla, 0b01100101, 1, 0b000); // floating-point fused multiply-add: Zda = Zda + Zn * Zm + INSN(sve_fmls, 0b01100101, 1, 0b001); // floating-point fused multiply-subtract: Zda = Zda + -Zn * Zm + INSN(sve_fnmla, 0b01100101, 1, 0b010); // floating-point negated fused multiply-add: Zda = -Zda + -Zn * Zm + INSN(sve_fnmls, 0b01100101, 1, 0b011); // floating-point negated fused multiply-subtract: Zda = -Zda + Zn * Zm + INSN(sve_mla, 0b00000100, 0, 0b010); // multiply-add: Zda = Zda + Zn*Zm + INSN(sve_mls, 0b00000100, 0, 0b011); // multiply-subtract: Zda = Zda + -Zn*Zm 
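+  // Illustrative example (not part of the encodings above): a call such as
+  // __ sve_fmla(z20, __ D, p7, z27, z11) assembles to
+  // "fmla z20.d, p7/m, z27.d, z11.d", as exercised by the generated test code.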
+#undef INSN + +// SVE bitwise logical - unpredicated +#define INSN(NAME, opc) \ + void NAME(FloatRegister Zd, FloatRegister Zn, FloatRegister Zm) { \ + starti; \ + f(0b00000100, 31, 24), f(opc, 23, 22), f(1, 21), \ + rf(Zm, 16), f(0b001100, 15, 10), rf(Zn, 5), rf(Zd, 0); \ + } + INSN(sve_and, 0b00); + INSN(sve_eor, 0b10); + INSN(sve_orr, 0b01); +#undef INSN + +// SVE shift immediate - unpredicated +#define INSN(NAME, opc, isSHR) \ + void NAME(FloatRegister Zd, SIMD_RegVariant T, FloatRegister Zn, int shift) { \ + starti; \ + /* The encodings for the tszh:tszl:imm3 fields (bits 23:22 20:19 18:16) \ + * for shift right is calculated as: \ + * 0001 xxx B, shift = 16 - UInt(tszh:tszl:imm3) \ + * 001x xxx H, shift = 32 - UInt(tszh:tszl:imm3) \ + * 01xx xxx S, shift = 64 - UInt(tszh:tszl:imm3) \ + * 1xxx xxx D, shift = 128 - UInt(tszh:tszl:imm3) \ + * for shift left is calculated as: \ + * 0001 xxx B, shift = UInt(tszh:tszl:imm3) - 8 \ + * 001x xxx H, shift = UInt(tszh:tszl:imm3) - 16 \ + * 01xx xxx S, shift = UInt(tszh:tszl:imm3) - 32 \ + * 1xxx xxx D, shift = UInt(tszh:tszl:imm3) - 64 \ + */ \ + assert(T != Q, "Invalid register variant"); \ + if (isSHR) { \ + assert(((1 << (T + 3)) >= shift) && (shift > 0) , "Invalid shift value"); \ + } else { \ + assert(((1 << (T + 3)) > shift) && (shift >= 0) , "Invalid shift value"); \ + } \ + int cVal = (1 << ((T + 3) + (isSHR ? 1 : 0))); \ + int encodedShift = isSHR ? cVal - shift : cVal + shift; \ + int tszh = encodedShift >> 5; \ + int tszl_imm = encodedShift & 0x1f; \ + f(0b00000100, 31, 24); \ + f(tszh, 23, 22), f(1,21), f(tszl_imm, 20, 16); \ + f(0b100, 15, 13), f(opc, 12, 10), rf(Zn, 5), rf(Zd, 0); \ + } + + INSN(sve_asr, 0b100, /* isSHR = */ true); + INSN(sve_lsl, 0b111, /* isSHR = */ false); + INSN(sve_lsr, 0b101, /* isSHR = */ true); +#undef INSN + +private: + + // Scalar base + immediate index + void sve_ld_st1(FloatRegister Zt, Register Xn, int imm, PRegister Pg, + SIMD_RegVariant T, int op1, int type, int op2) { + starti; + assert_cond(T >= type); + f(op1, 31, 25), f(type, 24, 23), f(T, 22, 21); + f(0, 20), sf(imm, 19, 16), f(op2, 15, 13); + pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); + } + + // Scalar base + scalar index + void sve_ld_st1(FloatRegister Zt, Register Xn, Register Xm, PRegister Pg, + SIMD_RegVariant T, int op1, int type, int op2) { + starti; + assert_cond(T >= type); + f(op1, 31, 25), f(type, 24, 23), f(T, 22, 21); + rf(Xm, 16), f(op2, 15, 13); + pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); + } + + void sve_ld_st1(FloatRegister Zt, PRegister Pg, + SIMD_RegVariant T, const Address &a, + int op1, int type, int imm_op2, int scalar_op2) { + switch (a.getMode()) { + case Address::base_plus_offset: + sve_ld_st1(Zt, a.base(), a.offset(), Pg, T, op1, type, imm_op2); + break; + case Address::base_plus_offset_reg: + sve_ld_st1(Zt, a.base(), a.index(), Pg, T, op1, type, scalar_op2); + break; + default: + ShouldNotReachHere(); + } + } + +public: + +// SVE load/store - predicated +#define INSN(NAME, op1, type, imm_op2, scalar_op2) \ + void NAME(FloatRegister Zt, SIMD_RegVariant T, PRegister Pg, const Address &a) { \ + assert(T != Q, "invalid register variant"); \ + sve_ld_st1(Zt, Pg, T, a, op1, type, imm_op2, scalar_op2); \ + } + + INSN(sve_ld1b, 0b1010010, 0b00, 0b101, 0b010); + INSN(sve_st1b, 0b1110010, 0b00, 0b111, 0b010); + INSN(sve_ld1h, 0b1010010, 0b01, 0b101, 0b010); + INSN(sve_st1h, 0b1110010, 0b01, 0b111, 0b010); + INSN(sve_ld1w, 0b1010010, 0b10, 0b101, 0b010); + INSN(sve_st1w, 0b1110010, 0b10, 0b111, 0b010); + INSN(sve_ld1d, 0b1010010, 0b11, 
0b101, 0b010); + INSN(sve_st1d, 0b1110010, 0b11, 0b111, 0b010); +#undef INSN + +// SVE load/store - unpredicated +#define INSN(NAME, op1) \ + void NAME(FloatRegister Zt, const Address &a) { \ + starti; \ + assert(a.index() == noreg, "invalid address variant"); \ + f(op1, 31, 29), f(0b0010110, 28, 22), sf(a.offset() >> 3, 21, 16), \ + f(0b010, 15, 13), f(a.offset() & 0x7, 12, 10), srf(a.base(), 5), rf(Zt, 0); \ + } + + INSN(sve_ldr, 0b100); // LDR (vector) + INSN(sve_str, 0b111); // STR (vector) +#undef INSN + +#define INSN(NAME, op) \ + void NAME(Register Xd, Register Xn, int imm6) { \ + starti; \ + f(0b000001000, 31, 23), f(op, 22, 21); \ + srf(Xn, 16), f(0b01010, 15, 11), sf(imm6, 10, 5), srf(Xd, 0); \ + } + + INSN(sve_addvl, 0b01); + INSN(sve_addpl, 0b11); +#undef INSN + +// SVE inc/dec register by element count +#define INSN(NAME, op) \ + void NAME(Register Xdn, SIMD_RegVariant T, unsigned imm4 = 1, int pattern = 0b11111) { \ + starti; \ + assert(T != Q, "invalid size"); \ + f(0b00000100,31, 24), f(T, 23, 22), f(0b11, 21, 20); \ + f(imm4 - 1, 19, 16), f(0b11100, 15, 11), f(op, 10), f(pattern, 9, 5), rf(Xdn, 0); \ + } + + INSN(sve_inc, 0); + INSN(sve_dec, 1); +#undef INSN + + // SVE predicate count + void sve_cntp(Register Xd, SIMD_RegVariant T, PRegister Pg, PRegister Pn) { + starti; + assert(T != Q, "invalid size"); + f(0b00100101, 31, 24), f(T, 23, 22), f(0b10000010, 21, 14); + prf(Pg, 10), f(0, 9), prf(Pn, 5), rf(Xd, 0); + } + + // SVE dup scalar + void sve_dup(FloatRegister Zd, SIMD_RegVariant T, Register Rn) { starti; assert(T != Q, "invalid size"); - f(0b00000100,31, 24), f(T, 23, 22), f(0b11, 21, 20); - f(imm4 - 1, 19, 16), f(0b111000, 15, 10), f(pattern, 9, 5), rf(Xdn, 0); + f(0b00000101, 31, 24), f(T, 23, 22), f(0b100000001110, 21, 10); + srf(Rn, 5), rf(Zd, 0); + } + + // SVE dup imm + void sve_dup(FloatRegister Zd, SIMD_RegVariant T, int imm8) { + starti; + assert(T != Q, "invalid size"); + int sh = 0; + if (imm8 <= 127 && imm8 >= -128) { + sh = 0; + } else if (T != B && imm8 <= 32512 && imm8 >= -32768 && (imm8 & 0xff) == 0) { + sh = 1; + imm8 = (imm8 >> 8); + } else { + guarantee(false, "invalid immediate"); + } + f(0b00100101, 31, 24), f(T, 23, 22), f(0b11100011, 21, 14); + f(sh, 13), sf(imm8, 12, 5), rf(Zd, 0); + } + + void sve_ptrue(PRegister pd, SIMD_RegVariant esize, int pattern = 0b11111) { + starti; + f(0b00100101, 31, 24), f(esize, 23, 22), f(0b011000111000, 21, 10); + f(pattern, 9, 5), f(0b0, 4), prf(pd, 0); } Assembler(CodeBuffer* code) : AbstractAssembler(code) { } diff a/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp --- a/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/z/zBarrierSetAssembler_aarch64.cpp @@ -429,12 +429,16 @@ { ZSaveLiveRegisters save_live_registers(masm, stub); ZSetupArguments setup_arguments(masm, stub); __ mov(rscratch1, stub->slow_path()); __ blr(rscratch1); + if (UseSVE > 0) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. 
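+      // reinitialize_ptrue() simply re-executes sve_ptrue(ptrue, B), i.e.
+      // "ptrue p7.b", so that every element of p7 is active again.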
+ __ reinitialize_ptrue(); + } } - // Stub exit __ b(*stub->continuation()); } #undef __ diff a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -2649,27 +2649,45 @@ } pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2) - exclude, sp); } -void MacroAssembler::push_CPU_state(bool save_vectors) { - int step = (save_vectors ? 8 : 4) * wordSize; +void MacroAssembler::push_CPU_state(bool save_vectors, bool use_sve, + int sve_vector_size_in_bytes) { push(0x3fffffff, sp); // integer registers except lr & sp - mov(rscratch1, -step); - sub(sp, sp, step); - for (int i = 28; i >= 4; i -= 4) { - st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), - as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); + if (save_vectors && use_sve) { + assert(sve_vector_size_in_bytes >= 16, "illegal scalable vector size"); + sub(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers); + for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) { + sve_str(as_FloatRegister(i), Address(sp, i)); + } + } else { + int step = (save_vectors ? 8 : 4) * wordSize; + mov(rscratch1, -step); + sub(sp, sp, step); + for (int i = 28; i >= 4; i -= 4) { + st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), + as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1))); + } + st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); } - st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp); } -void MacroAssembler::pop_CPU_state(bool restore_vectors) { - int step = (restore_vectors ? 8 : 4) * wordSize; - for (int i = 0; i <= 28; i += 4) - ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), - as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); +void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve, + int sve_vector_size_in_bytes) { + if (restore_vectors && use_sve) { + assert(sve_vector_size_in_bytes >= 16, "illegal scalable vector size"); + for (int i = FloatRegisterImpl::number_of_registers - 1; i >= 0; i--) { + sve_ldr(as_FloatRegister(i), Address(sp, i)); + } + add(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers); + } else { + int step = (restore_vectors ? 8 : 4) * wordSize; + for (int i = 0; i <= 28; i += 4) + ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2), + as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step))); + } pop(0x3fffffff, sp); // integer registers except lr & sp } /** * Helpers for multiply_to_len(). @@ -2714,10 +2732,25 @@ } return Address(base, offset); } +Address MacroAssembler::sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp) { + assert(offset >= 0, "spill to negative address?"); + + Register base = sp; + + // An immediate offset in the range 0 to 255 which is multiplied + // by the current vector or predicate register size in bytes. + if (offset % sve_reg_size_in_bytes == 0 && offset < ((1<<8)*sve_reg_size_in_bytes)) { + return Address(base, offset / sve_reg_size_in_bytes); + } + + add(tmp, base, offset); + return Address(tmp); +} + // Checks whether offset is aligned. // Returns true if it is, else false. 
bool MacroAssembler::merge_alignment_check(Register base, size_t size, int64_t cur_offset, @@ -5234,5 +5267,15 @@ subsw(zr, rscratch1, VM_Version::get_initial_sve_vector_length()); br(EQ, verify_ok); stop("Error: SVE vector length has changed since jvm startup"); bind(verify_ok); } + +void MacroAssembler::verify_ptrue() { + Label verify_ok; + assert(UseSVE > 0, "should only be used for SVE"); + sve_cntp(rscratch1, B, ptrue, ptrue); // get true elements count. + sve_dec(rscratch1, B); + cbz(rscratch1, verify_ok); + stop("Error: the preserved predicate register (p7) elements are not all true"); + bind(verify_ok); +} diff a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -886,12 +886,14 @@ // if heap base register is used - reinit it with the correct value void reinit_heapbase(); DEBUG_ONLY(void verify_heapbase(const char* msg);) - void push_CPU_state(bool save_vectors = false); - void pop_CPU_state(bool restore_vectors = false) ; + void push_CPU_state(bool save_vectors = false, bool use_sve = false, + int sve_vector_size_in_bytes = 0); + void pop_CPU_state(bool restore_vectors = false, bool use_sve = false, + int sve_vector_size_in_bytes = 0); // Round up to a power of two void round_to(Register reg, int modulus); // allocation @@ -968,10 +970,14 @@ Label* L_slow_path = NULL); Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0); void verify_sve_vector_length(); + void reinitialize_ptrue() { + sve_ptrue(ptrue, B); + } + void verify_ptrue(); // Debugging // only if +VerifyOops void verify_oop(Register reg, const char* s = "broken oop"); @@ -1317,10 +1323,11 @@ private: // Returns an address on the stack which is reachable with a ldr/str of size // Uses rscratch2 if the address is not directly reachable Address spill_address(int size, int offset, Register tmp=rscratch2); + Address sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp=rscratch2); bool merge_alignment_check(Register base, size_t size, int64_t cur_offset, int64_t prev_offset) const; // Check whether two loads/stores can be merged into ldp/stp. 
bool ldst_can_merge(Register rx, const Address &adr, size_t cur_size_in_bytes, bool is_store) const; @@ -1340,20 +1347,26 @@ } } void spill(FloatRegister Vx, SIMD_RegVariant T, int offset) { str(Vx, T, spill_address(1 << (int)T, offset)); } + void spill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) { + sve_str(Zx, sve_spill_address(vector_reg_size_in_bytes, offset)); + } void unspill(Register Rx, bool is64, int offset) { if (is64) { ldr(Rx, spill_address(8, offset)); } else { ldrw(Rx, spill_address(4, offset)); } } void unspill(FloatRegister Vx, SIMD_RegVariant T, int offset) { ldr(Vx, T, spill_address(1 << (int)T, offset)); } + void unspill_sve_vector(FloatRegister Zx, int offset, int vector_reg_size_in_bytes) { + sve_ldr(Zx, sve_spill_address(vector_reg_size_in_bytes, offset)); + } void spill_copy128(int src_offset, int dst_offset, Register tmp1=rscratch1, Register tmp2=rscratch2) { if (src_offset < 512 && (src_offset & 7) == 0 && dst_offset < 512 && (dst_offset & 7) == 0) { ldp(tmp1, tmp2, Address(sp, src_offset)); @@ -1363,11 +1376,19 @@ spill(tmp1, true, dst_offset); unspill(tmp1, true, src_offset+8); spill(tmp1, true, dst_offset+8); } } - + void spill_copy_sve_vector_stack_to_stack(int src_offset, int dst_offset, + int sve_vec_reg_size_in_bytes) { + assert(sve_vec_reg_size_in_bytes % 16 == 0, "unexpected sve vector reg size"); + for (int i = 0; i < sve_vec_reg_size_in_bytes / 16; i++) { + spill_copy128(src_offset, dst_offset); + src_offset += 16; + dst_offset += 16; + } + } void cache_wb(Address line); void cache_wbsync(bool is_pre); }; #ifdef ASSERT diff a/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp b/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp --- a/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/register_definitions_aarch64.cpp @@ -1,7 +1,7 @@ /* - * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2002, 2020, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as @@ -202,5 +202,7 @@ REGISTER_DEFINITION(PRegister, p11); REGISTER_DEFINITION(PRegister, p12); REGISTER_DEFINITION(PRegister, p13); REGISTER_DEFINITION(PRegister, p14); REGISTER_DEFINITION(PRegister, p15); + +REGISTER_DEFINITION(PRegister, ptrue); diff a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp --- a/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp @@ -113,15 +113,32 @@ reg_save_size = return_off + RegisterImpl::max_slots_per_register}; }; OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) { + bool use_sve = false; + int sve_vector_size_in_bytes = 0; + int sve_vector_size_in_slots = 0; + +#ifdef COMPILER2 + use_sve = Matcher::supports_scalable_vector(); + sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE); + sve_vector_size_in_slots = Matcher::scalable_vector_reg_size(T_FLOAT); +#endif + #if COMPILER2_OR_JVMCI if (save_vectors) { + int vect_words = 0; + int extra_save_slots_per_register = 0; // Save upper half of vector registers - int vect_words = FloatRegisterImpl::number_of_registers * FloatRegisterImpl::extra_save_slots_per_neon_register / - VMRegImpl::slots_per_word; + if (use_sve) { + extra_save_slots_per_register = sve_vector_size_in_slots - FloatRegisterImpl::save_slots_per_register; + } else { + extra_save_slots_per_register = FloatRegisterImpl::extra_save_slots_per_neon_register; + } + vect_words = FloatRegisterImpl::number_of_registers * extra_save_slots_per_register / + VMRegImpl::slots_per_word; additional_frame_words += vect_words; } #else assert(!save_vectors, "vectors are generated only by C2 and JVMCI"); #endif @@ -136,11 +153,11 @@ int frame_size_in_words = frame_size_in_bytes / wordSize; *total_frame_words = frame_size_in_words; // Save Integer and Float registers. __ enter(); - __ push_CPU_state(save_vectors); + __ push_CPU_state(save_vectors, use_sve, sve_vector_size_in_bytes); // Set an oopmap for the call site. This oopmap will map all // oop-registers and debug-info registers as callee-saved. This // will allow deoptimization at this safepoint to find all possible // debug-info recordings, as well as let GC find all oops. @@ -160,24 +177,32 @@ } } for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) { FloatRegister r = as_FloatRegister(i); - int sp_offset = save_vectors ? (FloatRegisterImpl::max_slots_per_register * i) : - (FloatRegisterImpl::save_slots_per_register * i); + int sp_offset = 0; + if (save_vectors) { + sp_offset = use_sve ? 
(sve_vector_size_in_slots * i) : + (FloatRegisterImpl::slots_per_neon_register * i); + } else { + sp_offset = FloatRegisterImpl::save_slots_per_register * i; + } oop_map->set_callee_saved(VMRegImpl::stack2reg(sp_offset), r->as_VMReg()); } return oop_map; } void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) { #if !COMPILER2_OR_JVMCI assert(!restore_vectors, "vectors are generated only by C2 and JVMCI"); -#endif __ pop_CPU_state(restore_vectors); +#else + __ pop_CPU_state(restore_vectors, Matcher::supports_scalable_vector(), + Matcher::scalable_vector_reg_size(T_BYTE)); +#endif __ leave(); } void RegisterSaver::restore_result_registers(MacroAssembler* masm) { @@ -2777,10 +2802,16 @@ __ reset_last_Java_frame(false); __ maybe_isb(); __ membar(Assembler::LoadLoad | Assembler::LoadStore); + if (UseSVE > 0 && save_vectors) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); __ cbz(rscratch1, noException); // Exception pending diff a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -486,10 +486,15 @@ __ mov(r19, lr); BLOCK_COMMENT("call exception_handler_for_return_address"); __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1); + if (UseSVE > 0 ) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } // we should not really care that lr is no longer the callee // address. we saved the value the handler needs in r19 so we can // just copy it to r3. however, the C2 handler will push its own // frame and then calls into the VM and the VM code asserts that // the PC for the frame above the handler belongs to a compiled @@ -5016,10 +5021,16 @@ oop_maps->add_gc_map(the_pc - start, map); __ reset_last_Java_frame(true); __ maybe_isb(); + if (UseSVE > 0) { + // Reinitialize the ptrue predicate register, in case the external runtime + // call clobbers ptrue reg, as we may return to SVE compiled code. + __ reinitialize_ptrue(); + } + __ leave(); // check for pending exceptions #ifdef ASSERT Label L; diff a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -92,12 +92,15 @@ } //------------------------------transform_loop--------------------------- void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { assert(UseSuperWord, "should be"); - // Do vectors exist on this architecture? - if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return; + // SuperWord only works with power of two vector sizes. 
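+  // (An SVE implementation may report a non-power-of-two width here, e.g. a
+  //  384-bit vector length gives vector_width == 48, so bail out early.)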
+ int vector_width = Matcher::vector_width_in_bytes(T_BYTE); + if (vector_width < 2 || !is_power_of_2(vector_width)) { + return; + } assert(lpt->_head->is_CountedLoop(), "must be"); CountedLoopNode *cl = lpt->_head->as_CountedLoop(); if (!cl->is_valid_counted_loop()) return; // skip malformed counted loop diff a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -816,11 +816,11 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) { if (is_java_primitive(bt) && (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(bt, vlen)) { int vopc = ReductionNode::opcode(opc, bt); - return vopc != opc && Matcher::match_rule_supported(vopc); + return vopc != opc && Matcher::match_rule_supported_vector(vopc, vlen, bt); } return false; } MacroLogicVNode* MacroLogicVNode::make(PhaseGVN& gvn, Node* in1, Node* in2, Node* in3,