--- old/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp 2020-04-02 18:03:44.944854099 -0700 +++ new/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp 2020-04-02 18:03:44.784854099 -0700 @@ -28,6 +28,8 @@ // C2_MacroAssembler contains high-level macros for C2 public: + Assembler::AvxVectorLen vector_length_encoding(int vlen_in_bytes); + // special instructions for EVEX void setvectmask(Register dst, Register src); void restorevectmask(); @@ -71,18 +73,61 @@ void vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr); void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr); void vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr); + + void pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, + XMMRegister tmp = xnoreg); + void vpminmax(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister src1, XMMRegister src2, + int vlen_enc); + + void vminmax_fp(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister a, XMMRegister b, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, + int vlen_enc); + void evminmax_fp(int opcode, BasicType elem_bt, + XMMRegister dst, XMMRegister a, XMMRegister b, + KRegister ktmp, XMMRegister atmp, XMMRegister btmp, + int vlen_enc); + void vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len); void vextendbw(bool sign, XMMRegister dst, XMMRegister src); - void vshiftd(int opcode, XMMRegister dst, XMMRegister src); - void vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); - void vshiftw(int opcode, XMMRegister dst, XMMRegister src); - void vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); - void vshiftq(int opcode, XMMRegister dst, XMMRegister src); - void vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len); + void vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len); - // Reductions for vectors of ints, longs, floats, and doubles. + void vshiftd(int opcode, XMMRegister dst, XMMRegister shift); + void vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void vshiftw(int opcode, XMMRegister dst, XMMRegister shift); + void vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void vshiftq(int opcode, XMMRegister dst, XMMRegister shift); + void vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc); + void varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister vtmp = xnoreg); + void varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch); + void evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch); + + void insert(BasicType typ, XMMRegister dst, Register val, int idx); + void vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx); + void vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len); + void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len); + void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len); + + // extract + void extract(BasicType typ, Register dst, XMMRegister src, int idx); + XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex); + void get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex); + void get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp = noreg, XMMRegister vtmp = xnoreg); + + // blend + void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1); + void evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); - // dst = src1 + reduce(op, src2) using vtmp as temps + void load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt); + void load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes); + + // Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles. + + // dst = src1 reduce(op, src2) using vtmp as temps void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); #ifdef _LP64 void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); @@ -92,32 +137,62 @@ void reduce_fp(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg); + void reduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduceS(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, + XMMRegister dst, XMMRegister src, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg); + void reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, + XMMRegister dst, XMMRegister src, + XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, XMMRegister xmm_0, XMMRegister xmm_1 = xnoreg); private: void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); + // Int Reduction void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce4I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce8I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + // Byte Reduction + void reduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduce8B (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + + // Short Reduction + void reduce4S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce8S (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + void reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); + + // Long Reduction #ifdef _LP64 void reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); void reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2); #endif // _LP64 + // Float Reduction void reduce2F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp); void reduce4F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp); void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); + // Double Reduction void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp); void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2); - void reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src); - void reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2); + // Base reduction instruction + void reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src); + void reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2); public: