  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow %d", (__ offset() - offset));
  __ end_a_stub();
  return offset;
}


//=============================================================================

// Float masks come from different places depending on platform.
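// The sign masks are ANDed in to clear the sign bit (abs), the sign flips are
// XORed in to flip it (negate).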
#ifdef _LP64
static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
static address vector_float_signmask() { return StubRoutines::x86::vector_float_sign_mask(); }
static address vector_float_signflip() { return StubRoutines::x86::vector_float_sign_flip(); }
static address vector_double_signmask() { return StubRoutines::x86::vector_double_sign_mask(); }
static address vector_double_signflip() { return StubRoutines::x86::vector_double_sign_flip(); }
static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
static address vector_iota_indices() { return StubRoutines::x86::vector_iota_indices(); }
static address vector_byte_bitset() { return StubRoutines::x86::vector_byte_bitset(); }
static address vector_long_perm_mask() { return StubRoutines::x86::vector_long_perm_mask(); }
static address vector_short_to_byte_mask() { return StubRoutines::x86::vector_short_to_byte_mask(); }
static address vector_byte_perm_mask() { return StubRoutines::x86::vector_byte_perm_mask(); }
static address vector_int_to_byte_mask() { return StubRoutines::x86::vector_int_to_byte_mask(); }
static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
static address vector_32_bit_mask() { return StubRoutines::x86::vector_32_bit_mask(); }
static address vector_64_bit_mask() { return StubRoutines::x86::vector_64_bit_mask(); }
static address vector_all_ones_mask() { return StubRoutines::x86::vector_all_ones_mask(); }
static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
static address vector_int_sizemask() { return StubRoutines::x86::vector_int_size_mask(); }
static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
static address vector_short_sizemask() { return StubRoutines::x86::vector_short_size_mask(); }
static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
static address vector_long_sizemask() { return StubRoutines::x86::vector_long_size_mask(); }
#else
static address float_signmask()  { return (address)float_signmask_pool; }
static address float_signflip()  { return (address)float_signflip_pool; }
static address double_signmask() { return (address)double_signmask_pool; }
static address double_signflip() { return (address)double_signflip_pool; }
  while(bit_width < 32) {
    val |= (val << bit_width);
    bit_width <<= 1;
  }
  return val;
}

static inline jlong replicate8_imm(int con, int width) {
  // Load a constant of "width" (in bytes) and replicate it to fill 64bit.
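  // Illustrative example: replicate8_imm(0x4, 2) yields 0x0004000400040004.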
  assert(width == 1 || width == 2 || width == 4, "only byte, short or int types here");
  int bit_width = width * 8;
  jlong val = con;
  val &= (((jlong) 1) << bit_width) - 1;  // mask off sign bits
  while(bit_width < 64) {
    val |= (val << bit_width);
    bit_width <<= 1;
  }
  return val;
}

#ifndef PRODUCT
void MachNopNode::format(PhaseRegAlloc*, outputStream* st) const {
  st->print("nop \t# %d bytes pad for loops and calls", _count);
}
#endif

void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc*) const {
  MacroAssembler _masm(&cbuf);
  __ nop(_count);
}

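// The node encodes nothing but padding, so its size is exactly _count bytes.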
uint MachNopNode::size(PhaseRegAlloc*) const {
  return _count;
}

#ifndef PRODUCT
void MachBreakpointNode::format(PhaseRegAlloc*, outputStream* st) const {
  st->print("# breakpoint");
}
#endif
    __ vpabsd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpackusdw($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
    __ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
  %}
  ins_pipe( pipe_slow );
%}

instruct storemask8l(vecD dst, vecZ src, rRegL scratch) %{
  predicate(UseAVX > 2 && VM_Version::supports_avx512bw() && n->as_Vector()->length() == 8 &&
            static_cast<const VectorStoreMaskNode*>(n)->GetInputMaskSize() == 8);
  match(Set dst (VectorStoreMask src));
  effect(TEMP scratch);
  format %{ "vpcmpeqq k2,$src,0xFFFFFFFF\n\t"
            "vmovdqub $dst,k2,0x01010101\t! store mask (8L to 8B)" %}
  ins_encode %{
    int vector_len = 2;
    KRegister ktmp = k2; // Use a hardcoded temp due to no k register allocation.
    Assembler::ComparisonPredicate cp = Assembler::eq;
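    // Compare each 64-bit lane of src against the all-ones pattern; lanes that are
    // fully set (true mask lanes) set the corresponding bit in ktmp.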
    __ evpcmpq(ktmp, k0, $src$$XMMRegister, ExternalAddress(vector_all_bits_set()), cp, vector_len, $scratch$$Register);
    // The dst is only 128-bit - thus we can do a smaller move.
    __ evmovdqub($dst$$XMMRegister, ktmp, ExternalAddress(vector_byte_bitset()), false, 0, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

//-------------------------------- LOAD_IOTA_INDICES ----------------------------------
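// VectorLoadConst materializes the "iota" index vector (0, 1, 2, ...); the
// vector_iota_indices() stub routine is assumed to provide that constant as a byte table.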

instruct loadcon4b(vecS dst, immI0 src, rRegI scratch) %{
  predicate(UseSSE > 1 && n->as_Vector()->length() == 4 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadConst src));
  effect(TEMP scratch);
  format %{ "movdqu $dst, CONSTANT_MEMORY\t! load iota indices" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_iota_indices()), $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadcon8b(vecD dst, immI0 src, rRegI scratch) %{
  predicate(UseSSE > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadConst src));
  effect(TEMP scratch);
  format %{ "movdqu $dst, CONSTANT_MEMORY\t! load iota indices" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, ExternalAddress(vector_iota_indices()), $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadcon16b(vecX dst, immI0 src, rRegI scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadConst src));
  effect(TEMP scratch);
  format %{ "vmovdqu $dst, CONSTANT_MEMORY\t! load iota indices" %}
  ins_encode %{
    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_iota_indices()), $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadcon32b(vecY dst, immI0 src, rRegI scratch) %{
  predicate(UseAVX > 0 && n->as_Vector()->length() == 32 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadConst src));
  effect(TEMP scratch);
  format %{ "vmovdqu $dst, CONSTANT_MEMORY\t! load iota indices" %}
  ins_encode %{
    __ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_iota_indices()), $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadcon64b(vecZ dst, immI0 src, rRegL scratch) %{
  predicate(UseAVX > 2 && n->as_Vector()->length() == 64 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadConst src));
  effect(TEMP scratch);
  format %{ "vmovdqub $dst,k0, CONSTANT_MEMORY\t! load iota indices" %}
  ins_encode %{
    int vector_len = 2;
    __ evmovdqub($dst$$XMMRegister, k0, ExternalAddress(vector_iota_indices()), false, vector_len, $scratch$$Register);
  %}
  ins_pipe( pipe_slow );
%}

//-------------------------------- LOAD_SHUFFLE ----------------------------------
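// VectorLoadShuffle converts a vector of indices into the shuffle operand consumed by
// the rearrange instructions; for byte vectors the indices are already in the required
// byte form, so the load degenerates into a plain register-to-register copy.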

instruct loadshuffle8b(vecD dst, vecD src) %{
  predicate(UseSSE > 1 && n->as_Vector()->length() == 8 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "movdqu $dst, $src\t! load shuffle (load 8B for 8BRearrange)" %}
  ins_encode %{
    __ movdqu($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadshuffle16b(vecX dst, vecX src) %{
  predicate(UseSSE > 1 && n->as_Vector()->length() == 16 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
  match(Set dst (VectorLoadShuffle src));
  format %{ "movdqu $dst, $src\t! load shuffle (load 16B for 16BRearrange)" %}