src/cpu/x86/vm/assembler_x86.hpp
rev 8344 : 8076276: Add support for AVX512
Reviewed-by: kvn, roland
Contributed-by: michael.c.berg@intel.com


 421 
 422 // x86 can do array addressing as a single operation since disp can be an absolute
 423 // address; amd64 can't. We create a class that expresses the concept but does extra
 424 // magic on amd64 to get the final result.
 425 
 426 class ArrayAddress VALUE_OBJ_CLASS_SPEC {
 427   private:
 428 
 429   AddressLiteral _base;
 430   Address        _index;
 431 
 432   public:
 433 
 434   ArrayAddress() {};
 435   ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {};
 436   AddressLiteral base() { return _base; }
 437   Address index() { return _index; }
 438 
 439 };
 440 
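As a rough illustration of the concept above: an ArrayAddress pairs an absolute base (AddressLiteral) with a register-scaled index (Address). On x86_32 the two fold into one addressing mode; on amd64 the macro assembler first materializes the base into a scratch register. A minimal sketch, assuming the usual HotSpot includes, a MacroAssembler* named masm, and that a movptr(Register, ArrayAddress) overload is available from MacroAssembler; the helper name below is illustrative only:

  // Illustrative only: load table[idx] through an ArrayAddress.
  static void load_table_entry(MacroAssembler* masm, Register dst,
                               Register idx, address table) {
    ArrayAddress entry(ExternalAddress(table),
                       Address(noreg, idx, Address::times_ptr));
    masm->movptr(dst, entry);  // one mov on x86_32; lea of the base + mov on amd64
  }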
 441 const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512 / wordSize);
 442 
 443 // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
 444 // level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
 445 // is what you get. The Assembler is generating code into a CodeBuffer.
 446 
 447 class Assembler : public AbstractAssembler  {
 448   friend class AbstractAssembler; // for the non-virtual hack
 449   friend class LIR_Assembler; // as_Address()
 450   friend class StubGenerator;
 451 
 452  public:
 453   enum Condition {                     // The x86 condition codes used for conditional jumps/moves.
 454     zero          = 0x4,
 455     notZero       = 0x5,
 456     equal         = 0x4,
 457     notEqual      = 0x5,
 458     less          = 0xc,
 459     lessEqual     = 0xe,
 460     greater       = 0xf,
 461     greaterEqual  = 0xd,


 486 
 487     REX_B      = 0x41,
 488     REX_X      = 0x42,
 489     REX_XB     = 0x43,
 490     REX_R      = 0x44,
 491     REX_RB     = 0x45,
 492     REX_RX     = 0x46,
 493     REX_RXB    = 0x47,
 494 
 495     REX_W      = 0x48,
 496 
 497     REX_WB     = 0x49,
 498     REX_WX     = 0x4A,
 499     REX_WXB    = 0x4B,
 500     REX_WR     = 0x4C,
 501     REX_WRB    = 0x4D,
 502     REX_WRX    = 0x4E,
 503     REX_WRXB   = 0x4F,
 504 
 505     VEX_3bytes = 0xC4,
 506     VEX_2bytes = 0xC5

 507   };
 508 
 509   enum VexPrefix {
 510     VEX_B = 0x20,
 511     VEX_X = 0x40,
 512     VEX_R = 0x80,
 513     VEX_W = 0x80
 514   };
 515 
 516   enum VexSimdPrefix {
 517     VEX_SIMD_NONE = 0x0,
 518     VEX_SIMD_66   = 0x1,
 519     VEX_SIMD_F3   = 0x2,
 520     VEX_SIMD_F2   = 0x3
 521   };
 522 
 523   enum VexOpcode {
 524     VEX_OPCODE_NONE  = 0x0,
 525     VEX_OPCODE_0F    = 0x1,
 526     VEX_OPCODE_0F_38 = 0x2,
 527     VEX_OPCODE_0F_3A = 0x3
 528   };
 529 
 530   enum WhichOperand {
 531     // input to locate_operand, and format code for relocations
 532     imm_operand  = 0,            // embedded 32-bit|64-bit immediate operand
 533     disp32_operand = 1,          // embedded 32-bit displacement or address
 534     call32_operand = 2,          // embedded 32-bit self-relative displacement
 535 #ifndef _LP64
 536     _WhichOperand_limit = 3
 537 #else
 538      narrow_oop_operand = 3,     // embedded 32-bit immediate narrow oop
 539     _WhichOperand_limit = 4
 540 #endif
 541   };
 542 
 543 
 544 
 545   // NOTE: The general philosophy of the declarations here is that 64bit versions
 546   // of instructions are freely declared without the need for wrapping them in an ifdef.
 547   // (Some dangerous instructions are ifdef'd out of inappropriate jvms.)
 548   // In the .cpp file the implementations are wrapped so that they are dropped out
 549   // of the resulting jvm. This is done mostly to keep the footprint of MINIMAL
 550   // to the size it was prior to merging up the 32bit and 64bit assemblers.
 551   //
 552   // This does mean you'll get a linker/runtime error if you use a 64bit only instruction
 553   // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.
 554 
 555 private:
 556 





 557 
 558   // 64bit prefixes
 559   int prefix_and_encode(int reg_enc, bool byteinst = false);
 560   int prefixq_and_encode(int reg_enc);
 561 
 562   int prefix_and_encode(int dst_enc, int src_enc, bool byteinst = false);
 563   int prefixq_and_encode(int dst_enc, int src_enc);
 564 
 565   void prefix(Register reg);
 566   void prefix(Address adr);
 567   void prefixq(Address adr);
 568 
 569   void prefix(Address adr, Register reg,  bool byteinst = false);
 570   void prefix(Address adr, XMMRegister reg);
 571   void prefixq(Address adr, Register reg);
 572   void prefixq(Address adr, XMMRegister reg);
 573 
 574   void prefetch_prefix(Address src);
 575 
 576   void rex_prefix(Address adr, XMMRegister xreg,
 577                   VexSimdPrefix pre, VexOpcode opc, bool rex_w);
 578   int  rex_prefix_and_encode(int dst_enc, int src_enc,
 579                              VexSimdPrefix pre, VexOpcode opc, bool rex_w);
 580 
 581   void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
 582                   int nds_enc, VexSimdPrefix pre, VexOpcode opc,
 583                   bool vector256);





 584 
 585   void vex_prefix(Address adr, int nds_enc, int xreg_enc,
 586                   VexSimdPrefix pre, VexOpcode opc,
 587                   bool vex_w, bool vector256);

 588 
 589   void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
 590                   VexSimdPrefix pre, bool vector256 = false) {
 591     int dst_enc = dst->encoding();
 592     int nds_enc = nds->is_valid() ? nds->encoding() : 0;
 593     vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
 594   }
 595 
 596   void vex_prefix_0F38(Register dst, Register nds, Address src) {
 597     bool vex_w = false;
 598     bool vector256 = false;
 599     vex_prefix(src, nds->encoding(), dst->encoding(),
 600                VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);

 601   }
 602 
 603   void vex_prefix_0F38_q(Register dst, Register nds, Address src) {
 604     bool vex_w = true;
 605     bool vector256 = false;
 606     vex_prefix(src, nds->encoding(), dst->encoding(),
 607                VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);

 608   }
 609   int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
 610                              VexSimdPrefix pre, VexOpcode opc,
 611                              bool vex_w, bool vector256);

 612 
 613   int  vex_prefix_0F38_and_encode(Register dst, Register nds, Register src) {
 614     bool vex_w = false;
 615     bool vector256 = false;
 616     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
 617                                  VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);

 618   }
 619   int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src) {
 620     bool vex_w = true;
 621     bool vector256 = false;
 622     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
 623                                  VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);

 624   }
 625   int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
 626                              VexSimdPrefix pre, bool vector256 = false,
 627                              VexOpcode opc = VEX_OPCODE_0F) {

 628     int src_enc = src->encoding();
 629     int dst_enc = dst->encoding();
 630     int nds_enc = nds->is_valid() ? nds->encoding() : 0;
 631     return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector256);
 632   }
 633 
 634   void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
 635                    VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
 636                    bool rex_w = false, bool vector256 = false);
 637 
 638   void simd_prefix(XMMRegister dst, Address src,
 639                    VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
 640     simd_prefix(dst, xnoreg, src, pre, opc);
 641   }
 642 
 643   void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {
 644     simd_prefix(src, dst, pre);
 645   }
 646   void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
 647                      VexSimdPrefix pre) {
 648     bool rex_w = true;
 649     simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);
 650   }
 651 
 652   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
 653                              VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
 654                              bool rex_w = false, bool vector256 = false);
 655 
 656   // Move/convert 32-bit integer value.
 657   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
 658                              VexSimdPrefix pre) {
 659     // It is OK to cast from Register to XMMRegister to pass argument here
 660     // since only encoding is used in simd_prefix_and_encode() and number of
 661     // Gen and Xmm registers are the same.
 662     return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre);
 663   }
 664   int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) {
 665     return simd_prefix_and_encode(dst, xnoreg, src, pre);
 666   }
 667   int simd_prefix_and_encode(Register dst, XMMRegister src,
 668                              VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
 669     return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc);

 670   }
 671 
 672   // Move/convert 64-bit integer value.
 673   int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
 674                                VexSimdPrefix pre) {
 675     bool rex_w = true;
 676     return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w);
 677   }
 678   int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) {
 679     return simd_prefix_and_encode_q(dst, xnoreg, src, pre);
 680   }
 681   int simd_prefix_and_encode_q(Register dst, XMMRegister src,
 682                              VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {

 683     bool rex_w = true;
 684     return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w);
 685   }
 686 
 687   // Helper functions for groups of instructions
 688   void emit_arith_b(int op1, int op2, Register dst, int imm8);
 689 
 690   void emit_arith(int op1, int op2, Register dst, int32_t imm32);
 691   // Force generation of a 4 byte immediate value even if it fits into 8bit
 692   void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
 693   void emit_arith(int op1, int op2, Register dst, Register src);
 694 
 695   void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
 696   void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);
 697   void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre);
 698   void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre);




 699   void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
 700                       Address src, VexSimdPrefix pre, bool vector256);




 701   void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
 702                       XMMRegister src, VexSimdPrefix pre, bool vector256);






 703 
 704   void emit_operand(Register reg,
 705                     Register base, Register index, Address::ScaleFactor scale,
 706                     int disp,
 707                     RelocationHolder const& rspec,
 708                     int rip_relative_correction = 0);
 709 
 710   void emit_operand(Register reg, Address adr, int rip_relative_correction = 0);
 711 
 712   // operands that only take the original 32bit registers
 713   void emit_operand32(Register reg, Address adr);
 714 
 715   void emit_operand(XMMRegister reg,
 716                     Register base, Register index, Address::ScaleFactor scale,
 717                     int disp,
 718                     RelocationHolder const& rspec);
 719 
 720   void emit_operand(XMMRegister reg, Address adr);
 721 
 722   void emit_operand(MMXRegister reg, Address adr);


 808   // Move Scalar Double-Precision Floating-Point Values
 809   void movsd(XMMRegister dst, Address src);
 810   void movsd(XMMRegister dst, XMMRegister src);
 811   void movsd(Address dst, XMMRegister src);
 812   void movlpd(XMMRegister dst, Address src);
 813 
 814   // New cpus require use of movaps and movapd to avoid partial register stall
 815   // when moving between registers.
 816   void movaps(XMMRegister dst, XMMRegister src);
 817   void movapd(XMMRegister dst, XMMRegister src);
 818 
 819   // End avoid using directly
 820 
 821 
 822   // Instruction prefixes
 823   void prefix(Prefix p);
 824 
 825   public:
 826 
 827   // Creation
 828   Assembler(CodeBuffer* code) : AbstractAssembler(code) {}


 829 
 830   // Decoding
 831   static address locate_operand(address inst, WhichOperand which);
 832   static address locate_next_instruction(address inst);
 833 
 834   // Utilities
 835   static bool is_polling_page_far() NOT_LP64({ return false;});


 836 
 837   // Generic instructions
 838   // Does 32bit or 64bit as needed for the platform. In some sense these
 839   // belong in macro assembler but there is no need for both varieties to exist
 840 
 841   void lea(Register dst, Address src);
 842 
 843   void mov(Register dst, Register src);
 844 
 845   void pusha();
 846   void popa();
 847 
 848   void pushf();
 849   void popf();
 850 
 851   void push(int32_t imm32);
 852 
 853   void push(Register src);
 854 
 855   void pop(Register dst);
 856 
 857   // These are dummies to prevent surprise implicit conversions to Register
 858   void push(void* v);
 859   void pop(void* v);
 860 


1321         if (offset < -128) {
1322           offset = -128;
1323         }
1324 
1325         lock();
1326         addl(Address(rsp, offset), 0);// Assert the lock# signal here
1327       }
1328     }
1329   }
1330 
1331   void mfence();
1332 
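The fragment above keeps the usual HotSpot choice between two StoreLoad barriers: a locked add of zero to a stack slot (asserting the lock# signal) or an explicit mfence. A minimal sketch of the two forms, assuming an Assembler* named masm; the zero rsp offset is for brevity, whereas the code above biases the offset to stay inside the current frame:

  // Illustrative StoreLoad fence, mirroring the idiom shown above.
  static void storeload_fence(Assembler* masm, bool use_mfence) {
    if (use_mfence) {
      masm->mfence();                    // SSE2 full fence
    } else {
      masm->lock();
      masm->addl(Address(rsp, 0), 0);    // locked no-op acts as a full memory barrier
    }
  }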
1333   // Moves
1334 
1335   void mov64(Register dst, int64_t imm64);
1336 
1337   void movb(Address dst, Register src);
1338   void movb(Address dst, int imm8);
1339   void movb(Register dst, Address src);
1340 






1341   void movdl(XMMRegister dst, Register src);
1342   void movdl(Register dst, XMMRegister src);
1343   void movdl(XMMRegister dst, Address src);
1344   void movdl(Address dst, XMMRegister src);
1345 
1346   // Move Double Quadword
1347   void movdq(XMMRegister dst, Register src);
1348   void movdq(Register dst, XMMRegister src);
1349 
1350   // Move Aligned Double Quadword
1351   void movdqa(XMMRegister dst, XMMRegister src);
1352   void movdqa(XMMRegister dst, Address src);
1353 
1354   // Move Unaligned Double Quadword
1355   void movdqu(Address     dst, XMMRegister src);
1356   void movdqu(XMMRegister dst, Address src);
1357   void movdqu(XMMRegister dst, XMMRegister src);
1358 
1359   // Move Unaligned 256bit Vector
1360   void vmovdqu(Address dst, XMMRegister src);
1361   void vmovdqu(XMMRegister dst, Address src);
1362   void vmovdqu(XMMRegister dst, XMMRegister src);
1363 





1364   // Move lower 64bit to high 64bit in 128bit register
1365   void movlhps(XMMRegister dst, XMMRegister src);
1366 
1367   void movl(Register dst, int32_t imm32);
1368   void movl(Address dst, int32_t imm32);
1369   void movl(Register dst, Register src);
1370   void movl(Register dst, Address src);
1371   void movl(Address dst, Register src);
1372 
1373   // These dummies prevent using movl from converting a zero (like NULL) into Register
1374   // by giving the compiler two choices it can't resolve
1375 
1376   void movl(Address  dst, void* junk);
1377   void movl(Register dst, void* junk);
1378 
1379 #ifdef _LP64
1380   void movq(Register dst, Register src);
1381   void movq(Register dst, Address src);
1382   void movq(Address  dst, Register src);
1383 #endif


1469 
1470   void notl(Register dst);
1471 
1472 #ifdef _LP64
1473   void notq(Register dst);
1474 #endif
1475 
1476   void orl(Address dst, int32_t imm32);
1477   void orl(Register dst, int32_t imm32);
1478   void orl(Register dst, Address src);
1479   void orl(Register dst, Register src);
1480 
1481   void orq(Address dst, int32_t imm32);
1482   void orq(Register dst, int32_t imm32);
1483   void orq(Register dst, Address src);
1484   void orq(Register dst, Register src);
1485 
1486   // Pack with unsigned saturation
1487   void packuswb(XMMRegister dst, XMMRegister src);
1488   void packuswb(XMMRegister dst, Address src);
1489   void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1490 
1491   // Permutation of 64bit words
1492   void vpermq(XMMRegister dst, XMMRegister src, int imm8, bool vector256);
1493 
1494   void pause();
1495 
1496   // SSE4.2 string instructions
1497   void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
1498   void pcmpestri(XMMRegister xmm1, Address src, int imm8);
1499 
1500   // SSE 4.1 extract
1501   void pextrd(Register dst, XMMRegister src, int imm8);
1502   void pextrq(Register dst, XMMRegister src, int imm8);
1503 
1504   // SSE 4.1 insert
1505   void pinsrd(XMMRegister dst, Register src, int imm8);
1506   void pinsrq(XMMRegister dst, Register src, int imm8);
1507 
1508   // SSE4.1 packed move
1509   void pmovzxbw(XMMRegister dst, XMMRegister src);
1510   void pmovzxbw(XMMRegister dst, Address src);
1511 
1512 #ifndef _LP64 // no 32bit push/pop on amd64


1717   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1718   void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
1719   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1720   void vdivss(XMMRegister dst, XMMRegister nds, Address src);
1721   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1722   void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
1723   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1724   void vmulss(XMMRegister dst, XMMRegister nds, Address src);
1725   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1726   void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
1727   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1728   void vsubss(XMMRegister dst, XMMRegister nds, Address src);
1729   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1730 
1731 
1732   //====================VECTOR ARITHMETIC=====================================
1733 
1734   // Add Packed Floating-Point Values
1735   void addpd(XMMRegister dst, XMMRegister src);
1736   void addps(XMMRegister dst, XMMRegister src);
1737   void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1738   void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1739   void vaddpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1740   void vaddps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1741 
1742   // Subtract Packed Floating-Point Values
1743   void subpd(XMMRegister dst, XMMRegister src);
1744   void subps(XMMRegister dst, XMMRegister src);
1745   void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1746   void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1747   void vsubpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1748   void vsubps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1749 
1750   // Multiply Packed Floating-Point Values
1751   void mulpd(XMMRegister dst, XMMRegister src);
1752   void mulps(XMMRegister dst, XMMRegister src);
1753   void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1754   void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1755   void vmulpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1756   void vmulps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1757 
1758   // Divide Packed Floating-Point Values
1759   void divpd(XMMRegister dst, XMMRegister src);
1760   void divps(XMMRegister dst, XMMRegister src);
1761   void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1762   void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1763   void vdivpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1764   void vdivps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1765 
1766   // Bitwise Logical AND of Packed Floating-Point Values
1767   void andpd(XMMRegister dst, XMMRegister src);
1768   void andps(XMMRegister dst, XMMRegister src);
1769   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1770   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1771   void vandpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1772   void vandps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1773 
1774   // Bitwise Logical XOR of Packed Floating-Point Values
1775   void xorpd(XMMRegister dst, XMMRegister src);
1776   void xorps(XMMRegister dst, XMMRegister src);
1777   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1778   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1779   void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1780   void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1781 
1782   // Add horizontal packed integers
1783   void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1784   void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1785   void phaddw(XMMRegister dst, XMMRegister src);
1786   void phaddd(XMMRegister dst, XMMRegister src);
1787 
1788   // Add packed integers
1789   void paddb(XMMRegister dst, XMMRegister src);
1790   void paddw(XMMRegister dst, XMMRegister src);
1791   void paddd(XMMRegister dst, XMMRegister src);
1792   void paddq(XMMRegister dst, XMMRegister src);
1793   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1794   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1795   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1796   void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1797   void vpaddb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1798   void vpaddw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1799   void vpaddd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1800   void vpaddq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1801 
1802   // Sub packed integers
1803   void psubb(XMMRegister dst, XMMRegister src);
1804   void psubw(XMMRegister dst, XMMRegister src);
1805   void psubd(XMMRegister dst, XMMRegister src);
1806   void psubq(XMMRegister dst, XMMRegister src);
1807   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1808   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1809   void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1810   void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1811   void vpsubb(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1812   void vpsubw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1813   void vpsubd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1814   void vpsubq(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1815 
1816   // Multiply packed integers (only shorts and ints)
1817   void pmullw(XMMRegister dst, XMMRegister src);
1818   void pmulld(XMMRegister dst, XMMRegister src);
1819   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1820   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1821   void vpmullw(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1822   void vpmulld(XMMRegister dst, XMMRegister nds, Address src, bool vector256);


1823 
1824   // Shift left packed integers
1825   void psllw(XMMRegister dst, int shift);
1826   void pslld(XMMRegister dst, int shift);
1827   void psllq(XMMRegister dst, int shift);
1828   void psllw(XMMRegister dst, XMMRegister shift);
1829   void pslld(XMMRegister dst, XMMRegister shift);
1830   void psllq(XMMRegister dst, XMMRegister shift);
1831   void vpsllw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1832   void vpslld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1833   void vpsllq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1834   void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1835   void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1836   void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1837 
1838   // Logical shift right packed integers
1839   void psrlw(XMMRegister dst, int shift);
1840   void psrld(XMMRegister dst, int shift);
1841   void psrlq(XMMRegister dst, int shift);
1842   void psrlw(XMMRegister dst, XMMRegister shift);
1843   void psrld(XMMRegister dst, XMMRegister shift);
1844   void psrlq(XMMRegister dst, XMMRegister shift);
1845   void vpsrlw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1846   void vpsrld(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1847   void vpsrlq(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1848   void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1849   void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1850   void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1851 
1852   // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
1853   void psraw(XMMRegister dst, int shift);
1854   void psrad(XMMRegister dst, int shift);
1855   void psraw(XMMRegister dst, XMMRegister shift);
1856   void psrad(XMMRegister dst, XMMRegister shift);
1857   void vpsraw(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1858   void vpsrad(XMMRegister dst, XMMRegister src, int shift, bool vector256);
1859   void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1860   void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, bool vector256);
1861 
1862   // And packed integers
1863   void pand(XMMRegister dst, XMMRegister src);
1864   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1865   void vpand(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1866 
1867   // Or packed integers
1868   void por(XMMRegister dst, XMMRegister src);
1869   void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1870   void vpor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1871 
1872   // Xor packed integers
1873   void pxor(XMMRegister dst, XMMRegister src);
1874   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
1875   void vpxor(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
1876 
1877   // Copy low 128bit into high 128bit of YMM registers.
1878   void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
1879   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
1880   void vextractf128h(XMMRegister dst, XMMRegister src);

1881 
1882   // Load/store high 128bit of YMM registers which does not destroy other half.
1883   void vinsertf128h(XMMRegister dst, Address src);
1884   void vinserti128h(XMMRegister dst, Address src);
1885   void vextractf128h(Address dst, XMMRegister src);
1886   void vextracti128h(Address dst, XMMRegister src);
1887 
1888   // duplicate the 4-byte integer from src into 8 locations in dest
1889   void vpbroadcastd(XMMRegister dst, XMMRegister src);
1890 



1891   // Carry-Less Multiplication Quadword
1892   void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
1893   void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
1894 
1895   // AVX instruction which is used to clear upper 128 bits of YMM registers and
1896   // to avoid the transition penalty between AVX and SSE states. There is no
1897   // penalty if legacy SSE instructions are encoded using VEX prefix because
1898   // they always clear upper 128 bits. It should be used before calling
1899   // runtime code and native libraries.
1900   void vzeroupper();
1901 
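A minimal sketch of the intended use of vzeroupper(), assuming an Assembler* named masm and that AVX availability has been checked by the caller's usual means; the call target is passed in a register purely for illustration:

  // Illustrative only: clear upper YMM bits before leaving compiled code.
  static void call_out(Assembler* masm, Register entry, bool has_avx) {
    if (has_avx) {
      masm->vzeroupper();  // avoid the AVX<->SSE transition penalty in the callee
    }
    masm->call(entry);     // indirect call to runtime/native code
  }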
1902  protected:
1903   // The next instructions require 16-byte address alignment in SSE mode.
1904   // They should be called only from corresponding MacroAssembler instructions.
1905   void andpd(XMMRegister dst, Address src);
1906   void andps(XMMRegister dst, Address src);
1907   void xorpd(XMMRegister dst, Address src);
1908   void xorps(XMMRegister dst, Address src);
1909 
1910 };


 421 
 422 // x86 can do array addressing as a single operation since disp can be an absolute
 423 // address; amd64 can't. We create a class that expresses the concept but does extra
 424 // magic on amd64 to get the final result.
 425 
 426 class ArrayAddress VALUE_OBJ_CLASS_SPEC {
 427   private:
 428 
 429   AddressLiteral _base;
 430   Address        _index;
 431 
 432   public:
 433 
 434   ArrayAddress() {};
 435   ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {};
 436   AddressLiteral base() { return _base; }
 437   Address index() { return _index; }
 438 
 439 };
 440 
 441 const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY( 512*2 / wordSize);
 442 
 443 // The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
 444 // level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
 445 // is what you get. The Assembler is generating code into a CodeBuffer.
 446 
 447 class Assembler : public AbstractAssembler  {
 448   friend class AbstractAssembler; // for the non-virtual hack
 449   friend class LIR_Assembler; // as_Address()
 450   friend class StubGenerator;
 451 
 452  public:
 453   enum Condition {                     // The x86 condition codes used for conditional jumps/moves.
 454     zero          = 0x4,
 455     notZero       = 0x5,
 456     equal         = 0x4,
 457     notEqual      = 0x5,
 458     less          = 0xc,
 459     lessEqual     = 0xe,
 460     greater       = 0xf,
 461     greaterEqual  = 0xd,


 486 
 487     REX_B      = 0x41,
 488     REX_X      = 0x42,
 489     REX_XB     = 0x43,
 490     REX_R      = 0x44,
 491     REX_RB     = 0x45,
 492     REX_RX     = 0x46,
 493     REX_RXB    = 0x47,
 494 
 495     REX_W      = 0x48,
 496 
 497     REX_WB     = 0x49,
 498     REX_WX     = 0x4A,
 499     REX_WXB    = 0x4B,
 500     REX_WR     = 0x4C,
 501     REX_WRB    = 0x4D,
 502     REX_WRX    = 0x4E,
 503     REX_WRXB   = 0x4F,
 504 
 505     VEX_3bytes = 0xC4,
 506     VEX_2bytes = 0xC5,
 507     EVEX_4bytes = 0x62
 508   };
 509 
 510   enum VexPrefix {
 511     VEX_B = 0x20,
 512     VEX_X = 0x40,
 513     VEX_R = 0x80,
 514     VEX_W = 0x80
 515   };
 516 
 517   enum ExexPrefix {
 518     EVEX_F  = 0x04,
 519     EVEX_V  = 0x08,
 520     EVEX_Rb = 0x10,
 521     EVEX_X  = 0x40,
 522     EVEX_Z  = 0x80
 523   };
 524 
 525   enum VexSimdPrefix {
 526     VEX_SIMD_NONE = 0x0,
 527     VEX_SIMD_66   = 0x1,
 528     VEX_SIMD_F3   = 0x2,
 529     VEX_SIMD_F2   = 0x3
 530   };
 531 
 532   enum VexOpcode {
 533     VEX_OPCODE_NONE  = 0x0,
 534     VEX_OPCODE_0F    = 0x1,
 535     VEX_OPCODE_0F_38 = 0x2,
 536     VEX_OPCODE_0F_3A = 0x3
 537   };
 538 
 539   enum AvxVectorLen {
 540     AVX_128bit = 0x0,
 541     AVX_256bit = 0x1,
 542     AVX_512bit = 0x2,
 543     AVX_NoVec  = 0x4
 544   };
 545 
 546   enum EvexTupleType {
 547     EVEX_FV   = 0,
 548     EVEX_HV   = 4,
 549     EVEX_FVM  = 6,
 550     EVEX_T1S  = 7,
 551     EVEX_T1F  = 11,
 552     EVEX_T2   = 13,
 553     EVEX_T4   = 15,
 554     EVEX_T8   = 17,
 555     EVEX_HVM  = 18,
 556     EVEX_QVM  = 19,
 557     EVEX_OVM  = 20,
 558     EVEX_M128 = 21,
 559     EVEX_DUP  = 22,
 560     EVEX_ETUP = 23
 561   };
 562 
 563   enum EvexInputSizeInBits {
 564     EVEX_8bit  = 0,
 565     EVEX_16bit = 1,
 566     EVEX_32bit = 2,
 567     EVEX_64bit = 3
 568   };
 569 
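These enums replace the old bool vector256 flag: public AVX instructions now take an AvxVectorLen value, and the tuple type together with the input size drives the EVEX disp8*N displacement compression handled later in this file. A minimal sketch of width selection with the revised signatures, assuming an Assembler* named masm and that the required CPU features were checked elsewhere:

  // Illustrative only: choosing the vector width with the new vector_len parameter.
  static void add_packed_doubles(Assembler* masm, XMMRegister dst,
                                 XMMRegister a, XMMRegister b, bool use_512) {
    int vector_len = use_512 ? Assembler::AVX_512bit   // EVEX encoding, zmm width
                             : Assembler::AVX_256bit;  // VEX encoding, ymm width
    masm->vaddpd(dst, a, b, vector_len);
  }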
 570   enum WhichOperand {
 571     // input to locate_operand, and format code for relocations
 572     imm_operand  = 0,            // embedded 32-bit|64-bit immediate operand
 573     disp32_operand = 1,          // embedded 32-bit displacement or address
 574     call32_operand = 2,          // embedded 32-bit self-relative displacement
 575 #ifndef _LP64
 576     _WhichOperand_limit = 3
 577 #else
 578      narrow_oop_operand = 3,     // embedded 32-bit immediate narrow oop
 579     _WhichOperand_limit = 4
 580 #endif
 581   };
 582 
 583 
 584 
 585   // NOTE: The general philosophy of the declarations here is that 64bit versions
 586   // of instructions are freely declared without the need for wrapping them in an ifdef.
 587   // (Some dangerous instructions are ifdef'd out of inappropriate jvms.)
 588   // In the .cpp file the implementations are wrapped so that they are dropped out
 589   // of the resulting jvm. This is done mostly to keep the footprint of MINIMAL
 590   // to the size it was prior to merging up the 32bit and 64bit assemblers.
 591   //
 592   // This does mean you'll get a linker/runtime error if you use a 64bit only instruction
 593   // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.
 594 
 595 private:
 596 
 597   int evex_encoding;
 598   int input_size_in_bits;
 599   int avx_vector_len;
 600   int tuple_type;
 601   bool is_evex_instruction;
 602 
 603   // 64bit prefixes
 604   int prefix_and_encode(int reg_enc, bool byteinst = false);
 605   int prefixq_and_encode(int reg_enc);
 606 
 607   int prefix_and_encode(int dst_enc, int src_enc, bool byteinst = false);
 608   int prefixq_and_encode(int dst_enc, int src_enc);
 609 
 610   void prefix(Register reg);
 611   void prefix(Address adr);
 612   void prefixq(Address adr);
 613 
 614   void prefix(Address adr, Register reg,  bool byteinst = false);
 615   void prefix(Address adr, XMMRegister reg);
 616   void prefixq(Address adr, Register reg);
 617   void prefixq(Address adr, XMMRegister reg);
 618 
 619   void prefetch_prefix(Address src);
 620 
 621   void rex_prefix(Address adr, XMMRegister xreg,
 622                   VexSimdPrefix pre, VexOpcode opc, bool rex_w);
 623   int  rex_prefix_and_encode(int dst_enc, int src_enc,
 624                              VexSimdPrefix pre, VexOpcode opc, bool rex_w);
 625 
 626   void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
 627                   int nds_enc, VexSimdPrefix pre, VexOpcode opc,
 628                   int vector_len);
 629 
 630   void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w, bool evex_r, bool evex_v,
 631                    int nds_enc, VexSimdPrefix pre, VexOpcode opc,
 632                    bool is_extended_context, bool is_merge_context,
 633                    int vector_len, bool no_mask_reg );
 634 
 635   void vex_prefix(Address adr, int nds_enc, int xreg_enc,
 636                   VexSimdPrefix pre, VexOpcode opc,
 637                   bool vex_w, int vector_len,
 638                   bool legacy_mode = false, bool no_mask_reg = false);
 639 
 640   void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,
 641                   VexSimdPrefix pre, int vector_len = AVX_128bit,
 642                   bool no_mask_reg = false, bool legacy_mode = false) {
 643     int dst_enc = dst->encoding();
 644     int nds_enc = nds->is_valid() ? nds->encoding() : 0;
 645     vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector_len, legacy_mode, no_mask_reg);
 646   }
 647 
 648   void vex_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
 649                     VexSimdPrefix pre, int vector_len = AVX_128bit,
 650                     bool no_mask_reg = false) {
 651     int dst_enc = dst->encoding();
 652     int nds_enc = nds->is_valid() ? nds->encoding() : 0;
 653     vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, true, vector_len, false, no_mask_reg);
 654   }
 655 
 656   void vex_prefix_0F38(Register dst, Register nds, Address src, bool no_mask_reg = false) {
 657     bool vex_w = false;
 658     int vector_len = AVX_128bit;
 659     vex_prefix(src, nds->encoding(), dst->encoding(),
 660                VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
 661                vector_len, no_mask_reg);
 662   }
 663 
 664   void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) {
 665     bool vex_w = true;
 666     int vector_len = AVX_128bit;
 667     vex_prefix(src, nds->encoding(), dst->encoding(),
 668                VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
 669                vector_len, no_mask_reg);
 670   }
 671   int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
 672                              VexSimdPrefix pre, VexOpcode opc,
 673                              bool vex_w, int vector_len,
 674                              bool legacy_mode, bool no_mask_reg);
 675 
 676   int  vex_prefix_0F38_and_encode(Register dst, Register nds, Register src, bool no_mask_reg = false) {
 677     bool vex_w = false;
 678     int vector_len = AVX_128bit;
 679     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
 680                                  VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
 681                                  false, no_mask_reg);
 682   }
 683   int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) {
 684     bool vex_w = true;
 685     int vector_len = AVX_128bit;
 686     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
 687                                  VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
 688                                  false, no_mask_reg);
 689   }
 690   int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
 691                              VexSimdPrefix pre, int vector_len = AVX_128bit,
 692                              VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false,
 693                              bool no_mask_reg = false) {
 694     int src_enc = src->encoding();
 695     int dst_enc = dst->encoding();
 696     int nds_enc = nds->is_valid() ? nds->encoding() : 0;
 697     return vex_prefix_and_encode(dst_enc, nds_enc, src_enc, pre, opc, false, vector_len, legacy_mode, no_mask_reg);
 698   }
 699 
 700   void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
 701                    VexSimdPrefix pre, bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F,
 702                    bool rex_w = false, int vector_len = AVX_128bit, bool legacy_mode = false);
 703 
 704   void simd_prefix(XMMRegister dst, Address src, VexSimdPrefix pre,
 705                    bool no_mask_reg, VexOpcode opc = VEX_OPCODE_0F) {
 706     simd_prefix(dst, xnoreg, src, pre, no_mask_reg, opc);
 707   }
 708 
 709   void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg) {
 710     simd_prefix(src, dst, pre, no_mask_reg);
 711   }
 712   void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,
 713                      VexSimdPrefix pre, bool no_mask_reg = false) {
 714     bool rex_w = true;
 715     simd_prefix(dst, nds, src, pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
 716   }
 717 
 718   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
 719                              VexSimdPrefix pre, bool no_mask_reg,
 720                              VexOpcode opc = VEX_OPCODE_0F,
 721                              bool rex_w = false, int vector_len = AVX_128bit,
 722                              bool legacy_mode = false);
 723 
 724   int kreg_prefix_and_encode(KRegister dst, KRegister nds, KRegister src,
 725                              VexSimdPrefix pre, bool no_mask_reg,
 726                              VexOpcode opc = VEX_OPCODE_0F,
 727                              bool rex_w = false, int vector_len = AVX_128bit);
 728 
 729   int kreg_prefix_and_encode(KRegister dst, KRegister nds, Register src,
 730                              VexSimdPrefix pre, bool no_mask_reg,
 731                              VexOpcode opc = VEX_OPCODE_0F,
 732                              bool rex_w = false, int vector_len = AVX_128bit);
 733 
 734   // Move/convert 32-bit integer value.
 735   int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
 736                              VexSimdPrefix pre, bool no_mask_reg) {
 737     // It is OK to cast from Register to XMMRegister to pass argument here
 738     // since only encoding is used in simd_prefix_and_encode() and number of
 739     // Gen and Xmm registers are the same.
 740     return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F);
 741   }
 742   int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
 743     return simd_prefix_and_encode(dst, xnoreg, src, pre, no_mask_reg);
 744   }
 745   int simd_prefix_and_encode(Register dst, XMMRegister src,
 746                              VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
 747                              bool no_mask_reg = false) {
 748     return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc);
 749   }
 750 
 751   // Move/convert 64-bit integer value.
 752   int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,
 753                                VexSimdPrefix pre, bool no_mask_reg = false) {
 754     bool rex_w = true;
 755     return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, no_mask_reg, VEX_OPCODE_0F, rex_w);
 756   }
 757   int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre, bool no_mask_reg) {
 758     return simd_prefix_and_encode_q(dst, xnoreg, src, pre, no_mask_reg);
 759   }
 760   int simd_prefix_and_encode_q(Register dst, XMMRegister src,
 761                                VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
 762                                bool no_mask_reg = false) {
 763     bool rex_w = true;
 764     return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, no_mask_reg, opc, rex_w);
 765   }
 766 
 767   // Helper functions for groups of instructions
 768   void emit_arith_b(int op1, int op2, Register dst, int imm8);
 769 
 770   void emit_arith(int op1, int op2, Register dst, int32_t imm32);
 771   // Force generation of a 4 byte immediate value even if it fits into 8bit
 772   void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
 773   void emit_arith(int op1, int op2, Register dst, Register src);
 774 
 775   void emit_simd_arith(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
 776   void emit_simd_arith_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
 777   void emit_simd_arith(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
 778   void emit_simd_arith_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
 779   void emit_simd_arith_nonds(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
 780   void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, Address src, VexSimdPrefix pre, bool no_mask_reg = false);
 781   void emit_simd_arith_nonds(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false, bool legacy_mode = false);
 782   void emit_simd_arith_nonds_q(int opcode, XMMRegister dst, XMMRegister src, VexSimdPrefix pre, bool no_mask_reg = false);
 783   void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
 784                       Address src, VexSimdPrefix pre, int vector_len,
 785                       bool no_mask_reg = false, bool legacy_mode = false);
 786   void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
 787                         Address src, VexSimdPrefix pre, int vector_len,
 788                         bool no_mask_reg = false);
 789   void emit_vex_arith(int opcode, XMMRegister dst, XMMRegister nds,
 790                       XMMRegister src, VexSimdPrefix pre, int vector_len,
 791                       bool no_mask_reg = false, bool legacy_mode = false);
 792   void emit_vex_arith_q(int opcode, XMMRegister dst, XMMRegister nds,
 793                         XMMRegister src, VexSimdPrefix pre, int vector_len,
 794                         bool no_mask_reg = false);
 795 
 796   bool emit_compressed_disp_byte(int &disp);
 797 
 798   void emit_operand(Register reg,
 799                     Register base, Register index, Address::ScaleFactor scale,
 800                     int disp,
 801                     RelocationHolder const& rspec,
 802                     int rip_relative_correction = 0);
 803 
 804   void emit_operand(Register reg, Address adr, int rip_relative_correction = 0);
 805 
 806   // operands that only take the original 32bit registers
 807   void emit_operand32(Register reg, Address adr);
 808 
 809   void emit_operand(XMMRegister reg,
 810                     Register base, Register index, Address::ScaleFactor scale,
 811                     int disp,
 812                     RelocationHolder const& rspec);
 813 
 814   void emit_operand(XMMRegister reg, Address adr);
 815 
 816   void emit_operand(MMXRegister reg, Address adr);


 902   // Move Scalar Double-Precision Floating-Point Values
 903   void movsd(XMMRegister dst, Address src);
 904   void movsd(XMMRegister dst, XMMRegister src);
 905   void movsd(Address dst, XMMRegister src);
 906   void movlpd(XMMRegister dst, Address src);
 907 
 908   // New cpus require use of movaps and movapd to avoid partial register stall
 909   // when moving between registers.
 910   void movaps(XMMRegister dst, XMMRegister src);
 911   void movapd(XMMRegister dst, XMMRegister src);
 912 
 913   // End avoid using directly
 914 
 915 
 916   // Instruction prefixes
 917   void prefix(Prefix p);
 918 
 919   public:
 920 
 921   // Creation
 922   Assembler(CodeBuffer* code) : AbstractAssembler(code) {
 923     init_attributes();
 924   }
 925 
 926   // Decoding
 927   static address locate_operand(address inst, WhichOperand which);
 928   static address locate_next_instruction(address inst);
 929 
 930   // Utilities
 931   static bool is_polling_page_far() NOT_LP64({ return false;});
 932   static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
 933                                          int cur_tuple_type, int in_size_in_bits, int cur_encoding);
 934 
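query_compressed_disp_byte() and emit_compressed_disp_byte() implement the EVEX disp8*N rule: when an EVEX prefix is in use, an 8-bit displacement is implicitly scaled by a factor N derived from the tuple type, the vector length and the element size, so displacements that would otherwise need disp32 can still be emitted as a single byte. A rough sketch of the rule for the full-vector (EVEX_FV, no broadcast) case only; this is not the HotSpot implementation:

  // Sketch of EVEX disp8*N compression for the EVEX_FV tuple (no broadcast).
  static bool try_compress_disp(int disp, int vector_len, int* disp8_out) {
    int N = 16 << vector_len;            // AVX_128bit->16, AVX_256bit->32, AVX_512bit->64
    if (disp % N != 0) return false;     // not a multiple of N: fall back to disp32
    int scaled = disp / N;
    if (scaled < -128 || scaled > 127) return false;
    *disp8_out = scaled;                 // emitted as one disp8 byte, decoded as disp8*N
    return true;
  }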
 935   // Generic instructions
 936   // Does 32bit or 64bit as needed for the platform. In some sense these
 937   // belong in macro assembler but there is no need for both varieties to exist
 938 
 939   void init_attributes(void) {
 940     evex_encoding = 0;
 941     input_size_in_bits = 0;
 942     avx_vector_len = AVX_NoVec;
 943     tuple_type = EVEX_ETUP;
 944     is_evex_instruction = false;
 945   }
 946 
 947   void lea(Register dst, Address src);
 948 
 949   void mov(Register dst, Register src);
 950 
 951   void pusha();
 952   void popa();
 953 
 954   void pushf();
 955   void popf();
 956 
 957   void push(int32_t imm32);
 958 
 959   void push(Register src);
 960 
 961   void pop(Register dst);
 962 
 963   // These are dummies to prevent surprise implicit conversions to Register
 964   void push(void* v);
 965   void pop(void* v);
 966 


1427         if (offset < -128) {
1428           offset = -128;
1429         }
1430 
1431         lock();
1432         addl(Address(rsp, offset), 0);// Assert the lock# signal here
1433       }
1434     }
1435   }
1436 
1437   void mfence();
1438 
1439   // Moves
1440 
1441   void mov64(Register dst, int64_t imm64);
1442 
1443   void movb(Address dst, Register src);
1444   void movb(Address dst, int imm8);
1445   void movb(Register dst, Address src);
1446 
1447   void kmovq(KRegister dst, KRegister src);
1448   void kmovql(KRegister dst, Register src);
1449   void kmovdl(KRegister dst, Register src);
1450   void kmovq(Address dst, KRegister src);
1451   void kmovq(KRegister dst, Address src);
1452 
1453   void movdl(XMMRegister dst, Register src);
1454   void movdl(Register dst, XMMRegister src);
1455   void movdl(XMMRegister dst, Address src);
1456   void movdl(Address dst, XMMRegister src);
1457 
1458   // Move Double Quadword
1459   void movdq(XMMRegister dst, Register src);
1460   void movdq(Register dst, XMMRegister src);
1461 
1462   // Move Aligned Double Quadword
1463   void movdqa(XMMRegister dst, XMMRegister src);
1464   void movdqa(XMMRegister dst, Address src);
1465 
1466   // Move Unaligned Double Quadword
1467   void movdqu(Address     dst, XMMRegister src);
1468   void movdqu(XMMRegister dst, Address src);
1469   void movdqu(XMMRegister dst, XMMRegister src);
1470 
1471   // Move Unaligned 256bit Vector
1472   void vmovdqu(Address dst, XMMRegister src);
1473   void vmovdqu(XMMRegister dst, Address src);
1474   void vmovdqu(XMMRegister dst, XMMRegister src);
1475 
1476   // Move Unaligned 512bit Vector
1477   void evmovdqu(Address dst, XMMRegister src, int vector_len);
1478   void evmovdqu(XMMRegister dst, Address src, int vector_len);
1479   void evmovdqu(XMMRegister dst, XMMRegister src, int vector_len);
1480 
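With the new evmovdqu entry points the vector width is again selected through vector_len, so a 512-bit copy through a zmm register looks like the sketch below, assuming AVX-512 support was verified beforehand and masm is an Assembler*:

  // Illustrative 64-byte copy through one zmm register.
  static void copy_64_bytes(Assembler* masm, Register src, Register dst, XMMRegister tmp) {
    masm->evmovdqu(tmp, Address(src, 0), Assembler::AVX_512bit);  // 512-bit load
    masm->evmovdqu(Address(dst, 0), tmp, Assembler::AVX_512bit);  // 512-bit store
  }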
1481   // Move lower 64bit to high 64bit in 128bit register
1482   void movlhps(XMMRegister dst, XMMRegister src);
1483 
1484   void movl(Register dst, int32_t imm32);
1485   void movl(Address dst, int32_t imm32);
1486   void movl(Register dst, Register src);
1487   void movl(Register dst, Address src);
1488   void movl(Address dst, Register src);
1489 
1490   // These dummies prevent using movl from converting a zero (like NULL) into Register
1491   // by giving the compiler two choices it can't resolve
1492 
1493   void movl(Address  dst, void* junk);
1494   void movl(Register dst, void* junk);
1495 
1496 #ifdef _LP64
1497   void movq(Register dst, Register src);
1498   void movq(Register dst, Address src);
1499   void movq(Address  dst, Register src);
1500 #endif


1586 
1587   void notl(Register dst);
1588 
1589 #ifdef _LP64
1590   void notq(Register dst);
1591 #endif
1592 
1593   void orl(Address dst, int32_t imm32);
1594   void orl(Register dst, int32_t imm32);
1595   void orl(Register dst, Address src);
1596   void orl(Register dst, Register src);
1597 
1598   void orq(Address dst, int32_t imm32);
1599   void orq(Register dst, int32_t imm32);
1600   void orq(Register dst, Address src);
1601   void orq(Register dst, Register src);
1602 
1603   // Pack with unsigned saturation
1604   void packuswb(XMMRegister dst, XMMRegister src);
1605   void packuswb(XMMRegister dst, Address src);
1606   void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1607 
1608   // Permutation of 64bit words
1609   void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1610 
1611   void pause();
1612 
1613   // SSE4.2 string instructions
1614   void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
1615   void pcmpestri(XMMRegister xmm1, Address src, int imm8);
1616 
1617   // SSE 4.1 extract
1618   void pextrd(Register dst, XMMRegister src, int imm8);
1619   void pextrq(Register dst, XMMRegister src, int imm8);
1620 
1621   // SSE 4.1 insert
1622   void pinsrd(XMMRegister dst, Register src, int imm8);
1623   void pinsrq(XMMRegister dst, Register src, int imm8);
1624 
1625   // SSE4.1 packed move
1626   void pmovzxbw(XMMRegister dst, XMMRegister src);
1627   void pmovzxbw(XMMRegister dst, Address src);
1628 
1629 #ifndef _LP64 // no 32bit push/pop on amd64


1834   void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1835   void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
1836   void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1837   void vdivss(XMMRegister dst, XMMRegister nds, Address src);
1838   void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1839   void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
1840   void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1841   void vmulss(XMMRegister dst, XMMRegister nds, Address src);
1842   void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1843   void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
1844   void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
1845   void vsubss(XMMRegister dst, XMMRegister nds, Address src);
1846   void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
1847 
1848 
1849   //====================VECTOR ARITHMETIC=====================================
1850 
1851   // Add Packed Floating-Point Values
1852   void addpd(XMMRegister dst, XMMRegister src);
1853   void addps(XMMRegister dst, XMMRegister src);
1854   void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1855   void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1856   void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1857   void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1858 
1859   // Subtract Packed Floating-Point Values
1860   void subpd(XMMRegister dst, XMMRegister src);
1861   void subps(XMMRegister dst, XMMRegister src);
1862   void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1863   void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1864   void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1865   void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1866 
1867   // Multiply Packed Floating-Point Values
1868   void mulpd(XMMRegister dst, XMMRegister src);
1869   void mulps(XMMRegister dst, XMMRegister src);
1870   void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1871   void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1872   void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1873   void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1874 
1875   // Divide Packed Floating-Point Values
1876   void divpd(XMMRegister dst, XMMRegister src);
1877   void divps(XMMRegister dst, XMMRegister src);
1878   void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1879   void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1880   void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1881   void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1882 
1883   // Bitwise Logical AND of Packed Floating-Point Values
1884   void andpd(XMMRegister dst, XMMRegister src);
1885   void andps(XMMRegister dst, XMMRegister src);
1886   void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1887   void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1888   void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1889   void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1890 
1891   // Bitwise Logical XOR of Packed Floating-Point Values
1892   void xorpd(XMMRegister dst, XMMRegister src);
1893   void xorps(XMMRegister dst, XMMRegister src);
1894   void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1895   void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1896   void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1897   void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1898 
1899   // Add horizontal packed integers
1900   void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1901   void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1902   void phaddw(XMMRegister dst, XMMRegister src);
1903   void phaddd(XMMRegister dst, XMMRegister src);
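       // Usage sketch (illustrative only): phaddd sums adjacent 32-bit lanes, so applying it
       // twice reduces the four ints [a,b,c,d] in xmm0 to a single sum in the low lane:
       //   __ phaddd(xmm0, xmm0);  // [a+b, c+d, a+b, c+d]
       //   __ phaddd(xmm0, xmm0);  // low lane now holds a+b+c+d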
1904 
1905   // Add packed integers
1906   void paddb(XMMRegister dst, XMMRegister src);
1907   void paddw(XMMRegister dst, XMMRegister src);
1908   void paddd(XMMRegister dst, XMMRegister src);
1909   void paddq(XMMRegister dst, XMMRegister src);
1910   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1911   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1912   void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1913   void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1914   void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1915   void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1916   void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1917   void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
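       // Usage sketch (illustrative only): the Address overloads fold the second source
       // operand from memory; the base register and offset here are hypothetical:
       //   __ vpaddd(xmm0, xmm1, Address(rsi, 0), Assembler::AVX_256bit);  // ymm0 = ymm1 + 256-bit load from [rsi]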
1918 
1919   // Sub packed integers
1920   void psubb(XMMRegister dst, XMMRegister src);
1921   void psubw(XMMRegister dst, XMMRegister src);
1922   void psubd(XMMRegister dst, XMMRegister src);
1923   void psubq(XMMRegister dst, XMMRegister src);
1924   void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1925   void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1926   void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1927   void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1928   void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1929   void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1930   void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1931   void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1932 
1933   // Multiply packed integers (shorts and ints; vpmullq multiplies packed longs and requires AVX-512DQ)
1934   void pmullw(XMMRegister dst, XMMRegister src);
1935   void pmulld(XMMRegister dst, XMMRegister src);
1936   void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1937   void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1938   void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1939   void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1940   void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1941   void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1942 
1943   // Shift left packed integers
1944   void psllw(XMMRegister dst, int shift);
1945   void pslld(XMMRegister dst, int shift);
1946   void psllq(XMMRegister dst, int shift);
1947   void psllw(XMMRegister dst, XMMRegister shift);
1948   void pslld(XMMRegister dst, XMMRegister shift);
1949   void psllq(XMMRegister dst, XMMRegister shift);
1950   void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1951   void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1952   void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1953   void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
1954   void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
1955   void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
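       // Usage sketch (illustrative only): the shift count is either an immediate or the
       // low 64 bits of an XMM register:
       //   __ pslld(xmm0, 2);                                    // each 32-bit lane of xmm0 <<= 2
       //   __ vpslld(xmm0, xmm1, xmm2, Assembler::AVX_512bit);   // lanes of zmm1 shifted left by the count in xmm2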
1956 
1957   // Logical shift right packed integers
1958   void psrlw(XMMRegister dst, int shift);
1959   void psrld(XMMRegister dst, int shift);
1960   void psrlq(XMMRegister dst, int shift);
1961   void psrlw(XMMRegister dst, XMMRegister shift);
1962   void psrld(XMMRegister dst, XMMRegister shift);
1963   void psrlq(XMMRegister dst, XMMRegister shift);
1964   void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1965   void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1966   void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1967   void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
1968   void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
1969   void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
1970 
1971   // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
1972   void psraw(XMMRegister dst, int shift);
1973   void psrad(XMMRegister dst, int shift);
1974   void psraw(XMMRegister dst, XMMRegister shift);
1975   void psrad(XMMRegister dst, XMMRegister shift);
1976   void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1977   void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
1978   void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
1979   void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
1980 
1981   // And packed integers
1982   void pand(XMMRegister dst, XMMRegister src);
1983   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1984   void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1985 
1986   // Or packed integers
1987   void por(XMMRegister dst, XMMRegister src);
1988   void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1989   void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1990 
1991   // Xor packed integers
1992   void pxor(XMMRegister dst, XMMRegister src);
1993   void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1994   void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
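       // Usage sketch (illustrative only): xor-ing a register with itself is the standard
       // way to zero it:
       //   __ pxor(xmm0, xmm0);                                  // xmm0 = 0
       //   __ vpxor(xmm0, xmm0, xmm0, Assembler::AVX_512bit);    // zmm0 = 0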
1995 
1996   // Copy low 128bit into high 128bit of YMM registers.
1997   void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
1998   void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
1999   void vextractf128h(XMMRegister dst, XMMRegister src);
2000   void vextracti128h(XMMRegister dst, XMMRegister src);
2001 
2002   // Load/store high 128bit of YMM registers without destroying the other half.
2003   void vinsertf128h(XMMRegister dst, Address src);
2004   void vinserti128h(XMMRegister dst, Address src);
2005   void vextractf128h(Address dst, XMMRegister src);
2006   void vextracti128h(Address dst, XMMRegister src);
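       // Usage sketch (illustrative only): the Address forms let the high half of a YMM
       // register be spilled and reloaded independently; the stack slot is hypothetical:
       //   __ vextractf128h(Address(rsp, 0), xmm0);  // store the high 128 bits of ymm0
       //   __ vinsertf128h(xmm0, Address(rsp, 0));   // reload them, leaving the low half intact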
2007 
2008   // Copy low 256bit into high 256bit of ZMM registers.
2009   void vinserti64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2010   void vinsertf64x4h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2011   void vextracti64x4h(XMMRegister dst, XMMRegister src);
2012   void vextractf64x4h(XMMRegister dst, XMMRegister src);
2013   void vextractf64x4h(Address dst, XMMRegister src);
2014   void vinsertf64x4h(XMMRegister dst, Address src);
2015 
2016   // Copy targeted 128bit segments of the ZMM registers
2017   void vextracti64x2h(XMMRegister dst, XMMRegister src, int value);
2018   void vextractf64x2h(XMMRegister dst, XMMRegister src, int value);
2019   void vextractf32x4h(XMMRegister dst, XMMRegister src, int value);
2020 
2021   // Duplicate 4-byte integer data from src into 8 locations in dst
2022   void vpbroadcastd(XMMRegister dst, XMMRegister src);
2023 
2024   // Duplicate 4-byte integer data from src into every lane of a destination sized by vector_len
2025   void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
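       // Usage sketch (illustrative only): a broadcast replicates a scalar loop constant
       // across every lane. Assuming the scalar is already in rax and using the movdl form
       // declared elsewhere in this file:
       //   __ movdl(xmm1, rax);                                  // low 32 bits of xmm1 = eax
       //   __ evpbroadcastd(xmm1, xmm1, Assembler::AVX_512bit);  // replicate into all lanes of zmm1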
2026 
2027   // Carry-Less Multiplication Quadword
2028   void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
2029   void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask);
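       // Usage sketch (illustrative only): the mask immediate selects which 64-bit half of
       // each operand is multiplied (bit 0 for the first operand, bit 4 for the second);
       // HotSpot's CRC32 stubs use this for polynomial folding:
       //   __ pclmulqdq(xmm0, xmm1, 0x00);  // low qword of xmm0 * low qword of xmm1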
2030 
2031   // AVX instruction used to clear the upper 128 bits of YMM registers and to
2032   // avoid the transition penalty between AVX and SSE states. There is no
2033   // penalty if legacy SSE instructions are encoded using a VEX prefix because
2034   // they always clear the upper 128 bits. It should be used before calling
2035   // runtime code and native libraries.
2036   void vzeroupper();
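       // Usage sketch (illustrative only): a stub that has touched wide vector state would
       // typically emit this just before transferring to native code; 'entry' is a
       // hypothetical target address:
       //   __ vzeroupper();
       //   __ call(RuntimeAddress(entry));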
2037 
2038  protected:
2039   // The following instructions require 16-byte address alignment in SSE mode.
2040   // They should be called only from corresponding MacroAssembler instructions.
2041   void andpd(XMMRegister dst, Address src);
2042   void andps(XMMRegister dst, Address src);
2043   void xorpd(XMMRegister dst, Address src);
2044   void xorps(XMMRegister dst, Address src);
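       // Usage sketch (illustrative only): code generators reach these through the
       // MacroAssembler wrappers, which take an AddressLiteral to a suitably aligned mask;
       // 'double_abs_mask' is a hypothetical 16-byte-aligned constant with the sign bit
       // cleared in each lane:
       //   __ andpd(xmm0, ExternalAddress(double_abs_mask));  // e.g. fabs(xmm0) via the wrapper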
2045 
2046 };