1636 void orq(Register dst, Address src);   // Address-source overload; NOTE(review): presumably 64-bit OR into dst - confirm
1637 void orq(Register dst, Register src);  // Register-source overload
1638
1639 // Pack with unsigned saturation (register- and Address-source forms, plus a v-prefixed form taking nds and vector_len)
1640 void packuswb(XMMRegister dst, XMMRegister src);
1641 void packuswb(XMMRegister dst, Address src);
1642 void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1643
1644 // Permutation of 64-bit words
1645 void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1646
1647 void pause();  // no operands; NOTE(review): presumably emits the x86 PAUSE instruction - confirm
1648
1649 // SSE4.2 string instructions: pcmpestri with a register or Address second operand
1650 void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);  // NOTE(review): imm8 is presumably the comparison control byte - confirm
1651 void pcmpestri(XMMRegister xmm1, Address src, int imm8);
1652
1653 // SSE 4.1 extract: XMMRegister source, Register destination (NOTE(review): imm8 presumably selects the element - confirm)
1654 void pextrd(Register dst, XMMRegister src, int imm8);
1655 void pextrq(Register dst, XMMRegister src, int imm8);
1656
1657 // SSE 4.1 insert: Register source, XMMRegister destination (NOTE(review): imm8 presumably selects the element - confirm)
1658 void pinsrd(XMMRegister dst, Register src, int imm8);
1659 void pinsrq(XMMRegister dst, Register src, int imm8);
1660
1661 // SSE4.1 packed move (register- and Address-source forms)
1662 void pmovzxbw(XMMRegister dst, XMMRegister src);
1663 void pmovzxbw(XMMRegister dst, Address src);
1664
1665 #ifndef _LP64 // no 32-bit push/pop on amd64, so popl is declared only when _LP64 is not defined
1666 void popl(Address dst);
1667 #endif // !_LP64
1668
1669 #ifdef _LP64 // popq is declared only when _LP64 is defined
1670 void popq(Address dst);
1671 #endif // _LP64
1672
1673 void popcntl(Register dst, Address src);   // declared unconditionally; Address-source overload
1674 void popcntl(Register dst, Register src);  // Register-source overload
1675
1676 #ifdef _LP64 // popcntq is declared only when _LP64 is defined
1677 void popcntq(Register dst, Address src);
1678 void popcntq(Register dst, Register src);
1679 #endif // _LP64
1889 //====================VECTOR ARITHMETIC=====================================
1890
1891 // Add Packed Floating-Point Values: two-operand pd/ps forms, and v-prefixed forms taking nds, a register or Address src, and vector_len
1892 void addpd(XMMRegister dst, XMMRegister src);
1893 void addps(XMMRegister dst, XMMRegister src);
1894 void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1895 void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1896 void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1897 void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1898
1899 // Subtract Packed Floating-Point Values: same set of overloads as the add group above
1900 void subpd(XMMRegister dst, XMMRegister src);
1901 void subps(XMMRegister dst, XMMRegister src);
1902 void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1903 void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1904 void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1905 void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1906
1907 // Multiply Packed Floating-Point Values: two-operand pd/ps forms, and v-prefixed forms taking nds, a register or Address src, and vector_len
1908 void mulpd(XMMRegister dst, XMMRegister src);
1909 void mulps(XMMRegister dst, XMMRegister src);
1910 void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1911 void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1912 void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1913 void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1914
1915 // Divide Packed Floating-Point Values: same set of overloads as the multiply group above
1916 void divpd(XMMRegister dst, XMMRegister src);
1917 void divps(XMMRegister dst, XMMRegister src);
1918 void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1919 void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1920 void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1921 void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1922
1923 // Bitwise Logical AND of Packed Floating-Point Values: two-operand pd/ps forms, and v-prefixed forms taking nds, a register or Address src, and vector_len
1924 void andpd(XMMRegister dst, XMMRegister src);
1925 void andps(XMMRegister dst, XMMRegister src);
1926 void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1927 void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1928 void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1929 void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1930
1931 // Bitwise Logical XOR of Packed Floating-Point Values: same set of overloads as the AND group above
1932 void xorpd(XMMRegister dst, XMMRegister src);
1933 void xorps(XMMRegister dst, XMMRegister src);
1934 void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1935 void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1936 void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1937 void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1938
1939 // Add horizontal packed integers (w/d variants; the v-prefixed forms additionally take nds and vector_len)
1940 void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1941 void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1942 void phaddw(XMMRegister dst, XMMRegister src);
1943 void phaddd(XMMRegister dst, XMMRegister src);
1944
1945 // Add packed integers (b/w/d/q two-operand variants, followed by v-prefixed forms taking nds and vector_len)
1946 void paddb(XMMRegister dst, XMMRegister src);
1947 void paddw(XMMRegister dst, XMMRegister src);
1948 void paddd(XMMRegister dst, XMMRegister src);
1949 void paddq(XMMRegister dst, XMMRegister src);
1950 void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2003 void psrlq(XMMRegister dst, XMMRegister shift);  // shift supplied in an XMMRegister
2004 void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);  // v-prefixed forms with shift supplied as an int
2005 void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2006 void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2007 void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);  // v-prefixed forms with shift supplied in an XMMRegister
2008 void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2009 void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2010
2011 // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs); shift is supplied either as an int or in an XMMRegister
2012 void psraw(XMMRegister dst, int shift);
2013 void psrad(XMMRegister dst, int shift);
2014 void psraw(XMMRegister dst, XMMRegister shift);
2015 void psrad(XMMRegister dst, XMMRegister shift);
2016 void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2017 void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2018 void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2019 void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2020
2021 // And packed integers (two-operand form, plus v-prefixed forms taking nds, a register or Address src, and vector_len)
2022 void pand(XMMRegister dst, XMMRegister src);
2023 void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2024 void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2025
2026 // Or packed integers (same set of overloads as the And group above)
2027 void por(XMMRegister dst, XMMRegister src);
2028 void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2029 void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2030
2031 // Xor packed integers (same set of overloads as the And group above)
2032 void pxor(XMMRegister dst, XMMRegister src);
2033 void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2034 void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2035
2036 // Copy low 128-bit into high 128-bit of YMM registers. NOTE(review): this wording fits the vinsert*128h forms; the vextract*128h forms presumably move the high half the other way - confirm.
2037 void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2038 void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2039 void vextractf128h(XMMRegister dst, XMMRegister src);
2040 void vextracti128h(XMMRegister dst, XMMRegister src);
2041
2042 // Load/store high 128bit of YMM registers which does not destroy other half.
|
1636 void orq(Register dst, Address src);   // Address-source overload; NOTE(review): presumably 64-bit OR into dst - confirm
1637 void orq(Register dst, Register src);  // Register-source overload
1638
1639 // Pack with unsigned saturation (register- and Address-source forms, plus a v-prefixed form taking nds and vector_len)
1640 void packuswb(XMMRegister dst, XMMRegister src);
1641 void packuswb(XMMRegister dst, Address src);
1642 void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1643
1644 // Permutation of 64-bit words
1645 void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
1646
1647 void pause();  // no operands; NOTE(review): presumably emits the x86 PAUSE instruction - confirm
1648
1649 // SSE4.2 string instructions: pcmpestri with a register or Address second operand
1650 void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);  // NOTE(review): imm8 is presumably the comparison control byte - confirm
1651 void pcmpestri(XMMRegister xmm1, Address src, int imm8);
1652
1653 // SSE 4.1 extract: XMMRegister source, Register destination (NOTE(review): imm8 presumably selects the element - confirm)
1654 void pextrd(Register dst, XMMRegister src, int imm8);
1655 void pextrq(Register dst, XMMRegister src, int imm8);
1656 // SSE 2 extract (same operand shape as the SSE 4.1 forms above)
1657 void pextrw(Register dst, XMMRegister src, int imm8);
1658
1659 // SSE 4.1 insert: Register source, XMMRegister destination (NOTE(review): imm8 presumably selects the element - confirm)
1660 void pinsrd(XMMRegister dst, Register src, int imm8);
1661 void pinsrq(XMMRegister dst, Register src, int imm8);
1662 // SSE 2 insert (same operand shape as the SSE 4.1 forms above)
1663 void pinsrw(XMMRegister dst, Register src, int imm8);
1664
1665 // SSE4.1 packed move (register- and Address-source forms)
1666 void pmovzxbw(XMMRegister dst, XMMRegister src);
1667 void pmovzxbw(XMMRegister dst, Address src);
1668
1669 #ifndef _LP64 // no 32-bit push/pop on amd64, so popl is declared only when _LP64 is not defined
1670 void popl(Address dst);
1671 #endif // !_LP64
1672
1673 #ifdef _LP64 // popq is declared only when _LP64 is defined
1674 void popq(Address dst);
1675 #endif // _LP64
1676
1677 void popcntl(Register dst, Address src);   // declared unconditionally; Address-source overload
1678 void popcntl(Register dst, Register src);  // Register-source overload
1679
1680 #ifdef _LP64 // popcntq is declared only when _LP64 is defined
1681 void popcntq(Register dst, Address src);
1682 void popcntq(Register dst, Register src);
1683 #endif // _LP64
1893 //====================VECTOR ARITHMETIC=====================================
1894
1895 // Add Packed Floating-Point Values: two-operand pd/ps forms, and v-prefixed forms taking nds, a register or Address src, and vector_len
1896 void addpd(XMMRegister dst, XMMRegister src);
1897 void addps(XMMRegister dst, XMMRegister src);
1898 void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1899 void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1900 void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1901 void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1902
1903 // Subtract Packed Floating-Point Values: same set of overloads as the add group above
1904 void subpd(XMMRegister dst, XMMRegister src);
1905 void subps(XMMRegister dst, XMMRegister src);
1906 void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1907 void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1908 void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1909 void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1910
1911 // Multiply Packed Floating-Point Values: two-operand pd/ps forms (mulpd also has an Address-source overload), and v-prefixed forms taking nds, a register or Address src, and vector_len
1912 void mulpd(XMMRegister dst, XMMRegister src);
1913 void mulpd(XMMRegister dst, Address src);
1914 void mulps(XMMRegister dst, XMMRegister src);
1915 void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1916 void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1917 void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1918 void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1919
1920 // Divide Packed Floating-Point Values: two-operand pd/ps forms, and v-prefixed forms taking nds, a register or Address src, and vector_len
1921 void divpd(XMMRegister dst, XMMRegister src);
1922 void divps(XMMRegister dst, XMMRegister src);
1923 void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1924 void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1925 void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1926 void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1927
1928 // Bitwise Logical AND of Packed Floating-Point Values: two-operand pd/ps forms, and v-prefixed forms taking nds, a register or Address src, and vector_len
1929 void andpd(XMMRegister dst, XMMRegister src);
1930 void andps(XMMRegister dst, XMMRegister src);
1931 void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1932 void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1933 void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1934 void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1935
1936 void unpckhpd(XMMRegister dst, XMMRegister src);  // NOTE(review): undocumented here; presumably unpack/interleave high packed doubles - confirm
1937 void unpcklpd(XMMRegister dst, XMMRegister src);  // NOTE(review): presumably the low-half counterpart - confirm
1938
1939 // Bitwise Logical XOR of Packed Floating-Point Values: two-operand pd/ps forms, and v-prefixed forms taking nds, a register or Address src, and vector_len
1940 void xorpd(XMMRegister dst, XMMRegister src);
1941 void xorps(XMMRegister dst, XMMRegister src);
1942 void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1943 void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1944 void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1945 void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1946
1947 // Add horizontal packed integers (w/d variants; the v-prefixed forms additionally take nds and vector_len)
1948 void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1949 void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
1950 void phaddw(XMMRegister dst, XMMRegister src);
1951 void phaddd(XMMRegister dst, XMMRegister src);
1952
1953 // Add packed integers (b/w/d/q two-operand variants, followed by v-prefixed forms taking nds and vector_len)
1954 void paddb(XMMRegister dst, XMMRegister src);
1955 void paddw(XMMRegister dst, XMMRegister src);
1956 void paddd(XMMRegister dst, XMMRegister src);
1957 void paddq(XMMRegister dst, XMMRegister src);
1958 void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2011 void psrlq(XMMRegister dst, XMMRegister shift);  // shift supplied in an XMMRegister
2012 void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len);  // v-prefixed forms with shift supplied as an int
2013 void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2014 void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2015 void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);  // v-prefixed forms with shift supplied in an XMMRegister
2016 void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2017 void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2018
2019 // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs); shift is supplied either as an int or in an XMMRegister
2020 void psraw(XMMRegister dst, int shift);
2021 void psrad(XMMRegister dst, int shift);
2022 void psraw(XMMRegister dst, XMMRegister shift);
2023 void psrad(XMMRegister dst, XMMRegister shift);
2024 void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2025 void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
2026 void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2027 void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
2028
2029 // And packed integers (two-operand forms, plus v-prefixed forms taking nds, a register or Address src, and vector_len)
2030 void pand(XMMRegister dst, XMMRegister src);
2031 void pandn(XMMRegister dst, XMMRegister src);  // NOTE(review): presumably the AND-NOT variant - confirm
2032 void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2033 void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2034
2035 // Or packed integers (two-operand form, plus v-prefixed forms taking nds, a register or Address src, and vector_len)
2036 void por(XMMRegister dst, XMMRegister src);
2037 void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2038 void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2039
2040 // Xor packed integers (two-operand form, plus v-prefixed forms taking nds, a register or Address src, and vector_len)
2041 void pxor(XMMRegister dst, XMMRegister src);
2042 void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
2043 void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
2044
2045 // Copy low 128-bit into high 128-bit of YMM registers. NOTE(review): this wording fits the vinsert*128h forms; the vextract*128h forms presumably move the high half the other way - confirm.
2046 void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2047 void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
2048 void vextractf128h(XMMRegister dst, XMMRegister src);
2049 void vextracti128h(XMMRegister dst, XMMRegister src);
2050
2051 // Load/store high 128bit of YMM registers which does not destroy other half.
|