1655 void orq(Register dst, Register src); 1656 1657 // Pack with unsigned saturation 1658 void packuswb(XMMRegister dst, XMMRegister src); 1659 void packuswb(XMMRegister dst, Address src); 1660 void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1661 1662 // Permutation of 64bit words 1663 void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len); 1664 void vpermq(XMMRegister dst, XMMRegister src, int imm8); 1665 1666 void pause(); 1667 1668 // SSE4.2 string instructions 1669 void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8); 1670 void pcmpestri(XMMRegister xmm1, Address src, int imm8); 1671 1672 // SSE 4.1 extract 1673 void pextrd(Register dst, XMMRegister src, int imm8); 1674 void pextrq(Register dst, XMMRegister src, int imm8); 1675 1676 // SSE 4.1 insert 1677 void pinsrd(XMMRegister dst, Register src, int imm8); 1678 void pinsrq(XMMRegister dst, Register src, int imm8); 1679 1680 // SSE4.1 packed move 1681 void pmovzxbw(XMMRegister dst, XMMRegister src); 1682 void pmovzxbw(XMMRegister dst, Address src); 1683 1684 #ifndef _LP64 // no 32bit push/pop on amd64 1685 void popl(Address dst); 1686 #endif 1687 1688 #ifdef _LP64 1689 void popq(Address dst); 1690 #endif 1691 1692 void popcntl(Register dst, Address src); 1693 void popcntl(Register dst, Register src); 1694 1695 #ifdef _LP64 1696 void popcntq(Register dst, Address src); 1697 void popcntq(Register dst, Register src); 1698 #endif 1908 //====================VECTOR ARITHMETIC===================================== 1909 1910 // Add Packed Floating-Point Values 1911 void addpd(XMMRegister dst, XMMRegister src); 1912 void addps(XMMRegister dst, XMMRegister src); 1913 void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1914 void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1915 void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1916 void vaddps(XMMRegister dst, XMMRegister nds, 
Address src, int vector_len); 1917 1918 // Subtract Packed Floating-Point Values 1919 void subpd(XMMRegister dst, XMMRegister src); 1920 void subps(XMMRegister dst, XMMRegister src); 1921 void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1922 void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1923 void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1924 void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1925 1926 // Multiply Packed Floating-Point Values 1927 void mulpd(XMMRegister dst, XMMRegister src); 1928 void mulps(XMMRegister dst, XMMRegister src); 1929 void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1930 void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1931 void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1932 void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1933 1934 // Divide Packed Floating-Point Values 1935 void divpd(XMMRegister dst, XMMRegister src); 1936 void divps(XMMRegister dst, XMMRegister src); 1937 void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1938 void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1939 void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1940 void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1941 1942 // Sqrt Packed Floating-Point Values - Double precision only 1943 void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len); 1944 void vsqrtpd(XMMRegister dst, Address src, int vector_len); 1945 1946 // Bitwise Logical AND of Packed Floating-Point Values 1947 void andpd(XMMRegister dst, XMMRegister src); 1948 void andps(XMMRegister dst, XMMRegister src); 1949 void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1950 void vandps(XMMRegister dst, XMMRegister nds, 
XMMRegister src, int vector_len); 1951 void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1952 void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1953 1954 // Bitwise Logical XOR of Packed Floating-Point Values 1955 void xorpd(XMMRegister dst, XMMRegister src); 1956 void xorps(XMMRegister dst, XMMRegister src); 1957 void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1958 void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1959 void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1960 void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1961 1962 // Add horizontal packed integers 1963 void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1964 void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1965 void phaddw(XMMRegister dst, XMMRegister src); 1966 void phaddd(XMMRegister dst, XMMRegister src); 1967 1968 // Add packed integers 1969 void paddb(XMMRegister dst, XMMRegister src); 1970 void paddw(XMMRegister dst, XMMRegister src); 1971 void paddd(XMMRegister dst, XMMRegister src); 1972 void paddq(XMMRegister dst, XMMRegister src); 1973 void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 2028 void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len); 2029 void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len); 2030 void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); 2031 void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); 2032 void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); 2033 2034 // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs) 2035 void psraw(XMMRegister dst, int shift); 2036 void psrad(XMMRegister dst, int shift); 2037 void psraw(XMMRegister dst, XMMRegister 
shift); 2038 void psrad(XMMRegister dst, XMMRegister shift); 2039 void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len); 2040 void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len); 2041 void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); 2042 void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); 2043 2044 // And packed integers 2045 void pand(XMMRegister dst, XMMRegister src); 2046 void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 2047 void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 2048 2049 // Or packed integers 2050 void por(XMMRegister dst, XMMRegister src); 2051 void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 2052 void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 2053 2054 // Xor packed integers 2055 void pxor(XMMRegister dst, XMMRegister src); 2056 void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 2057 void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 2058 2059 // Copy low 128bit into high 128bit of YMM registers. 2060 void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src); 2061 void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src); 2062 void vextractf128h(XMMRegister dst, XMMRegister src); 2063 void vextracti128h(XMMRegister dst, XMMRegister src); 2064 2065 // Load/store high 128bit of YMM registers which does not destroy other half. 
2066 void vinsertf128h(XMMRegister dst, Address src); 2067 void vinserti128h(XMMRegister dst, Address src); | 1655 void orq(Register dst, Register src); 1656 1657 // Pack with unsigned saturation 1658 void packuswb(XMMRegister dst, XMMRegister src); 1659 void packuswb(XMMRegister dst, Address src); 1660 void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1661 1662 // Permutation of 64bit words 1663 void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len); 1664 void vpermq(XMMRegister dst, XMMRegister src, int imm8); 1665 1666 void pause(); 1667 1668 // SSE4.2 string instructions 1669 void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8); 1670 void pcmpestri(XMMRegister xmm1, Address src, int imm8); 1671 1672 // SSE 4.1 extract 1673 void pextrd(Register dst, XMMRegister src, int imm8); 1674 void pextrq(Register dst, XMMRegister src, int imm8); 1675 // SSE 2 extract 1676 void pextrw(Register dst, XMMRegister src, int imm8); 1677 1678 // SSE 4.1 insert 1679 void pinsrd(XMMRegister dst, Register src, int imm8); 1680 void pinsrq(XMMRegister dst, Register src, int imm8); 1681 // SSE 2 insert 1682 void pinsrw(XMMRegister dst, Register src, int imm8); 1683 1684 // SSE4.1 packed move 1685 void pmovzxbw(XMMRegister dst, XMMRegister src); 1686 void pmovzxbw(XMMRegister dst, Address src); 1687 1688 #ifndef _LP64 // no 32bit push/pop on amd64 1689 void popl(Address dst); 1690 #endif 1691 1692 #ifdef _LP64 1693 void popq(Address dst); 1694 #endif 1695 1696 void popcntl(Register dst, Address src); 1697 void popcntl(Register dst, Register src); 1698 1699 #ifdef _LP64 1700 void popcntq(Register dst, Address src); 1701 void popcntq(Register dst, Register src); 1702 #endif 1912 //====================VECTOR ARITHMETIC===================================== 1913 1914 // Add Packed Floating-Point Values 1915 void addpd(XMMRegister dst, XMMRegister src); 1916 void addps(XMMRegister dst, XMMRegister src); 1917 void vaddpd(XMMRegister dst, 
XMMRegister nds, XMMRegister src, int vector_len); 1918 void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1919 void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1920 void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1921 1922 // Subtract Packed Floating-Point Values 1923 void subpd(XMMRegister dst, XMMRegister src); 1924 void subps(XMMRegister dst, XMMRegister src); 1925 void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1926 void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1927 void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1928 void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1929 1930 // Multiply Packed Floating-Point Values 1931 void mulpd(XMMRegister dst, XMMRegister src); 1932 void mulpd(XMMRegister dst, Address src); 1933 void mulps(XMMRegister dst, XMMRegister src); 1934 void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1935 void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1936 void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1937 void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1938 1939 // Divide Packed Floating-Point Values 1940 void divpd(XMMRegister dst, XMMRegister src); 1941 void divps(XMMRegister dst, XMMRegister src); 1942 void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1943 void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1944 void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1945 void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1946 1947 // Sqrt Packed Floating-Point Values - Double precision only 1948 void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len); 1949 void vsqrtpd(XMMRegister dst, Address src, int 
vector_len); 1950 1951 // Bitwise Logical AND of Packed Floating-Point Values 1952 void andpd(XMMRegister dst, XMMRegister src); 1953 void andps(XMMRegister dst, XMMRegister src); 1954 void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1955 void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1956 void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1957 void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1958 1959 void unpckhpd(XMMRegister dst, XMMRegister src); 1960 void unpcklpd(XMMRegister dst, XMMRegister src); 1961 1962 // Bitwise Logical XOR of Packed Floating-Point Values 1963 void xorpd(XMMRegister dst, XMMRegister src); 1964 void xorps(XMMRegister dst, XMMRegister src); 1965 void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1966 void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1967 void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1968 void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 1969 1970 // Add horizontal packed integers 1971 void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1972 void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 1973 void phaddw(XMMRegister dst, XMMRegister src); 1974 void phaddd(XMMRegister dst, XMMRegister src); 1975 1976 // Add packed integers 1977 void paddb(XMMRegister dst, XMMRegister src); 1978 void paddw(XMMRegister dst, XMMRegister src); 1979 void paddd(XMMRegister dst, XMMRegister src); 1980 void paddq(XMMRegister dst, XMMRegister src); 1981 void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 2036 void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len); 2037 void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len); 2038 void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int 
vector_len); 2039 void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); 2040 void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); 2041 2042 // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs) 2043 void psraw(XMMRegister dst, int shift); 2044 void psrad(XMMRegister dst, int shift); 2045 void psraw(XMMRegister dst, XMMRegister shift); 2046 void psrad(XMMRegister dst, XMMRegister shift); 2047 void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len); 2048 void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len); 2049 void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); 2050 void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); 2051 2052 // And packed integers 2053 void pand(XMMRegister dst, XMMRegister src); 2054 void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 2055 void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 2056 2057 // Andn packed integers 2058 void pandn(XMMRegister dst, XMMRegister src); 2059 2060 // Or packed integers 2061 void por(XMMRegister dst, XMMRegister src); 2062 void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 2063 void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 2064 2065 // Xor packed integers 2066 void pxor(XMMRegister dst, XMMRegister src); 2067 void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); 2068 void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); 2069 2070 // Copy low 128bit into high 128bit of YMM registers. 
2071 void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src); 2072 void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src); 2073 void vextractf128h(XMMRegister dst, XMMRegister src); 2074 void vextracti128h(XMMRegister dst, XMMRegister src); 2075 2076 // Load/store high 128bit of YMM registers which does not destroy other half. 2077 void vinsertf128h(XMMRegister dst, Address src); 2078 void vinserti128h(XMMRegister dst, Address src); |