--- /dev/null 2017-01-22 10:16:57.869617664 -0800 +++ new/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.amd64/src/org/graalvm/compiler/lir/amd64/AMD64MathIntrinsicUnaryOp.java 2017-02-15 17:04:59.914371270 -0800 @@ -0,0 +1,3843 @@ +/* + * Copyright (c) 2011, 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package org.graalvm.compiler.lir.amd64; + +import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL; +import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG; +import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.STACK; +import static jdk.vm.ci.code.ValueUtil.asRegister; + +import org.graalvm.compiler.asm.Label; +import org.graalvm.compiler.asm.amd64.AMD64Address; +import org.graalvm.compiler.asm.amd64.AMD64Address.Scale; +import org.graalvm.compiler.asm.amd64.AMD64Assembler.ConditionFlag; +import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler; +import org.graalvm.compiler.core.common.LIRKind; +import org.graalvm.compiler.debug.GraalError; +import org.graalvm.compiler.lir.LIRInstructionClass; +import org.graalvm.compiler.lir.Opcode; +import org.graalvm.compiler.lir.asm.ArrayDataPointerConstant; +import org.graalvm.compiler.lir.asm.CompilationResultBuilder; +import org.graalvm.compiler.lir.gen.LIRGeneratorTool; + +import jdk.vm.ci.amd64.AMD64; +import jdk.vm.ci.amd64.AMD64.CPUFeature; +import jdk.vm.ci.amd64.AMD64Kind; +import jdk.vm.ci.code.Register; +import jdk.vm.ci.meta.AllocatableValue; +import jdk.vm.ci.meta.Value; + +public final class AMD64MathIntrinsicUnaryOp extends AMD64LIRInstruction { + public static final LIRInstructionClass TYPE = LIRInstructionClass.create(AMD64MathIntrinsicUnaryOp.class); + + public enum UnaryIntrinsicOpcode { + LOG, + LOG10, + SIN, + COS, + TAN, + EXP + } + + @Opcode private final UnaryIntrinsicOpcode opcode; + @Def protected Value result; + @Use protected Value input; + @Temp({REG, ILLEGAL}) protected Value xmm1Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value xmm2Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value xmm3Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value xmm4Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value xmm5Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value xmm6Temp = Value.ILLEGAL; + @Temp({REG, 
ILLEGAL}) protected Value xmm7Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value xmm8Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value xmm9Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value xmm10Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value gpr1Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value gpr2Temp = Value.ILLEGAL; + @Temp protected AllocatableValue rcxTemp; + @Temp({REG, ILLEGAL}) protected Value gpr4Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value gpr5Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value gpr6Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value gpr7Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value gpr8Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value gpr9Temp = Value.ILLEGAL; + @Temp({REG, ILLEGAL}) protected Value gpr10Temp = Value.ILLEGAL; + @Temp({STACK, ILLEGAL}) protected Value stackTemp = Value.ILLEGAL; + + CompilationResultBuilder internalCrb; + + public AMD64MathIntrinsicUnaryOp(LIRGeneratorTool tool, UnaryIntrinsicOpcode opcode, Value result, Value input, Value stackTemp) { + super(TYPE); + this.opcode = opcode; + this.result = result; + this.input = input; + if (opcode == UnaryIntrinsicOpcode.LOG || opcode == UnaryIntrinsicOpcode.LOG10 || + opcode == UnaryIntrinsicOpcode.SIN || opcode == UnaryIntrinsicOpcode.COS || + opcode == UnaryIntrinsicOpcode.TAN || opcode == UnaryIntrinsicOpcode.EXP) { + this.gpr1Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr2Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.rcxTemp = AMD64.rcx.asValue(LIRKind.value(AMD64Kind.QWORD)); + this.gpr4Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.xmm1Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + this.xmm2Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + this.xmm3Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + this.xmm4Temp = 
tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + this.xmm5Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + this.xmm6Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + this.xmm7Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + + if (opcode == UnaryIntrinsicOpcode.EXP) { + this.gpr5Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.xmm8Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + this.xmm9Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + this.xmm10Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + } + + if (opcode == UnaryIntrinsicOpcode.TAN) { + this.gpr5Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr6Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr7Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr8Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr9Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr10Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + } + + if (opcode == UnaryIntrinsicOpcode.SIN || opcode == UnaryIntrinsicOpcode.COS) { + this.gpr5Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr6Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr7Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr8Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr9Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.gpr10Temp = tool.newVariable(LIRKind.value(AMD64Kind.QWORD)); + this.xmm8Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + this.xmm9Temp = tool.newVariable(LIRKind.value(AMD64Kind.DOUBLE)); + } + + this.stackTemp = stackTemp; + } + } + + public AMD64MathIntrinsicUnaryOp(LIRGeneratorTool tool, UnaryIntrinsicOpcode opcode, Value result, Value input) { + this(tool, opcode, result, input, Value.ILLEGAL); + } + + private void setCrb(CompilationResultBuilder crb) { + internalCrb = crb; + } + + private AMD64Address 
externalAddress(ArrayDataPointerConstant curPtr) { + return (AMD64Address) internalCrb.recordDataReferenceInCode(curPtr); + } + + @Override + public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) { + switch (opcode) { + case LOG: + logIntrinsic(asRegister(result, AMD64Kind.DOUBLE), asRegister(input, AMD64Kind.DOUBLE), crb, masm); + break; + case LOG10: + log10Intrinsic(asRegister(result, AMD64Kind.DOUBLE), asRegister(input, AMD64Kind.DOUBLE), crb, masm); + break; + case SIN: + sinIntrinsic(asRegister(result, AMD64Kind.DOUBLE), asRegister(input, AMD64Kind.DOUBLE), crb, masm); + break; + case COS: + cosIntrinsic(asRegister(result, AMD64Kind.DOUBLE), asRegister(input, AMD64Kind.DOUBLE), crb, masm); + break; + case TAN: + tanIntrinsic(asRegister(result, AMD64Kind.DOUBLE), asRegister(input, AMD64Kind.DOUBLE), crb, masm); + break; + case EXP: + expIntrinsic(asRegister(result, AMD64Kind.DOUBLE), asRegister(input, AMD64Kind.DOUBLE), crb, masm); + break; + default: + throw GraalError.shouldNotReachHere(); + } + } + + private static int[] logTwoTable = { + 0xfefa3800, 0x3fe62e42, 0x93c76730, 0x3d2ef357, 0xaa241800, + 0x3fe5ee82, 0x0cda46be, 0x3d220238, 0x5c364800, 0x3fe5af40, + 0xac10c9fb, 0x3d2dfa63, 0x26bb8c00, 0x3fe5707a, 0xff3303dd, + 0x3d09980b, 0x26867800, 0x3fe5322e, 0x5d257531, 0x3d05ccc4, + 0x835a5000, 0x3fe4f45a, 0x6d93b8fb, 0xbd2e6c51, 0x6f970c00, + 0x3fe4b6fd, 0xed4c541c, 0x3cef7115, 0x27e8a400, 0x3fe47a15, + 0xf94d60aa, 0xbd22cb6a, 0xf2f92400, 0x3fe43d9f, 0x481051f7, + 0xbcfd984f, 0x2125cc00, 0x3fe4019c, 0x30f0c74c, 0xbd26ce79, + 0x0c36c000, 0x3fe3c608, 0x7cfe13c2, 0xbd02b736, 0x17197800, + 0x3fe38ae2, 0xbb5569a4, 0xbd218b7a, 0xad9d8c00, 0x3fe35028, + 0x9527e6ac, 0x3d10b83f, 0x44340800, 0x3fe315da, 0xc5a0ed9c, + 0xbd274e93, 0x57b0e000, 0x3fe2dbf5, 0x07b9dc11, 0xbd17a6e5, + 0x6d0ec000, 0x3fe2a278, 0xe797882d, 0x3d206d2b, 0x1134dc00, + 0x3fe26962, 0x05226250, 0xbd0b61f1, 0xd8bebc00, 0x3fe230b0, + 0x6e48667b, 0x3d12fc06, 0x5fc61800, 
0x3fe1f863, 0xc9fe81d3, + 0xbd2a7242, 0x49ae6000, 0x3fe1c078, 0xed70e667, 0x3cccacde, + 0x40f23c00, 0x3fe188ee, 0xf8ab4650, 0x3d14cc4e, 0xf6f29800, + 0x3fe151c3, 0xa293ae49, 0xbd2edd97, 0x23c75c00, 0x3fe11af8, + 0xbb9ddcb2, 0xbd258647, 0x8611cc00, 0x3fe0e489, 0x07801742, + 0x3d1c2998, 0xe2d05400, 0x3fe0ae76, 0x887e7e27, 0x3d1f486b, + 0x0533c400, 0x3fe078bf, 0x41edf5fd, 0x3d268122, 0xbe760400, + 0x3fe04360, 0xe79539e0, 0xbd04c45f, 0xe5b20800, 0x3fe00e5a, + 0xb1727b1c, 0xbd053ba3, 0xaf7a4800, 0x3fdfb358, 0x3c164935, + 0x3d0085fa, 0xee031800, 0x3fdf4aa7, 0x6f014a8b, 0x3d12cde5, + 0x56b41000, 0x3fdee2a1, 0x5a470251, 0x3d2f27f4, 0xc3ddb000, + 0x3fde7b42, 0x5372bd08, 0xbd246550, 0x1a272800, 0x3fde148a, + 0x07322938, 0xbd1326b2, 0x484c9800, 0x3fddae75, 0x60dc616a, + 0xbd1ea42d, 0x46def800, 0x3fdd4902, 0xe9a767a8, 0x3d235baf, + 0x18064800, 0x3fdce42f, 0x3ec7a6b0, 0xbd0797c3, 0xc7455800, + 0x3fdc7ff9, 0xc15249ae, 0xbd29b6dd, 0x693fa000, 0x3fdc1c60, + 0x7fe8e180, 0x3d2cec80, 0x1b80e000, 0x3fdbb961, 0xf40a666d, + 0x3d27d85b, 0x04462800, 0x3fdb56fa, 0x2d841995, 0x3d109525, + 0x5248d000, 0x3fdaf529, 0x52774458, 0xbd217cc5, 0x3c8ad800, + 0x3fda93ed, 0xbea77a5d, 0x3d1e36f2, 0x0224f800, 0x3fda3344, + 0x7f9d79f5, 0x3d23c645, 0xea15f000, 0x3fd9d32b, 0x10d0c0b0, + 0xbd26279e, 0x43135800, 0x3fd973a3, 0xa502d9f0, 0xbd152313, + 0x635bf800, 0x3fd914a8, 0x2ee6307d, 0xbd1766b5, 0xa88b3000, + 0x3fd8b639, 0xe5e70470, 0xbd205ae1, 0x776dc800, 0x3fd85855, + 0x3333778a, 0x3d2fd56f, 0x3bd81800, 0x3fd7fafa, 0xc812566a, + 0xbd272090, 0x687cf800, 0x3fd79e26, 0x2efd1778, 0x3d29ec7d, + 0x76c67800, 0x3fd741d8, 0x49dc60b3, 0x3d2d8b09, 0xe6af1800, + 0x3fd6e60e, 0x7c222d87, 0x3d172165, 0x3e9c6800, 0x3fd68ac8, + 0x2756eba0, 0x3d20a0d3, 0x0b3ab000, 0x3fd63003, 0xe731ae00, + 0xbd2db623, 0xdf596000, 0x3fd5d5bd, 0x08a465dc, 0xbd0a0b2a, + 0x53c8d000, 0x3fd57bf7, 0xee5d40ef, 0x3d1faded, 0x0738a000, + 0x3fd522ae, 0x8164c759, 0x3d2ebe70, 0x9e173000, 0x3fd4c9e0, + 0x1b0ad8a4, 0xbd2e2089, 0xc271c800, 0x3fd4718d, 
0x0967d675, + 0xbd2f27ce, 0x23d5e800, 0x3fd419b4, 0xec90e09d, 0x3d08e436, + 0x77333000, 0x3fd3c252, 0xb606bd5c, 0x3d183b54, 0x76be1000, + 0x3fd36b67, 0xb0f177c8, 0x3d116ecd, 0xe1d36000, 0x3fd314f1, + 0xd3213cb8, 0xbd28e27a, 0x7cdc9000, 0x3fd2bef0, 0x4a5004f4, + 0x3d2a9cfa, 0x1134d800, 0x3fd26962, 0xdf5bb3b6, 0x3d2c93c1, + 0x6d0eb800, 0x3fd21445, 0xba46baea, 0x3d0a87de, 0x635a6800, + 0x3fd1bf99, 0x5147bdb7, 0x3d2ca6ed, 0xcbacf800, 0x3fd16b5c, + 0xf7a51681, 0x3d2b9acd, 0x8227e800, 0x3fd1178e, 0x63a5f01c, + 0xbd2c210e, 0x67616000, 0x3fd0c42d, 0x163ceae9, 0x3d27188b, + 0x604d5800, 0x3fd07138, 0x16ed4e91, 0x3cf89cdb, 0x5626c800, + 0x3fd01eae, 0x1485e94a, 0xbd16f08c, 0x6cb3b000, 0x3fcf991c, + 0xca0cdf30, 0x3d1bcbec, 0xe4dd0000, 0x3fcef5ad, 0x65bb8e11, + 0xbcca2115, 0xffe71000, 0x3fce530e, 0x6041f430, 0x3cc21227, + 0xb0d49000, 0x3fcdb13d, 0xf715b035, 0xbd2aff2a, 0xf2656000, + 0x3fcd1037, 0x75b6f6e4, 0xbd084a7e, 0xc6f01000, 0x3fcc6ffb, + 0xc5962bd2, 0xbcf1ec72, 0x383be000, 0x3fcbd087, 0x595412b6, + 0xbd2d4bc4, 0x575bd000, 0x3fcb31d8, 0x4eace1aa, 0xbd0c358d, + 0x3c8ae000, 0x3fca93ed, 0x50562169, 0xbd287243, 0x07089000, + 0x3fc9f6c4, 0x6865817a, 0x3d29904d, 0xdcf70000, 0x3fc95a5a, + 0x58a0ff6f, 0x3d07f228, 0xeb390000, 0x3fc8beaf, 0xaae92cd1, + 0xbd073d54, 0x6551a000, 0x3fc823c1, 0x9a631e83, 0x3d1e0ddb, + 0x85445000, 0x3fc7898d, 0x70914305, 0xbd1c6610, 0x8b757000, + 0x3fc6f012, 0xe59c21e1, 0xbd25118d, 0xbe8c1000, 0x3fc6574e, + 0x2c3c2e78, 0x3d19cf8b, 0x6b544000, 0x3fc5bf40, 0xeb68981c, + 0xbd127023, 0xe4a1b000, 0x3fc527e5, 0xe5697dc7, 0x3d2633e8, + 0x8333b000, 0x3fc4913d, 0x54fdb678, 0x3d258379, 0xa5993000, + 0x3fc3fb45, 0x7e6a354d, 0xbd2cd1d8, 0xb0159000, 0x3fc365fc, + 0x234b7289, 0x3cc62fa8, 0x0c868000, 0x3fc2d161, 0xcb81b4a1, + 0x3d039d6c, 0x2a49c000, 0x3fc23d71, 0x8fd3df5c, 0x3d100d23, + 0x7e23f000, 0x3fc1aa2b, 0x44389934, 0x3d2ca78e, 0x8227e000, + 0x3fc1178e, 0xce2d07f2, 0x3d21ef78, 0xb59e4000, 0x3fc08598, + 0x7009902c, 0xbd27e5dd, 0x39dbe000, 0x3fbfe891, 0x4fa10afd, + 
0xbd2534d6, 0x830a2000, 0x3fbec739, 0xafe645e0, 0xbd2dc068, + 0x63844000, 0x3fbda727, 0x1fa71733, 0x3d1a8940, 0x01bc4000, + 0x3fbc8858, 0xc65aacd3, 0x3d2646d1, 0x8dad6000, 0x3fbb6ac8, + 0x2bf768e5, 0xbd139080, 0x40b1c000, 0x3fba4e76, 0xb94407c8, + 0xbd0e42b6, 0x5d594000, 0x3fb9335e, 0x3abd47da, 0x3d23115c, + 0x2f40e000, 0x3fb8197e, 0xf96ffdf7, 0x3d0f80dc, 0x0aeac000, + 0x3fb700d3, 0xa99ded32, 0x3cec1e8d, 0x4d97a000, 0x3fb5e95a, + 0x3c5d1d1e, 0xbd2c6906, 0x5d208000, 0x3fb4d311, 0x82f4e1ef, + 0xbcf53a25, 0xa7d1e000, 0x3fb3bdf5, 0xa5db4ed7, 0x3d2cc85e, + 0xa4472000, 0x3fb2aa04, 0xae9c697d, 0xbd20b6e8, 0xd1466000, + 0x3fb1973b, 0x560d9e9b, 0xbd25325d, 0xb59e4000, 0x3fb08598, + 0x7009902c, 0xbd17e5dd, 0xc006c000, 0x3faeea31, 0x4fc93b7b, + 0xbd0e113e, 0xcdddc000, 0x3faccb73, 0x47d82807, 0xbd1a68f2, + 0xd0fb0000, 0x3faaaef2, 0x353bb42e, 0x3d20fc1a, 0x149fc000, + 0x3fa894aa, 0xd05a267d, 0xbd197995, 0xf2d4c000, 0x3fa67c94, + 0xec19afa2, 0xbd029efb, 0xd42e0000, 0x3fa466ae, 0x75bdfd28, + 0xbd2c1673, 0x2f8d0000, 0x3fa252f3, 0xe021b67b, 0x3d283e9a, + 0x89e74000, 0x3fa0415d, 0x5cf1d753, 0x3d0111c0, 0xec148000, + 0x3f9c63d2, 0x3f9eb2f3, 0x3d2578c6, 0x28c90000, 0x3f984925, + 0x325a0c34, 0xbd2aa0ba, 0x25980000, 0x3f9432a9, 0x928637fe, + 0x3d098139, 0x58938000, 0x3f902056, 0x06e2f7d2, 0xbd23dc5b, + 0xa3890000, 0x3f882448, 0xda74f640, 0xbd275577, 0x75890000, + 0x3f801015, 0x999d2be8, 0xbd10c76b, 0x59580000, 0x3f700805, + 0xcb31c67b, 0x3d2166af, 0x00000000, 0x00000000, 0x00000000, + 0x80000000 + }; + + private static int[] logTwoData = { + 0xfefa3800, 0x3fa62e42, 0x93c76730, 0x3ceef357 + }; + + private static int[] coeffLogTwoData = { + 0x92492492, 0x3fc24924, 0x00000000, 0xbfd00000, 0x3d6fb175, + 0xbfc5555e, 0x55555555, 0x3fd55555, 0x9999999a, 0x3fc99999, + 0x00000000, 0xbfe00000 + }; + + /* + * Copyright (c) 2014, 2016, Intel Corporation. All rights reserved. 
Intel Math Library (LIBM)
     * Source Code
     *
     * ALGORITHM DESCRIPTION - LOG() ---------------------
     *
     * x=2^k * mx, mx in [1,2)
     *
     * Get B~1/mx based on the output of rcpps instruction (B0) B = int((B0*2^7+0.5))/2^7
     *
     * Reduced argument: r=B*mx-1.0 (computed accurately in high and low parts)
     *
     * Result: k*log(2) - log(B) + p(r) if |x-1| >= small value (2^-6) and p(r) is a degree 7
     * polynomial -log(B) read from data table (high, low parts) Result is formed from high and low
     * parts.
     *
     * Special cases:
     *
     * log(NaN) = quiet NaN, and raise invalid exception
     *
     * log(+INF) = that INF
     *
     * log(0) = -INF with divide-by-zero exception raised
     *
     * log(1) = +0
     *
     * log(x) = NaN with invalid exception raised if x < -0, including -INF
     *
     */

    /**
     * Emits the instruction sequence for the LOG intrinsic described in the comment above.
     *
     * Every path through the sequence writes {@code dest} and ends at label {@code bb8}.
     *
     * @param dest XMM register that receives the result
     * @param value XMM register holding the argument
     * @param crb builder used to resolve {@code stackTemp} to an address and to record the
     *            references to the constant tables
     * @param masm assembler the instructions are emitted into
     */
    public void logIntrinsic(Register dest, Register value, CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        // Constant data, each wrapped with the argument 16 (presumably the alignment in bytes --
        // confirm against ArrayDataPointerConstant).
        ArrayDataPointerConstant logTwoTablePtr = new ArrayDataPointerConstant(logTwoTable, 16);
        ArrayDataPointerConstant logTwoDataPtr = new ArrayDataPointerConstant(logTwoData, 16);
        ArrayDataPointerConstant coeffLogTwoDataPtr = new ArrayDataPointerConstant(coeffLogTwoData, 16);

        Label bb0 = new Label();
        Label bb1 = new Label();
        Label bb2 = new Label();
        Label bb3 = new Label();
        Label bb4 = new Label();
        Label bb5 = new Label();
        Label bb6 = new Label();
        Label bb7 = new Label();
        Label bb8 = new Label();

        Register gpr1 = asRegister(gpr1Temp, AMD64Kind.QWORD);
        Register gpr2 = asRegister(gpr2Temp, AMD64Kind.QWORD);
        // gpr3 is the fixed rcx temp.
        Register gpr3 = asRegister(rcxTemp, AMD64Kind.QWORD);
        Register gpr4 = asRegister(gpr4Temp, AMD64Kind.QWORD);

        Register temp1 = asRegister(xmm1Temp, AMD64Kind.DOUBLE);
        Register temp2 = asRegister(xmm2Temp, AMD64Kind.DOUBLE);
        Register temp3 = asRegister(xmm3Temp, AMD64Kind.DOUBLE);
        Register temp4 = asRegister(xmm4Temp, AMD64Kind.DOUBLE);
        Register temp5 = asRegister(xmm5Temp, AMD64Kind.DOUBLE);
        Register temp6 = asRegister(xmm6Temp, AMD64Kind.DOUBLE);
        Register temp7 = asRegister(xmm7Temp, AMD64Kind.DOUBLE);

        AMD64Address stackSlot = (AMD64Address) crb.asAddress(stackTemp);

        // externalAddress() below needs the builder.
        setCrb(crb);
        // Keep a copy of the argument on the stack; the special-case path at bb0 reloads it.
        masm.movdq(stackSlot, value);
        if (dest.encoding != value.encoding) {
            masm.movdqu(dest, value);
        }
        // temp2 = 0x3ff0000000000000, the bit pattern of the double 1.0.
        masm.movq(gpr1, 0x3ff0000000000000L);
        masm.movdq(temp2, gpr1);
        masm.movq(gpr3, 0x77f0000000000000L);
        masm.movdq(temp3, gpr3);
        // temp4 = 32768 (0x8000) in the low dword.
        masm.movl(gpr2, 32768);
        masm.movdl(temp4, gpr2);
        // temp5 = bit mask 0xffffe00000000000.
        masm.movq(gpr2, 0xffffe00000000000L);
        masm.movdq(temp5, gpr2);
        masm.movdqu(temp1, value);
        // gpr1 = bits 48..63 of the argument: sign, exponent and the top four mantissa bits.
        masm.pextrw(gpr1, dest, 3);
        masm.por(dest, temp2);
        // gpr2 = 16352 (0x3fe0); subtracted from the masked exponent word after bb1.
        masm.movl(gpr2, 16352);
        masm.psrlq(dest, 27);
        masm.leaq(gpr4, externalAddress(logTwoTablePtr));
        masm.psrld(dest, 2);
        // Reciprocal approximation (the B0 of the algorithm description).
        masm.rcpps(dest, dest);
        // Shift left then right by 12: clears the top 12 bits (sign and exponent) of temp1.
        masm.psllq(temp1, 12);
        // 0xE4 is the identity shuffle: temp6 = temp5.
        masm.pshufd(temp6, temp5, 0xE4);
        masm.psrlq(temp1, 12);
        // Unsigned test of (top word - 16) against 32736 (0x7fe0). It is taken when the top word
        // is below 0x0010 (the subtraction wraps) or at least 0x7ff0, i.e. for everything that is
        // not a positive normal number.
        masm.subl(gpr1, 16);
        masm.cmpl(gpr1, 32736);
        masm.jcc(ConditionFlag.AboveEqual, bb0);

        // Main path. Also entered from bb3 once a non-zero tiny argument has been rescaled.
        masm.bind(bb1);
        masm.paddd(dest, temp4);
        masm.por(temp1, temp3);
        masm.movdl(gpr3, dest);
        masm.psllq(dest, 29);
        masm.pand(temp5, temp1);
        masm.pand(dest, temp6);
        masm.subsd(temp1, temp5);
        masm.mulpd(temp5, dest);
        // temp7 = (double) ((gpr1 & 0x7ff0) - gpr2)
        masm.andl(gpr1, 32752);
        masm.subl(gpr1, gpr2);
        masm.cvtsi2sdl(temp7, gpr1);
        masm.mulsd(temp1, dest);
        masm.movdq(temp6, externalAddress(logTwoDataPtr)); // 0xfefa3800, 0x3fa62e42
        masm.movdqu(temp3, externalAddress(coeffLogTwoDataPtr)); // 0x92492492, 0x3fc24924,
                                                                 // 0x00000000, 0xbfd00000
        masm.subsd(temp5, temp2);
        // gpr3 = ((low dword of dest) & 0xff0000) >> 12: byte offset of the 16-byte entry that is
        // loaded from logTwoTable.
        masm.andl(gpr3, 16711680);
        masm.shrl(gpr3, 12);
        masm.movdqu(dest, new AMD64Address(gpr4, gpr3, Scale.Times1, 0));
        masm.leaq(gpr4, externalAddress(coeffLogTwoDataPtr));
        masm.movdqu(temp4, new AMD64Address(gpr4, 16)); // 0x3d6fb175, 0xbfc5555e,
                                                        // 0x55555555, 0x3fd55555
        masm.addsd(temp1, temp5);
        masm.movdqu(temp2, new AMD64Address(gpr4, 32)); // 0x9999999a, 0x3fc99999,
                                                        // 0x00000000, 0xbfe00000
        masm.mulsd(temp6, temp7);
        // Duplicate the low double of temp1 into both halves of temp5; movddup needs SSE3.
        if (masm.supports(CPUFeature.SSE3)) {
            masm.movddup(temp5, temp1);
        } else {
            masm.movdqu(temp5, temp1);
            masm.movlhps(temp5, temp5);
        }
        masm.leaq(gpr4, externalAddress(logTwoDataPtr));
        masm.mulsd(temp7, new AMD64Address(gpr4, 8)); // 0x93c76730, 0x3ceef357
        masm.mulsd(temp3, temp1);
        masm.addsd(dest, temp6);
        masm.mulpd(temp4, temp5);
        masm.mulpd(temp5, temp5);
        // Same duplication, of the low double of dest into temp6.
        if (masm.supports(CPUFeature.SSE3)) {
            masm.movddup(temp6, dest);
        } else {
            masm.movdqu(temp6, dest);
            masm.movlhps(temp6, temp6);
        }
        masm.addsd(dest, temp1);
        masm.addpd(temp4, temp2);
        masm.mulpd(temp3, temp5);
        masm.subsd(temp6, dest);
        masm.mulsd(temp4, temp1);
        // 0xEE copies the high qword of the source into both halves.
        masm.pshufd(temp2, dest, 0xEE);
        masm.addsd(temp1, temp6);
        masm.mulsd(temp5, temp5);
        masm.addsd(temp7, temp2);
        masm.addpd(temp4, temp3);
        masm.addsd(temp1, temp7);
        masm.mulpd(temp4, temp5);
        masm.addsd(temp1, temp4);
        masm.pshufd(temp5, temp4, 0xEE);
        masm.addsd(temp1, temp5);
        masm.addsd(dest, temp1);
        masm.jmp(bb8);

        // Special cases: reload the unmodified argument and restore its top word in gpr1.
        masm.bind(bb0);
        masm.movdq(dest, stackSlot);
        masm.movdq(temp1, stackSlot);
        masm.addl(gpr1, 16);
        // Top word >= 0x8000: the sign bit is set.
        masm.cmpl(gpr1, 32768);
        masm.jcc(ConditionFlag.AboveEqual, bb2);

        // Top word < 0x0010: zero exponent field (zero or denormal).
        masm.cmpl(gpr1, 16);
        masm.jcc(ConditionFlag.Below, bb3);

        // Remaining case, top word in [0x7ff0, 0x7fff] (+INF or NaN), and the NaN exits of bb5:
        // the result is x + x.
        masm.bind(bb4);
        masm.addsd(dest, dest);
        masm.jmp(bb8);

        // Reached from bb2 with the flags of its cmpl still live: doubled high dword above
        // 0xffe00000, or equal with a non-zero low dword, goes to bb4; otherwise to bb6.
        masm.bind(bb5);
        masm.jcc(ConditionFlag.Above, bb4);

        masm.cmpl(gpr3, 0);
        masm.jcc(ConditionFlag.Above, bb4);

        masm.jmp(bb6);

        // Zero exponent field: compute 0.0 + x and branch to bb7 if all 64 bits are zero.
        masm.bind(bb3);
        masm.xorpd(temp1, temp1);
        masm.addsd(temp1, dest);
        masm.movdl(gpr3, temp1);
        masm.psrlq(temp1, 32);
        masm.movdl(gpr2, temp1);
        masm.orl(gpr3, gpr2);
        masm.cmpl(gpr3, 0);
        masm.jcc(ConditionFlag.Equal, bb7);

        // Non-zero: multiply by the double whose top word is 18416 (0x47f0), i.e. 2^128, then
        // repeat the setup done before bb1 on the scaled value. gpr2 = 18416 takes the place of
        // 16352 (presumably so the subtraction after bb1 compensates for the scaling).
        masm.xorpd(temp1, temp1);
        masm.movl(gpr1, 18416);
        masm.pinsrw(temp1, gpr1, 3);
        masm.mulsd(dest, temp1);
        masm.movdqu(temp1, dest);
        masm.pextrw(gpr1, dest, 3);
        masm.por(dest, temp2);
        masm.psrlq(dest, 27);
        masm.movl(gpr2, 18416);
        masm.psrld(dest, 2);
        masm.rcpps(dest, dest);
        masm.psllq(temp1, 12);
        masm.pshufd(temp6, temp5, 0xE4);
        masm.psrlq(temp1, 12);
        masm.jmp(bb1);

        // Sign bit set: gpr3 = low dword, gpr2 = high dword doubled (shifts the sign bit out).
        masm.bind(bb2);
        masm.movdl(gpr3, temp1);
        masm.psrlq(temp1, 32);
        masm.movdl(gpr2, temp1);
        masm.addl(gpr2, gpr2);
        // -2097152 is 0xffe00000: at or above it the exponent field is all ones.
        masm.cmpl(gpr2, -2097152);
        masm.jcc(ConditionFlag.AboveEqual, bb5);

        // All remaining bits zero (the argument is -0.0): treated like zero.
        masm.orl(gpr3, gpr2);
        masm.cmpl(gpr3, 0);
        masm.jcc(ConditionFlag.Equal, bb7);

        // Result is 0.0 multiplied by the double whose top word is 32752 (0x7ff0, +INF).
        masm.bind(bb6);
        masm.xorpd(temp1, temp1);
        masm.xorpd(dest, dest);
        masm.movl(gpr1, 32752);
        masm.pinsrw(temp1, gpr1, 3);
        masm.mulsd(dest, temp1);
        masm.jmp(bb8);

        // Result is the double whose top word is 49136 (0xbff0, -1.0) divided by 0.0.
        masm.bind(bb7);
        masm.xorpd(temp1, temp1);
        masm.xorpd(dest, dest);
        masm.movl(gpr1, 49136);
        masm.pinsrw(dest, gpr1, 3);
        masm.divsd(dest, temp1);

        masm.bind(bb8);
    }

    // 16-byte constant loaded into temp5 at the start of log10Intrinsic.
    private static int[] highmaskLogTen = {
                    0xf8000000, 0xffffffff, 0x00000000, 0xffffe000
    };

    // 16-byte constant read by log10Intrinsic, both as a whole and from byte offset 8.
    private static int[] logTenE = {
                    0x00000000, 0x3fdbc000, 0xbf2e4108, 0x3f5a7a6c
    };

    // Lookup table indexed by log10Intrinsic.
    private static int[] logTenTable = {
                    0x509f7800, 0x3fd34413, 0x1f12b358, 0x3d1fef31, 0x80333400,
                    0x3fd32418, 0xc671d9d0, 0xbcf542bf, 0x51195000, 0x3fd30442,
                    0x78a4b0c3, 0x3d18216a, 0x6fc79400, 0x3fd2e490, 0x80fa389d,
                    0xbc902869, 0x89d04000, 0x3fd2c502, 0x75c2f564, 0x3d040754,
                    0x4ddd1c00, 0x3fd2a598, 0xd219b2c3, 0xbcfa1d84, 0x6baa7c00,
                    0x3fd28651, 0xfd9abec1, 0x3d1be6d3, 0x94028800, 0x3fd2672d,
                    0xe289a455, 0xbd1ede5e, 0x78b86400, 0x3fd2482c, 0x6734d179,
                    0x3d1fe79b, 0xcca3c800, 0x3fd2294d, 0x981a40b8, 0xbced34ea,
                    0x439c5000, 0x3fd20a91, 0xcc392737, 0xbd1a9cc3, 0x92752c00,
                    0x3fd1ebf6, 0x03c9afe7, 0x3d1e98f8, 0x6ef8dc00, 0x3fd1cd7d,
                    0x71dae7f4, 0x3d08a86c, 0x8fe4dc00, 0x3fd1af25, 0xee9185a1,
                    0xbcff3412, 0xace59400, 0x3fd190ee, 0xc2cab353, 0x3cf17ed9,
                    0x7e925000, 0x3fd172d8, 0x6952c1b2, 0x3cf1521c, 0xbe694400,
                    0x3fd154e2, 0xcacb79ca, 0xbd0bdc78, 0x26cbac00, 0x3fd1370d,
                    0xf71f4de1, 0xbd01f8be, 0x72fa0800, 0x3fd11957, 0x55bf910b,
                    0x3c946e2b, 0x5f106000, 0x3fd0fbc1, 0x39e639c1, 0x3d14a84b,
                    0xa802a800, 0x3fd0de4a, 0xd3f31d5d, 0xbd178385, 0x0b992000,
                    0x3fd0c0f3, 0x3843106f, 0xbd1f602f, 0x486ce800,
0x3fd0a3ba, + 0x8819497c, 0x3cef987a, 0x1de49400, 0x3fd086a0, 0x1caa0467, + 0x3d0faec7, 0x4c30cc00, 0x3fd069a4, 0xa4424372, 0xbd1618fc, + 0x94490000, 0x3fd04cc6, 0x946517d2, 0xbd18384b, 0xb7e84000, + 0x3fd03006, 0xe0109c37, 0xbd19a6ac, 0x798a0c00, 0x3fd01364, + 0x5121e864, 0xbd164cf7, 0x38ce8000, 0x3fcfedbf, 0x46214d1a, + 0xbcbbc402, 0xc8e62000, 0x3fcfb4ef, 0xdab93203, 0x3d1e0176, + 0x2cb02800, 0x3fcf7c5a, 0x2a2ea8e4, 0xbcfec86a, 0xeeeaa000, + 0x3fcf43fd, 0xc18e49a4, 0x3cf110a8, 0x9bb6e800, 0x3fcf0bda, + 0x923cc9c0, 0xbd15ce99, 0xc093f000, 0x3fced3ef, 0x4d4b51e9, + 0x3d1a04c7, 0xec58f800, 0x3fce9c3c, 0x163cad59, 0x3cac8260, + 0x9a907000, 0x3fce2d7d, 0x3fa93646, 0x3ce4a1c0, 0x37311000, + 0x3fcdbf99, 0x32abd1fd, 0x3d07ea9d, 0x6744b800, 0x3fcd528c, + 0x4dcbdfd4, 0xbd1b08e2, 0xe36de800, 0x3fcce653, 0x0b7b7f7f, + 0xbd1b8f03, 0x77506800, 0x3fcc7aec, 0xa821c9fb, 0x3d13c163, + 0x00ff8800, 0x3fcc1053, 0x536bca76, 0xbd074ee5, 0x70719800, + 0x3fcba684, 0xd7da9b6b, 0xbd1fbf16, 0xc6f8d800, 0x3fcb3d7d, + 0xe2220bb3, 0x3d1a295d, 0x16c15800, 0x3fcad53c, 0xe724911e, + 0xbcf55822, 0x82533800, 0x3fca6dbc, 0x6d982371, 0x3cac567c, + 0x3c19e800, 0x3fca06fc, 0x84d17d80, 0x3d1da204, 0x85ef8000, + 0x3fc9a0f8, 0x54466a6a, 0xbd002204, 0xb0ac2000, 0x3fc93bae, + 0xd601fd65, 0x3d18840c, 0x1bb9b000, 0x3fc8d71c, 0x7bf58766, + 0xbd14f897, 0x34aae800, 0x3fc8733e, 0x3af6ac24, 0xbd0f5c45, + 0x76d68000, 0x3fc81012, 0x4303e1a1, 0xbd1f9a80, 0x6af57800, + 0x3fc7ad96, 0x43fbcb46, 0x3cf4c33e, 0xa6c51000, 0x3fc74bc7, + 0x70f0eac5, 0xbd192e3b, 0xccab9800, 0x3fc6eaa3, 0xc0093dfe, + 0xbd0faf15, 0x8b60b800, 0x3fc68a28, 0xde78d5fd, 0xbc9ea4ee, + 0x9d987000, 0x3fc62a53, 0x962bea6e, 0xbd194084, 0xc9b0e800, + 0x3fc5cb22, 0x888dd999, 0x3d1fe201, 0xe1634800, 0x3fc56c93, + 0x16ada7ad, 0x3d1b1188, 0xc176c000, 0x3fc50ea4, 0x4159b5b5, + 0xbcf09c08, 0x51766000, 0x3fc4b153, 0x84393d23, 0xbcf6a89c, + 0x83695000, 0x3fc4549d, 0x9f0b8bbb, 0x3d1c4b8c, 0x538d5800, + 0x3fc3f881, 0xf49df747, 0x3cf89b99, 0xc8138000, 0x3fc39cfc, + 
0xd503b834, 0xbd13b99f, 0xf0df0800, 0x3fc3420d, 0xf011b386, + 0xbd05d8be, 0xe7466800, 0x3fc2e7b2, 0xf39c7bc2, 0xbd1bb94e, + 0xcdd62800, 0x3fc28de9, 0x05e6d69b, 0xbd10ed05, 0xd015d800, + 0x3fc234b0, 0xe29b6c9d, 0xbd1ff967, 0x224ea800, 0x3fc1dc06, + 0x727711fc, 0xbcffb30d, 0x01540000, 0x3fc183e8, 0x39786c5a, + 0x3cc23f57, 0xb24d9800, 0x3fc12c54, 0xc905a342, 0x3d003a1d, + 0x82835800, 0x3fc0d54a, 0x9b9920c0, 0x3d03b25a, 0xc72ac000, + 0x3fc07ec7, 0x46f26a24, 0x3cf0fa41, 0xdd35d800, 0x3fc028ca, + 0x41d9d6dc, 0x3d034a65, 0x52474000, 0x3fbfa6a4, 0x44f66449, + 0x3d19cad3, 0x2da3d000, 0x3fbefcb8, 0x67832999, 0x3d18400f, + 0x32a10000, 0x3fbe53ce, 0x9c0e3b1a, 0xbcff62fd, 0x556b7000, + 0x3fbdabe3, 0x02976913, 0xbcf8243b, 0x97e88000, 0x3fbd04f4, + 0xec793797, 0x3d1c0578, 0x09647000, 0x3fbc5eff, 0x05fc0565, + 0xbd1d799e, 0xc6426000, 0x3fbbb9ff, 0x4625f5ed, 0x3d1f5723, + 0xf7afd000, 0x3fbb15f3, 0xdd5aae61, 0xbd1a7e1e, 0xd358b000, + 0x3fba72d8, 0x3314e4d3, 0x3d17bc91, 0x9b1f5000, 0x3fb9d0ab, + 0x9a4d514b, 0x3cf18c9b, 0x9cd4e000, 0x3fb92f69, 0x7e4496ab, + 0x3cf1f96d, 0x31f4f000, 0x3fb88f10, 0xf56479e7, 0x3d165818, + 0xbf628000, 0x3fb7ef9c, 0x26bf486d, 0xbd1113a6, 0xb526b000, + 0x3fb7510c, 0x1a1c3384, 0x3ca9898d, 0x8e31e000, 0x3fb6b35d, + 0xb3875361, 0xbd0661ac, 0xd01de000, 0x3fb6168c, 0x2a7cacfa, + 0xbd1bdf10, 0x0af23000, 0x3fb57a98, 0xff868816, 0x3cf046d0, + 0xd8ea0000, 0x3fb4df7c, 0x1515fbe7, 0xbd1fd529, 0xde3b2000, + 0x3fb44538, 0x6e59a132, 0x3d1faeee, 0xc8df9000, 0x3fb3abc9, + 0xf1322361, 0xbd198807, 0x505f1000, 0x3fb3132d, 0x0888e6ab, + 0x3d1e5380, 0x359bd000, 0x3fb27b61, 0xdfbcbb22, 0xbcfe2724, + 0x429ee000, 0x3fb1e463, 0x6eb4c58c, 0xbcfe4dd6, 0x4a673000, + 0x3fb14e31, 0x4ce1ac9b, 0x3d1ba691, 0x28b96000, 0x3fb0b8c9, + 0x8c7813b8, 0xbd0b3872, 0xc1f08000, 0x3fb02428, 0xc2bc8c2c, + 0x3cb5ea6b, 0x05a1a000, 0x3faf209c, 0x72e8f18e, 0xbce8df84, + 0xc0b5e000, 0x3fadfa6d, 0x9fdef436, 0x3d087364, 0xaf416000, + 0x3facd5c2, 0x1068c3a9, 0x3d0827e7, 0xdb356000, 0x3fabb296, + 0x120a34d3, 
0x3d101a9f, 0x5dfea000, 0x3faa90e6, 0xdaded264, + 0xbd14c392, 0x6034c000, 0x3fa970ad, 0x1c9d06a9, 0xbd1b705e, + 0x194c6000, 0x3fa851e8, 0x83996ad9, 0xbd0117bc, 0xcf4ac000, + 0x3fa73492, 0xb1a94a62, 0xbca5ea42, 0xd67b4000, 0x3fa618a9, + 0x75aed8ca, 0xbd07119b, 0x9126c000, 0x3fa4fe29, 0x5291d533, + 0x3d12658f, 0x6f4d4000, 0x3fa3e50e, 0xcd2c5cd9, 0x3d1d5c70, + 0xee608000, 0x3fa2cd54, 0xd1008489, 0x3d1a4802, 0x9900e000, + 0x3fa1b6f9, 0x54fb5598, 0xbd16593f, 0x06bb6000, 0x3fa0a1f9, + 0x64ef57b4, 0xbd17636b, 0xb7940000, 0x3f9f1c9f, 0xee6a4737, + 0x3cb5d479, 0x91aa0000, 0x3f9cf7f5, 0x3a16373c, 0x3d087114, + 0x156b8000, 0x3f9ad5ed, 0x836c554a, 0x3c6900b0, 0xd4764000, + 0x3f98b67f, 0xed12f17b, 0xbcffc974, 0x77dec000, 0x3f9699a7, + 0x232ce7ea, 0x3d1e35bb, 0xbfbf4000, 0x3f947f5d, 0xd84ffa6e, + 0x3d0e0a49, 0x82c7c000, 0x3f92679c, 0x8d170e90, 0xbd14d9f2, + 0xadd20000, 0x3f90525d, 0x86d9f88e, 0x3cdeb986, 0x86f10000, + 0x3f8c7f36, 0xb9e0a517, 0x3ce29faa, 0xb75c8000, 0x3f885e9e, + 0x542568cb, 0xbd1f7bdb, 0x46b30000, 0x3f8442e8, 0xb954e7d9, + 0x3d1e5287, 0xb7e60000, 0x3f802c07, 0x22da0b17, 0xbd19fb27, + 0x6c8b0000, 0x3f7833e3, 0x821271ef, 0xbd190f96, 0x29910000, + 0x3f701936, 0xbc3491a5, 0xbd1bcf45, 0x354a0000, 0x3f600fe3, + 0xc0ff520a, 0xbd19d71c, 0x00000000, 0x00000000, 0x00000000, + 0x00000000 + }; + + private static int[] logTwoLogTenData = { + 0x509f7800, 0x3f934413, 0x1f12b358, 0x3cdfef31 + }; + + private static int[] coeffLogTenData = { + 0xc1a5f12e, 0x40358874, 0x64d4ef0d, 0xc0089309, 0x385593b1, + 0xc025c917, 0xdc963467, 0x3ffc6a02, 0x7f9d3aa1, 0x4016ab9f, + 0xdc77b115, 0xbff27af2 + }; + + /* + * Copyright (c) 2014, 2016, Intel Corporation. All rights reserved. 
Intel Math Library (LIBM)
     * Source Code
     *
     * ALGORITHM DESCRIPTION - LOG10() ---------------------
     *
     * Let x=2^k * mx, mx in [1,2)
     *
     * Get B~1/mx based on the output of rcpss instruction (B0) B = int((B0*LH*2^7+0.5))/2^7 LH is a
     * short approximation for log10(e)
     *
     * Reduced argument: r=B*mx-LH (computed accurately in high and low parts)
     *
     * Result: k*log10(2) - log(B) + p(r) p(r) is a degree 7 polynomial -log(B) read from data table
     * (high, low parts) Result is formed from high and low parts
     *
     * Special cases:
     *
     * log10(0) = -INF with divide-by-zero exception raised
     *
     * log10(1) = +0
     *
     * log10(x) = NaN with invalid exception raised if x < -0, including -INF
     *
     * log10(+INF) = +INF
     *
     */

    /**
     * Emits the instruction sequence for the LOG10 intrinsic described in the comment above.
     *
     * Every path through the sequence writes {@code dest} and ends at label {@code bb8}.
     *
     * @param dest XMM register that receives the result
     * @param value XMM register holding the argument
     * @param crb builder used to resolve {@code stackTemp} to an address and to record the
     *            references to the constant tables
     * @param masm assembler the instructions are emitted into
     */
    public void log10Intrinsic(Register dest, Register value, CompilationResultBuilder crb, AMD64MacroAssembler masm) {
        // Constant data, each wrapped with the argument 16 (presumably the alignment in bytes --
        // confirm against ArrayDataPointerConstant).
        ArrayDataPointerConstant highmaskLogTenPtr = new ArrayDataPointerConstant(highmaskLogTen, 16);
        ArrayDataPointerConstant logTenEPtr = new ArrayDataPointerConstant(logTenE, 16);
        ArrayDataPointerConstant logTenTablePtr = new ArrayDataPointerConstant(logTenTable, 16);
        ArrayDataPointerConstant logTwoLogTenDataPtr = new ArrayDataPointerConstant(logTwoLogTenData, 16);
        ArrayDataPointerConstant coeffLogTenDataPtr = new ArrayDataPointerConstant(coeffLogTenData, 16);

        Label bb0 = new Label();
        Label bb1 = new Label();
        Label bb2 = new Label();
        Label bb3 = new Label();
        Label bb4 = new Label();
        Label bb5 = new Label();
        Label bb6 = new Label();
        Label bb7 = new Label();
        Label bb8 = new Label();

        Register gpr1 = asRegister(gpr1Temp, AMD64Kind.QWORD);
        Register gpr2 = asRegister(gpr2Temp, AMD64Kind.QWORD);
        // gpr3 is the fixed rcx temp.
        Register gpr3 = asRegister(rcxTemp, AMD64Kind.QWORD);
        Register gpr4 = asRegister(gpr4Temp, AMD64Kind.QWORD);

        Register temp1 = asRegister(xmm1Temp, AMD64Kind.DOUBLE);
        Register temp2 = asRegister(xmm2Temp, AMD64Kind.DOUBLE);
        Register temp3 = asRegister(xmm3Temp, AMD64Kind.DOUBLE);
        Register temp4 = asRegister(xmm4Temp, AMD64Kind.DOUBLE);
        Register temp5 = asRegister(xmm5Temp, AMD64Kind.DOUBLE);
        Register temp6 = asRegister(xmm6Temp, AMD64Kind.DOUBLE);
        Register temp7 = asRegister(xmm7Temp, AMD64Kind.DOUBLE);

        AMD64Address stackSlot = (AMD64Address) crb.asAddress(stackTemp);

        // externalAddress() below needs the builder.
        setCrb(crb);
        // Keep a copy of the argument on the stack; the special-case path at bb0 reloads it.
        masm.movdq(stackSlot, value);
        if (dest.encoding != value.encoding) {
            masm.movdqu(dest, value);
        }
        masm.movdqu(temp5, externalAddress(highmaskLogTenPtr)); // 0xf8000000, 0xffffffff,
                                                                // 0x00000000, 0xffffe000
        // temp2 = double whose top word is 16368 (0x3ff0), i.e. 1.0.
        masm.xorpd(temp2, temp2);
        masm.movl(gpr1, 16368);
        masm.pinsrw(temp2, gpr1, 3);
        // temp7 low dword = 1054736384 (0x3ede0000), the single-precision value 0.43359375
        // (presumably the LH of the algorithm description).
        masm.movl(gpr2, 1054736384);
        masm.movdl(temp7, gpr2);
        // temp3 = double whose top word is 30704 (0x77f0).
        masm.xorpd(temp3, temp3);
        masm.movl(gpr3, 30704);
        masm.pinsrw(temp3, gpr3, 3);
        // temp4 = 32768 (0x8000) in the low dword.
        masm.movl(gpr3, 32768);
        masm.movdl(temp4, gpr3);
        masm.movdqu(temp1, value);
        // gpr1 = bits 48..63 of the argument: sign, exponent and the top four mantissa bits.
        masm.pextrw(gpr1, dest, 3);
        masm.por(dest, temp2);
        // gpr2 = 16352 (0x3fe0); subtracted from the masked exponent word after bb1.
        masm.movl(gpr2, 16352);
        masm.psrlq(dest, 27);
        masm.movdqu(temp2, externalAddress(logTenEPtr)); // 0x00000000, 0x3fdbc000,
                                                         // 0xbf2e4108, 0x3f5a7a6c
        masm.psrld(dest, 2);
        // Reciprocal approximation (the B0 of the algorithm description).
        masm.rcpps(dest, dest);
        // Shift left then right by 12: clears the top 12 bits (sign and exponent) of temp1.
        masm.psllq(temp1, 12);
        // 0x4E swaps the two qwords: temp6 = temp5 with its halves exchanged.
        masm.pshufd(temp6, temp5, 0x4E);
        masm.psrlq(temp1, 12);
        // Unsigned test of (top word - 16) against 32736 (0x7fe0). It is taken when the top word
        // is below 0x0010 (the subtraction wraps) or at least 0x7ff0, i.e. for everything that is
        // not a positive normal number.
        masm.subl(gpr1, 16);
        masm.cmpl(gpr1, 32736);
        masm.jcc(ConditionFlag.AboveEqual, bb0);

        // Main path. Also entered from bb3 once a non-zero tiny argument has been rescaled.
        masm.bind(bb1);
        masm.mulss(dest, temp7);
        masm.por(temp1, temp3);
        masm.andpd(temp5, temp1);
        masm.paddd(dest, temp4);
        masm.movdqu(temp3, externalAddress(coeffLogTenDataPtr)); // 0xc1a5f12e, 0x40358874,
                                                                 // 0x64d4ef0d, 0xc0089309
        masm.leaq(gpr4, externalAddress(coeffLogTenDataPtr));
        masm.movdqu(temp4, new AMD64Address(gpr4, 16)); // 0x385593b1, 0xc025c917,
                                                        // 0xdc963467, 0x3ffc6a02
        masm.subsd(temp1, temp5);
        masm.movdl(gpr3, dest);
        masm.psllq(dest, 29);
        masm.andpd(dest, temp6);
        masm.movdq(temp6, externalAddress(logTwoLogTenDataPtr)); // 0x509f7800, 0x3f934413
        // temp7 = (double) ((gpr1 & 0x7ff0) - gpr2)
        masm.andl(gpr1, 32752);
        masm.subl(gpr1, gpr2);
        masm.cvtsi2sdl(temp7, gpr1);
        masm.mulpd(temp5, dest);
        masm.mulsd(temp1, dest);
        masm.subsd(temp5, temp2);
        masm.movdqu(temp2, new AMD64Address(gpr4, 32)); // 0x7f9d3aa1, 0x4016ab9f,
                                                        // 0xdc77b115, 0xbff27af2
        masm.leaq(gpr4, externalAddress(logTenTablePtr));
        // gpr3 = ((low dword of dest) & 0xff0000) >> 12: byte offset of the 16-byte entry that is
        // loaded from logTenTable. Unlike logIntrinsic, the load applies a displacement of -1504.
        masm.andl(gpr3, 16711680);
        masm.shrl(gpr3, 12);
        masm.movdqu(dest, new AMD64Address(gpr4, gpr3, Scale.Times1, -1504));
        masm.addsd(temp1, temp5);
        masm.mulsd(temp6, temp7);
        // 0x44 copies the low qword of the source into both halves.
        masm.pshufd(temp5, temp1, 0x44);
        masm.leaq(gpr4, externalAddress(logTwoLogTenDataPtr));
        masm.mulsd(temp7, new AMD64Address(gpr4, 8)); // 0x1f12b358, 0x3cdfef31
        masm.mulsd(temp3, temp1);
        masm.addsd(dest, temp6);
        masm.mulpd(temp4, temp5);
        masm.leaq(gpr4, externalAddress(logTenEPtr));
        masm.movdq(temp6, new AMD64Address(gpr4, 8)); // 0xbf2e4108, 0x3f5a7a6c
        masm.mulpd(temp5, temp5);
        masm.addpd(temp4, temp2);
        masm.mulpd(temp3, temp5);
        // 0xE4 is the identity shuffle: temp2 = dest.
        masm.pshufd(temp2, dest, 0xE4);
        masm.addsd(dest, temp1);
        masm.mulsd(temp4, temp1);
        masm.subsd(temp2, dest);
        masm.mulsd(temp6, temp1);
        masm.addsd(temp1, temp2);
        // 0xEE copies the high qword of the source into both halves.
        masm.pshufd(temp2, dest, 0xEE);
        masm.mulsd(temp5, temp5);
        masm.addsd(temp7, temp2);
        masm.addsd(temp1, temp6);
        masm.addpd(temp4, temp3);
        masm.addsd(temp1, temp7);
        masm.mulpd(temp4, temp5);
        masm.addsd(temp1, temp4);
        masm.pshufd(temp5, temp4, 0xEE);
        masm.addsd(temp1, temp5);
        masm.addsd(dest, temp1);
        masm.jmp(bb8);

        // Special cases: reload the unmodified argument and restore its top word in gpr1.
        masm.bind(bb0);
        masm.movdq(dest, stackSlot);
        masm.movdq(temp1, stackSlot);
        masm.addl(gpr1, 16);
        // Top word >= 0x8000: the sign bit is set.
        masm.cmpl(gpr1, 32768);
        masm.jcc(ConditionFlag.AboveEqual, bb2);

        // Top word < 0x0010: zero exponent field (zero or denormal).
        masm.cmpl(gpr1, 16);
        masm.jcc(ConditionFlag.Below, bb3);

        // Remaining case, top word in [0x7ff0, 0x7fff] (+INF or NaN), and the NaN exits of bb5:
        // the result is x + x.
        masm.bind(bb4);
        masm.addsd(dest, dest);
        masm.jmp(bb8);

        // Reached from bb2 with the flags of its cmpl still live: doubled high dword above
        // 0xffe00000, or equal with a non-zero low dword, goes to bb4; otherwise to bb6.
        masm.bind(bb5);
        masm.jcc(ConditionFlag.Above, bb4);

        masm.cmpl(gpr3, 0);
        masm.jcc(ConditionFlag.Above, bb4);

        masm.jmp(bb6);

        // Zero exponent field: compute 0.0 + x and branch to bb7 if all 64 bits are zero.
        masm.bind(bb3);
        masm.xorpd(temp1, temp1);
        masm.addsd(temp1, dest);
        masm.movdl(gpr3, temp1);
        masm.psrlq(temp1, 32);
        masm.movdl(gpr2, temp1);
        masm.orl(gpr3, gpr2);
        masm.cmpl(gpr3, 0);
        masm.jcc(ConditionFlag.Equal, bb7);

        // Non-zero: multiply by the double whose top word is 18416 (0x47f0), i.e. 2^128, then
        // repeat the setup done before bb1 on the scaled value. temp2 is rebuilt as 1.0 first
        // because it held the logTenE constant; gpr2 = 18416 takes the place of 16352 (presumably
        // so the subtraction after bb1 compensates for the scaling).
        masm.xorpd(temp1, temp1);
        masm.xorpd(temp2, temp2);
        masm.movl(gpr1, 18416);
        masm.pinsrw(temp1, gpr1, 3);
        masm.mulsd(dest, temp1);
        masm.movl(gpr1, 16368);
        masm.pinsrw(temp2, gpr1, 3);
        masm.movdqu(temp1, dest);
        masm.pextrw(gpr1, dest, 3);
        masm.por(dest, temp2);
        masm.movl(gpr2, 18416);
        masm.psrlq(dest, 27);
        masm.movdqu(temp2, externalAddress(logTenEPtr)); // 0x00000000, 0x3fdbc000,
                                                         // 0xbf2e4108, 0x3f5a7a6c
        masm.psrld(dest, 2);
        masm.rcpps(dest, dest);
        masm.psllq(temp1, 12);
        masm.pshufd(temp6, temp5, 0x4E);
        masm.psrlq(temp1, 12);
        masm.jmp(bb1);

        // Sign bit set: gpr3 = low dword, gpr2 = high dword doubled (shifts the sign bit out).
        masm.bind(bb2);
        masm.movdl(gpr3, temp1);
        masm.psrlq(temp1, 32);
        masm.movdl(gpr2, temp1);
        masm.addl(gpr2, gpr2);
        // -2097152 is 0xffe00000: at or above it the exponent field is all ones.
        masm.cmpl(gpr2, -2097152);
        masm.jcc(ConditionFlag.AboveEqual, bb5);

        // All remaining bits zero (the argument is -0.0): treated like zero.
        masm.orl(gpr3, gpr2);
        masm.cmpl(gpr3, 0);
        masm.jcc(ConditionFlag.Equal, bb7);

        // Result is 0.0 multiplied by the double whose top word is 32752 (0x7ff0, +INF).
        masm.bind(bb6);
        masm.xorpd(temp1, temp1);
        masm.xorpd(dest, dest);
        masm.movl(gpr1, 32752);
        masm.pinsrw(temp1, gpr1, 3);
        masm.mulsd(dest, temp1);
        masm.jmp(bb8);

        // Result is the double whose top word is 49136 (0xbff0, -1.0) divided by 0.0.
        masm.bind(bb7);
        masm.xorpd(temp1, temp1);
        masm.xorpd(dest, dest);
        masm.movl(gpr1, 49136);
        masm.pinsrw(dest, gpr1, 3);
        masm.divsd(dest, temp1);

        masm.bind(bb8);
    }

    /*
     * Copyright (c) 2014, 2016, Intel Corporation. All rights reserved. Intel Math Library (LIBM)
     * Source Code
     *
     * ALGORITHM DESCRIPTION - SIN() ---------------------
     *
     * 1. RANGE REDUCTION
     *
     * We perform an initial range reduction from X to r with
     *
     * X =~= N * pi/32 + r
     *
     * so that |r| <= pi/64 + epsilon. We restrict inputs to those where |N| <= 932560. Beyond this,
     * the range reduction is insufficiently accurate. For extremely small inputs, denormalization
     * can occur internally, impacting performance. This means that the main path is actually only
     * taken for 2^-252 <= |X| < 90112.
+ * + * To avoid branches, we perform the range reduction to full accuracy each time. + * + * X - N * (P_1 + P_2 + P_3) + * + * where P_1 and P_2 are 32-bit numbers (so multiplication by N is exact) and P_3 is a 53-bit + * number. Together, these approximate pi/32 well enough for all cases in the restricted range. + * + * The main reduction sequence is: + * + * y = 32/pi * x; N = integer(y) (computed by adding and subtracting off SHIFTER) + * + * m_1 = N * P_1; m_2 = N * P_2; r_1 = x - m_1; r = r_1 - m_2 (this r can be used for most of + * the calculation) + * + * c_1 = r_1 - r; m_3 = N * P_3; c_2 = c_1 - m_2; c = c_2 - m_3 + * + * 2. MAIN ALGORITHM + * + * The algorithm uses a table lookup based on B = M * pi / 32 where M = N mod 64. The stored + * values are: sigma, the closest power of 2 to cos(B); C_hl, the 53-bit value of cos(B) - sigma; + * and S_hi + S_lo, a 2 * 53-bit representation of sin(B). + * + * The computation is organized as follows: + * + * sin(B + r + c) = [sin(B) + sigma * r] + r * (cos(B) - sigma) + sin(B) * [cos(r + c) - 1] + + * cos(B) * [sin(r + c) - r] + * + * which is approximately: + * + * [S_hi + sigma * r] + C_hl * r + S_lo + S_hi * [(cos(r) - 1) - r * c] + (C_hl + sigma) * + * [(sin(r) - r) + c] + * + * and this is what is actually computed. We separate this sum into four parts: + * + * hi + med + pols + corr + * + * where + * + * hi = S_hi + sigma * r; med = C_hl * r; + * pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r); + * corr = S_lo + c * ((C_hl + sigma) - S_hi * r) + * + * 3. POLYNOMIAL + * + * The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r) can be rearranged freely, + * since it is quite small, so we exploit parallelism to the fullest. + * + * psc4 = SC_4 * r_1; msc4 = psc4 * r; r2 = r * r; msc2 = SC_2 * r2; r4 = r2 * r2; + * psc3 = SC_3 + msc4; psc1 = SC_1 + msc2; msc3 = r4 * psc3; sincospols = psc1 + msc3; + * pols = sincospols * [S_hi * r^2 | (C_hl + sigma) * r^3] (a packed pair, multiplied lane by + * lane) + * + * 4.
CORRECTION TERM + * + * This is where the "c" component of the range reduction is taken into account; recall that + * just "r" is used for most of the calculation. + * + * -c = m_3 - c_2 -d = S_hi * r - (C_hl + sigma) corr = -c * -d + S_lo + * + * 5. COMPENSATED SUMMATIONS + * + * The two successive compensated summations add up the high and medium parts, leaving just the + * low parts to add up at the end. + * + * rs = sigma * r res_int = S_hi + rs k_0 = S_hi - res_int k_2 = k_0 + rs med = C_hl * r res_hi + * = res_int + med k_1 = res_int - res_hi k_3 = k_1 + med + * + * 6. FINAL SUMMATION + * + * We now add up all the small parts: + * + * res_lo = pols(hi) + pols(lo) + corr + k_1 + k_3 + * + * Now the overall result is just: + * + * res_hi + res_lo + * + * 7. SMALL ARGUMENTS + * + * If |x| < SNN (SNN meaning the smallest normal number), we simply perform 0.1111111 cdots 1111 + * * x. For SNN <= |x|, we do 2^-55 * (2^55 * x - x). + * + * Special cases: sin(NaN) = quiet NaN, and raise invalid exception sin(INF) = NaN and raise + * invalid exception sin(+/-0) = +/-0 + * + */ + + public int[] oneHalf = { + 0x00000000, 0x3fe00000, 0x00000000, 0x3fe00000 + }; + + public int[] pTwo = { + 0x1a600000, 0x3d90b461, 0x1a600000, 0x3d90b461 + }; + + public int[] scFour = { + 0xa556c734, 0x3ec71de3, 0x1a01a01a, 0x3efa01a0 + }; + + public int[] cTable = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x3ff00000, 0x176d6d31, 0xbf73b92e, + 0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000, + 0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0x3fc8f8b8, + 0xc0000000, 0xbc626d19, 0x00000000, 0x3ff00000, 0x939d225a, + 0xbfa60bea, 0x2ed59f06, 0x3fd29406, 0xa0000000, 0xbc75d28d, + 0x00000000, 0x3ff00000, 0x866b95cf, 0xbfb37ca1, 0xa6aea963, + 0x3fd87de2, 0xe0000000, 0xbc672ced, 0x00000000, 0x3ff00000, + 0x73fa1279, 0xbfbe3a68, 0x3806f63b, 0x3fde2b5d, 0x20000000, + 0x3c5e0d89, 0x00000000, 0x3ff00000, 0x5bc57974, 0xbfc59267, + 0x39ae68c8, 
0x3fe1c73b, 0x20000000, 0x3c8b25dd, 0x00000000, + 0x3ff00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0x3fe44cf3, + 0x20000000, 0x3c68076a, 0x00000000, 0x3ff00000, 0x99fcef32, + 0x3fca8279, 0x667f3bcd, 0x3fe6a09e, 0x20000000, 0xbc8bdd34, + 0x00000000, 0x3fe00000, 0x94247758, 0x3fc133cc, 0x6b151741, + 0x3fe8bc80, 0x20000000, 0xbc82c5e1, 0x00000000, 0x3fe00000, + 0x9ae68c87, 0x3fac73b3, 0x290ea1a3, 0x3fea9b66, 0xe0000000, + 0x3c39f630, 0x00000000, 0x3fe00000, 0x7f909c4e, 0xbf9d4a2c, + 0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000, + 0x3fe00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0x3fed906b, + 0x20000000, 0x3c7457e6, 0x00000000, 0x3fe00000, 0x76acf82d, + 0x3fa4a031, 0x56c62dda, 0x3fee9f41, 0xe0000000, 0x3c8760b1, + 0x00000000, 0x3fd00000, 0x0e5967d5, 0xbfac1d1f, 0xcff75cb0, + 0x3fef6297, 0x20000000, 0x3c756217, 0x00000000, 0x3fd00000, + 0x0f592f50, 0xbf9ba165, 0xa3d12526, 0x3fefd88d, 0x40000000, + 0xbc887df6, 0x00000000, 0x3fc00000, 0x00000000, 0x00000000, + 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0x3fefd88d, + 0x40000000, 0xbc887df6, 0x00000000, 0xbfc00000, 0x0e5967d5, + 0x3fac1d1f, 0xcff75cb0, 0x3fef6297, 0x20000000, 0x3c756217, + 0x00000000, 0xbfd00000, 0x76acf82d, 0xbfa4a031, 0x56c62dda, + 0x3fee9f41, 0xe0000000, 0x3c8760b1, 0x00000000, 0xbfd00000, + 0x65455a75, 0x3fbe0875, 0xcf328d46, 0x3fed906b, 0x20000000, + 0x3c7457e6, 0x00000000, 0xbfe00000, 0x7f909c4e, 0x3f9d4a2c, + 0xf180bdb1, 0x3fec38b2, 0x80000000, 0xbc76e0b1, 0x00000000, + 0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0x3fea9b66, + 0xe0000000, 0x3c39f630, 0x00000000, 0xbfe00000, 0x94247758, + 0xbfc133cc, 0x6b151741, 0x3fe8bc80, 0x20000000, 0xbc82c5e1, + 0x00000000, 0xbfe00000, 0x99fcef32, 0xbfca8279, 0x667f3bcd, + 0x3fe6a09e, 0x20000000, 0xbc8bdd34, 0x00000000, 0xbfe00000, + 0x53aba2fd, 0x3fcd0dfe, 0x25091dd6, 0x3fe44cf3, 0x20000000, + 0x3c68076a, 0x00000000, 0xbff00000, 0x5bc57974, 0x3fc59267, + 0x39ae68c8, 0x3fe1c73b, 
0x20000000, 0x3c8b25dd, 0x00000000, + 0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0x3fde2b5d, + 0x20000000, 0x3c5e0d89, 0x00000000, 0xbff00000, 0x866b95cf, + 0x3fb37ca1, 0xa6aea963, 0x3fd87de2, 0xe0000000, 0xbc672ced, + 0x00000000, 0xbff00000, 0x939d225a, 0x3fa60bea, 0x2ed59f06, + 0x3fd29406, 0xa0000000, 0xbc75d28d, 0x00000000, 0xbff00000, + 0x011469fb, 0x3f93ad06, 0x3c69a60b, 0x3fc8f8b8, 0xc0000000, + 0xbc626d19, 0x00000000, 0xbff00000, 0x176d6d31, 0x3f73b92e, + 0xbc29b42c, 0x3fb917a6, 0xe0000000, 0xbc3e2718, 0x00000000, + 0xbff00000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xbff00000, 0x176d6d31, + 0x3f73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718, + 0x00000000, 0xbff00000, 0x011469fb, 0x3f93ad06, 0x3c69a60b, + 0xbfc8f8b8, 0xc0000000, 0x3c626d19, 0x00000000, 0xbff00000, + 0x939d225a, 0x3fa60bea, 0x2ed59f06, 0xbfd29406, 0xa0000000, + 0x3c75d28d, 0x00000000, 0xbff00000, 0x866b95cf, 0x3fb37ca1, + 0xa6aea963, 0xbfd87de2, 0xe0000000, 0x3c672ced, 0x00000000, + 0xbff00000, 0x73fa1279, 0x3fbe3a68, 0x3806f63b, 0xbfde2b5d, + 0x20000000, 0xbc5e0d89, 0x00000000, 0xbff00000, 0x5bc57974, + 0x3fc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd, + 0x00000000, 0xbff00000, 0x53aba2fd, 0x3fcd0dfe, 0x25091dd6, + 0xbfe44cf3, 0x20000000, 0xbc68076a, 0x00000000, 0xbff00000, + 0x99fcef32, 0xbfca8279, 0x667f3bcd, 0xbfe6a09e, 0x20000000, + 0x3c8bdd34, 0x00000000, 0xbfe00000, 0x94247758, 0xbfc133cc, + 0x6b151741, 0xbfe8bc80, 0x20000000, 0x3c82c5e1, 0x00000000, + 0xbfe00000, 0x9ae68c87, 0xbfac73b3, 0x290ea1a3, 0xbfea9b66, + 0xe0000000, 0xbc39f630, 0x00000000, 0xbfe00000, 0x7f909c4e, + 0x3f9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1, + 0x00000000, 0xbfe00000, 0x65455a75, 0x3fbe0875, 0xcf328d46, + 0xbfed906b, 0x20000000, 0xbc7457e6, 0x00000000, 0xbfe00000, + 0x76acf82d, 0xbfa4a031, 0x56c62dda, 0xbfee9f41, 0xe0000000, + 0xbc8760b1, 0x00000000, 0xbfd00000, 0x0e5967d5, 0x3fac1d1f, + 0xcff75cb0, 0xbfef6297, 0x20000000, 
0xbc756217, 0x00000000, + 0xbfd00000, 0x0f592f50, 0x3f9ba165, 0xa3d12526, 0xbfefd88d, + 0x40000000, 0x3c887df6, 0x00000000, 0xbfc00000, 0x00000000, + 0x00000000, 0x00000000, 0xbff00000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x0f592f50, 0xbf9ba165, 0xa3d12526, + 0xbfefd88d, 0x40000000, 0x3c887df6, 0x00000000, 0x3fc00000, + 0x0e5967d5, 0xbfac1d1f, 0xcff75cb0, 0xbfef6297, 0x20000000, + 0xbc756217, 0x00000000, 0x3fd00000, 0x76acf82d, 0x3fa4a031, + 0x56c62dda, 0xbfee9f41, 0xe0000000, 0xbc8760b1, 0x00000000, + 0x3fd00000, 0x65455a75, 0xbfbe0875, 0xcf328d46, 0xbfed906b, + 0x20000000, 0xbc7457e6, 0x00000000, 0x3fe00000, 0x7f909c4e, + 0xbf9d4a2c, 0xf180bdb1, 0xbfec38b2, 0x80000000, 0x3c76e0b1, + 0x00000000, 0x3fe00000, 0x9ae68c87, 0x3fac73b3, 0x290ea1a3, + 0xbfea9b66, 0xe0000000, 0xbc39f630, 0x00000000, 0x3fe00000, + 0x94247758, 0x3fc133cc, 0x6b151741, 0xbfe8bc80, 0x20000000, + 0x3c82c5e1, 0x00000000, 0x3fe00000, 0x99fcef32, 0x3fca8279, + 0x667f3bcd, 0xbfe6a09e, 0x20000000, 0x3c8bdd34, 0x00000000, + 0x3fe00000, 0x53aba2fd, 0xbfcd0dfe, 0x25091dd6, 0xbfe44cf3, + 0x20000000, 0xbc68076a, 0x00000000, 0x3ff00000, 0x5bc57974, + 0xbfc59267, 0x39ae68c8, 0xbfe1c73b, 0x20000000, 0xbc8b25dd, + 0x00000000, 0x3ff00000, 0x73fa1279, 0xbfbe3a68, 0x3806f63b, + 0xbfde2b5d, 0x20000000, 0xbc5e0d89, 0x00000000, 0x3ff00000, + 0x866b95cf, 0xbfb37ca1, 0xa6aea963, 0xbfd87de2, 0xe0000000, + 0x3c672ced, 0x00000000, 0x3ff00000, 0x939d225a, 0xbfa60bea, + 0x2ed59f06, 0xbfd29406, 0xa0000000, 0x3c75d28d, 0x00000000, + 0x3ff00000, 0x011469fb, 0xbf93ad06, 0x3c69a60b, 0xbfc8f8b8, + 0xc0000000, 0x3c626d19, 0x00000000, 0x3ff00000, 0x176d6d31, + 0xbf73b92e, 0xbc29b42c, 0xbfb917a6, 0xe0000000, 0x3c3e2718, + 0x00000000, 0x3ff00000 + }; + + public int[] scTwo = { + 0x11111111, 0x3f811111, 0x55555555, 0x3fa55555 + }; + + public int[] scThree = { + 0x1a01a01a, 0xbf2a01a0, 0x16c16c17, 0xbf56c16c + }; + + public int[] scOne = { + 0x55555555, 0xbfc55555, 0x00000000, 0xbfe00000 + }; + + public int[] 
piInvTable = { + 0x00000000, 0x00000000, 0xa2f9836e, 0x4e441529, 0xfc2757d1, + 0xf534ddc0, 0xdb629599, 0x3c439041, 0xfe5163ab, 0xdebbc561, + 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c, 0xfe1deb1c, + 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41, + 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, + 0xde05980f, 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, + 0x4f463f66, 0x9e5fea2d, 0x7527bac7, 0xebe5f17b, 0x3d0739f7, + 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08, 0x56033046, 0xfc7b6bab, + 0xf0cfbc21 + }; + + public int[] piFour = { + 0x40000000, 0x3fe921fb, 0x18469899, 0x3e64442d + }; + + public int[] piThirtyTwoInv = { + 0x6dc9c883, 0x40245f30 + }; + + public int[] shifter = { + 0x00000000, 0x43380000 + }; + + public int[] signMask = { + 0x00000000, 0x80000000 + }; + + public int[] pThree = { + 0x2e037073, 0x3b63198a + }; + + public int[] allOnes = { + 0xffffffff, 0x3fefffff + }; + + public int[] twoPowFiftyFive = { + 0x00000000, 0x43600000 + }; + + public int[] twoPowFiftyFiveM = { + 0x00000000, 0x3c800000 + }; + + public int[] pOne = { + 0x54400000, 0x3fb921fb + }; + + /** + * Emits the AMD64 instruction sequence for the SIN algorithm described in the comment above. + * + * The input is stored to the {@code stackTemp} slot so that its high 32 bits can be loaded + * into a general purpose register for the range check, and it is copied into {@code dest} + * when the two registers differ. Inputs accepted by that check use the table-driven main + * path. Larger finite inputs are first reduced at {@code bb1} with integer arithmetic on + * words of {@code piInvTable} and are then evaluated at {@code bb12}. Smaller inputs are + * handled at {@code bb0} and {@code bb2}. An all-ones exponent field (infinity or NaN) is + * handled at {@code bb14}. All paths end at {@code bb15}. + * + * gpr3 is bound to {@code rcxTemp}; the shift instructions emitted without an explicit count + * presumably take their count from CL - TODO confirm against the assembler. + * + * @param dest XMM register that holds the working copy of the input and, on every path except + * the one at {@code bb2}, is the last register written + * @param value XMM register holding the input operand + * @param crb compilation result builder, used to resolve {@code stackTemp} and handed to + * {@code setCrb} before the constant tables are referenced + * @param masm assembler that receives the emitted instructions + */ + public void sinIntrinsic(Register dest, Register value, CompilationResultBuilder crb, AMD64MacroAssembler masm) { + ArrayDataPointerConstant oneHalfPtr = new ArrayDataPointerConstant(oneHalf, 16); + ArrayDataPointerConstant pTwoPtr = new ArrayDataPointerConstant(pTwo, 16); + ArrayDataPointerConstant scFourPtr = new ArrayDataPointerConstant(scFour, 16); + ArrayDataPointerConstant cTablePtr = new ArrayDataPointerConstant(cTable, 16); + ArrayDataPointerConstant scTwoPtr = new ArrayDataPointerConstant(scTwo, 16); + ArrayDataPointerConstant scThreePtr = new ArrayDataPointerConstant(scThree, 16); + ArrayDataPointerConstant scOnePtr = new ArrayDataPointerConstant(scOne, 16); + ArrayDataPointerConstant piInvTablePtr = new ArrayDataPointerConstant(piInvTable, 16); + ArrayDataPointerConstant piFourPtr = new ArrayDataPointerConstant(piFour, 16); + ArrayDataPointerConstant piThirtyTwoInvPtr
= new ArrayDataPointerConstant(piThirtyTwoInv, 8); + ArrayDataPointerConstant shifterPtr = new ArrayDataPointerConstant(shifter, 8); + ArrayDataPointerConstant signMaskPtr = new ArrayDataPointerConstant(signMask, 8); + ArrayDataPointerConstant pThreePtr = new ArrayDataPointerConstant(pThree, 8); + ArrayDataPointerConstant allOnesPtr = new ArrayDataPointerConstant(allOnes, 8); + ArrayDataPointerConstant twoPowFiftyFivePtr = new ArrayDataPointerConstant(twoPowFiftyFive, 8); + ArrayDataPointerConstant twoPowFiftyFiveMPtr = new ArrayDataPointerConstant(twoPowFiftyFiveM, 8); + ArrayDataPointerConstant pOnePtr = new ArrayDataPointerConstant(pOne, 8); + + Label bb0 = new Label(); + Label bb1 = new Label(); + Label bb2 = new Label(); + Label bb4 = new Label(); + Label bb5 = new Label(); + Label bb6 = new Label(); + Label bb8 = new Label(); + Label bb9 = new Label(); + Label bb10 = new Label(); + Label bb11 = new Label(); + Label bb12 = new Label(); + Label bb13 = new Label(); + Label bb14 = new Label(); + Label bb15 = new Label(); + + Register gpr1 = asRegister(gpr1Temp, AMD64Kind.QWORD); + Register gpr2 = asRegister(gpr2Temp, AMD64Kind.QWORD); + Register gpr3 = asRegister(rcxTemp, AMD64Kind.QWORD); + Register gpr4 = asRegister(gpr4Temp, AMD64Kind.QWORD); + Register gpr5 = asRegister(gpr5Temp, AMD64Kind.QWORD); + Register gpr6 = asRegister(gpr6Temp, AMD64Kind.QWORD); + Register gpr7 = asRegister(gpr7Temp, AMD64Kind.QWORD); + Register gpr8 = asRegister(gpr8Temp, AMD64Kind.QWORD); + Register gpr9 = asRegister(gpr9Temp, AMD64Kind.QWORD); + Register gpr10 = asRegister(gpr10Temp, AMD64Kind.QWORD); + + Register temp1 = asRegister(xmm1Temp, AMD64Kind.DOUBLE); + Register temp2 = asRegister(xmm2Temp, AMD64Kind.DOUBLE); + Register temp3 = asRegister(xmm3Temp, AMD64Kind.DOUBLE); + Register temp4 = asRegister(xmm4Temp, AMD64Kind.DOUBLE); + Register temp5 = asRegister(xmm5Temp, AMD64Kind.DOUBLE); + Register temp6 = asRegister(xmm6Temp, AMD64Kind.DOUBLE); + Register temp7 =
asRegister(xmm7Temp, AMD64Kind.DOUBLE); + Register temp8 = asRegister(xmm8Temp, AMD64Kind.DOUBLE); + Register temp9 = asRegister(xmm9Temp, AMD64Kind.DOUBLE); + + AMD64Address stackSlot = (AMD64Address) crb.asAddress(stackTemp); + + setCrb(crb); + masm.movsd(stackSlot, value); + if (dest.encoding != value.encoding) { + masm.movdqu(dest, value); + } + + masm.leaq(gpr1, stackSlot); + masm.movl(gpr1, new AMD64Address(gpr1, 4)); + masm.movdq(temp1, externalAddress(piThirtyTwoInvPtr)); // 0x6dc9c883, + // 0x40245f30 + masm.movdq(temp2, externalAddress(shifterPtr)); // 0x00000000, + // 0x43380000 + /* NOTE(review): temp2 is overwritten before it is read on every path in this method, */ + /* so this load of shifter looks unused - confirm. */ + + /* Range check on the high word of the input (mask 0x7FFF0000, bias 0x30300000, unsigned */ + /* compare): the main path is taken for 2^-252 <= |x| < 90112, everything else goes to bb0. */ + masm.andl(gpr1, 2147418112); + masm.subl(gpr1, 808452096); + masm.cmpl(gpr1, 281346048); + masm.jcc(ConditionFlag.Above, bb0); + + masm.mulsd(temp1, dest); + masm.movdqu(temp5, externalAddress(oneHalfPtr)); // 0x00000000, + // 0x3fe00000, + // 0x00000000, + // 0x3fe00000 + masm.movdq(temp4, externalAddress(signMaskPtr)); // 0x00000000, + // 0x80000000 + masm.pand(temp4, dest); + masm.por(temp5, temp4); + masm.addpd(temp1, temp5); + masm.cvttsd2sil(gpr4, temp1); + masm.cvtsi2sdl(temp1, gpr4); + masm.movdqu(temp6, externalAddress(pTwoPtr)); // 0x1a600000, + // 0x3d90b461, + // 0x1a600000, + // 0x3d90b461 + masm.movq(gpr7, 0x3fb921fb54400000L); + masm.movdq(temp3, gpr7); + masm.movdqu(temp5, externalAddress(scFourPtr)); // 0xa556c734, + // 0x3ec71de3, + // 0x1a01a01a, + // 0x3efa01a0 + masm.pshufd(temp4, dest, 0x44); + masm.mulsd(temp3, temp1); + if (masm.supports(CPUFeature.SSE3)) { + masm.movddup(temp1, temp1); + } else { + masm.movlhps(temp1, temp1); + } + masm.andl(gpr4, 63); + masm.shll(gpr4, 5); + masm.leaq(gpr1, externalAddress(cTablePtr)); + masm.addq(gpr1, gpr4); + masm.movdqu(temp8, new AMD64Address(gpr1, 0)); + masm.mulpd(temp6, temp1); + masm.mulsd(temp1, externalAddress(pThreePtr)); // 0x2e037073, + // 0x3b63198a + masm.subsd(temp4, temp3); + masm.subsd(dest, temp3); + if (masm.supports(CPUFeature.SSE3)) { + masm.movddup(temp3, temp4); + } else { +
masm.movdqu(temp3, temp4); + masm.movlhps(temp3, temp3); + } + masm.subsd(temp4, temp6); + masm.pshufd(dest, dest, 0x44); + masm.pshufd(temp7, temp8, 0xE); + masm.movdqu(temp2, temp8); + masm.movdqu(temp9, temp7); + masm.mulpd(temp5, dest); + masm.subpd(dest, temp6); + masm.mulsd(temp7, temp4); + masm.subsd(temp3, temp4); + masm.mulpd(temp5, dest); + masm.mulpd(dest, dest); + masm.subsd(temp3, temp6); + masm.movdqu(temp6, externalAddress(scTwoPtr)); // 0x11111111, + // 0x3f811111, + // 0x55555555, + // 0x3fa55555 + masm.subsd(temp1, temp3); + masm.movdq(temp3, new AMD64Address(gpr1, 24)); + masm.addsd(temp2, temp3); + masm.subsd(temp7, temp2); + masm.mulsd(temp2, temp4); + masm.mulpd(temp6, dest); + masm.mulsd(temp3, temp4); + masm.mulpd(temp2, dest); + masm.mulpd(dest, dest); + masm.addpd(temp5, externalAddress(scThreePtr)); // 0x1a01a01a, + // 0xbf2a01a0, + // 0x16c16c17, + // 0xbf56c16c + masm.mulsd(temp4, temp8); + masm.addpd(temp6, externalAddress(scOnePtr)); // 0x55555555, + // 0xbfc55555, + // 0x00000000, + // 0xbfe00000 + masm.mulpd(temp5, dest); + masm.movdqu(dest, temp3); + masm.addsd(temp3, temp9); + masm.mulpd(temp1, temp7); + masm.movdqu(temp7, temp4); + masm.addsd(temp4, temp3); + masm.addpd(temp6, temp5); + masm.subsd(temp9, temp3); + masm.subsd(temp3, temp4); + masm.addsd(temp1, new AMD64Address(gpr1, 16)); + masm.mulpd(temp6, temp2); + masm.addsd(temp9, dest); + masm.addsd(temp3, temp7); + masm.addsd(temp1, temp9); + masm.addsd(temp1, temp3); + masm.addsd(temp1, temp6); + masm.unpckhpd(temp6, temp6); + masm.movdqu(dest, temp4); + masm.addsd(temp1, temp6); + masm.addsd(dest, temp1); + masm.jmp(bb15); + + /* bb14: exponent field all ones (infinity or NaN): 0 / 0 yields NaN. */ + masm.bind(bb14); + masm.xorpd(temp1, temp1); + masm.xorpd(dest, dest); + masm.divsd(dest, temp1); + masm.jmp(bb15); + + /* bb0: outside the main range; the flags still come from the cmpl of the range check. */ + /* A signed-greater result means a large input (bb1). Otherwise the input is tiny: an */ + /* exponent field of zero (3325 + 0x303 = 0x1000) is multiplied by allOnes, the rest goes */ + /* to bb2. */ + masm.bind(bb0); + masm.jcc(ConditionFlag.Greater, bb1); + + masm.shrl(gpr1, 20); + masm.cmpl(gpr1, 3325); + masm.jcc(ConditionFlag.NotEqual, bb2); + + masm.mulsd(dest, externalAddress(allOnesPtr)); // 0xffffffff, + // 0x3fefffff +
masm.jmp(bb15); + + /* bb2: tiny input with a non-zero exponent field: computes 2^-55 * (2^55 * x - x) in temp3. */ + masm.bind(bb2); + masm.movdq(temp3, externalAddress(twoPowFiftyFivePtr)); // 0x00000000, + // 0x43600000 + masm.mulsd(temp3, dest); + masm.subsd(temp3, dest); + masm.mulsd(temp3, externalAddress(twoPowFiftyFiveMPtr)); // 0x00000000, + // 0x3c800000 + /* NOTE(review): this path leaves its result in temp3 and does not write dest before */ + /* jumping to bb15, unlike the other paths - confirm that leaving the unmodified input in */ + /* dest is intended. */ + masm.jmp(bb15); + + /* bb1: large input. An all-ones exponent field goes to bb14; otherwise the input bits in */ + /* gpr1 and gpr4 are multiplied with 32-bit words loaded from piInvTable at an offset */ + /* derived from the exponent. */ + masm.bind(bb1); + masm.pextrw(gpr3, dest, 3); + masm.andl(gpr3, 32752); + masm.cmpl(gpr3, 32752); + masm.jcc(ConditionFlag.Equal, bb14); + + masm.subl(gpr3, 16224); + masm.shrl(gpr3, 7); + masm.andl(gpr3, 65532); + masm.leaq(gpr10, externalAddress(piInvTablePtr)); + masm.addq(gpr3, gpr10); + masm.movdq(gpr1, dest); + masm.movl(gpr9, new AMD64Address(gpr3, 20)); + masm.movl(gpr7, new AMD64Address(gpr3, 24)); + masm.movl(gpr4, gpr1); + masm.shrq(gpr1, 21); + masm.orl(gpr1, Integer.MIN_VALUE); + masm.shrl(gpr1, 11); + masm.movl(gpr8, gpr9); + masm.imulq(gpr9, gpr4); + masm.imulq(gpr8, gpr1); + masm.imulq(gpr7, gpr1); + masm.movl(gpr5, new AMD64Address(gpr3, 16)); + masm.movl(gpr6, new AMD64Address(gpr3, 12)); + masm.movl(gpr10, gpr9); + masm.shrq(gpr9, 32); + masm.addq(gpr8, gpr9); + masm.addq(gpr10, gpr7); + masm.movl(gpr7, gpr10); + masm.shrq(gpr10, 32); + masm.addq(gpr8, gpr10); + masm.movl(gpr9, gpr5); + masm.imulq(gpr5, gpr4); + masm.imulq(gpr9, gpr1); + masm.movl(gpr10, gpr6); + masm.imulq(gpr6, gpr4); + masm.movl(gpr2, gpr5); + masm.shrq(gpr5, 32); + masm.addq(gpr8, gpr2); + masm.movl(gpr2, gpr8); + masm.shrq(gpr8, 32); + masm.addq(gpr9, gpr5); + masm.addq(gpr9, gpr8); + masm.shlq(gpr2, 32); + masm.orq(gpr7, gpr2); + masm.imulq(gpr10, gpr1); + masm.movl(gpr8, new AMD64Address(gpr3, 8)); + masm.movl(gpr5, new AMD64Address(gpr3, 4)); + masm.movl(gpr2, gpr6); + masm.shrq(gpr6, 32); + masm.addq(gpr9, gpr2); + masm.movl(gpr2, gpr9); + masm.shrq(gpr9, 32); + masm.addq(gpr10, gpr6); + masm.addq(gpr10, gpr9); + masm.movq(gpr6, gpr8); + masm.imulq(gpr8, gpr4); + masm.imulq(gpr6, gpr1); + masm.movl(gpr9, gpr8); + masm.shrq(gpr8, 32); + masm.addq(gpr10, gpr9); + masm.movl(gpr9, gpr10);
+ masm.shrq(gpr10, 32); + masm.addq(gpr6, gpr8); + masm.addq(gpr6, gpr10); + masm.movq(gpr8, gpr5); + masm.imulq(gpr5, gpr4); + masm.imulq(gpr8, gpr1); + masm.shlq(gpr9, 32); + masm.orq(gpr9, gpr2); + masm.movl(gpr1, new AMD64Address(gpr3, 0)); + masm.movl(gpr10, gpr5); + masm.shrq(gpr5, 32); + masm.addq(gpr6, gpr10); + masm.movl(gpr10, gpr6); + masm.shrq(gpr6, 32); + masm.addq(gpr8, gpr5); + masm.addq(gpr8, gpr6); + masm.imulq(gpr4, gpr1); + masm.pextrw(gpr2, dest, 3); + masm.leaq(gpr6, externalAddress(piInvTablePtr)); + masm.subq(gpr3, gpr6); + masm.addl(gpr3, gpr3); + masm.addl(gpr3, gpr3); + masm.addl(gpr3, gpr3); + masm.addl(gpr3, 19); + masm.movl(gpr5, 32768); + masm.andl(gpr5, gpr2); + masm.shrl(gpr2, 4); + masm.andl(gpr2, 2047); + masm.subl(gpr2, 1023); + masm.subl(gpr3, gpr2); + masm.addq(gpr8, gpr4); + masm.movl(gpr4, gpr3); + masm.addl(gpr4, 32); + masm.cmpl(gpr3, 1); + masm.jcc(ConditionFlag.Less, bb4); + + masm.negl(gpr3); + masm.addl(gpr3, 29); + masm.shll(gpr8); + masm.movl(gpr6, gpr8); + masm.andl(gpr8, 536870911); + masm.testl(gpr8, 268435456); + masm.jcc(ConditionFlag.NotEqual, bb5); + + masm.shrl(gpr8); + masm.movl(gpr2, 0); + masm.shlq(gpr8, 32); + masm.orq(gpr8, gpr10); + + masm.bind(bb6); + + masm.cmpq(gpr8, 0); + masm.jcc(ConditionFlag.Equal, bb8); + + masm.bind(bb9); + masm.bsrq(gpr10, gpr8); + masm.movl(gpr3, 29); + masm.subl(gpr3, gpr10); + masm.jcc(ConditionFlag.LessEqual, bb10); + + masm.shlq(gpr8); + masm.movq(gpr1, gpr9); + masm.shlq(gpr9); + masm.addl(gpr4, gpr3); + masm.negl(gpr3); + masm.addl(gpr3, 64); + masm.shrq(gpr1); + masm.shrq(gpr7); + masm.orq(gpr8, gpr1); + masm.orq(gpr9, gpr7); + + masm.bind(bb11); + masm.cvtsi2sdq(dest, gpr8); + masm.shrq(gpr9, 1); + masm.cvtsi2sdq(temp3, gpr9); + masm.xorpd(temp4, temp4); + masm.shll(gpr4, 4); + masm.negl(gpr4); + masm.addl(gpr4, 16368); + masm.orl(gpr4, gpr5); + masm.xorl(gpr4, gpr2); + masm.pinsrw(temp4, gpr4, 3); + masm.leaq(gpr1, externalAddress(piFourPtr)); + masm.movdqu(temp2, new
AMD64Address(gpr1, 0)); // 0x40000000, + // 0x3fe921fb, + // 0x18469899, + // 0x3e64442d + masm.xorpd(temp5, temp5); + masm.subl(gpr4, 1008); + masm.pinsrw(temp5, gpr4, 3); + masm.mulsd(dest, temp4); + masm.shll(gpr5, 16); + masm.sarl(gpr5, 31); + masm.mulsd(temp3, temp5); + masm.movdqu(temp1, dest); + masm.pshufd(temp6, temp2, 0xE); + masm.mulsd(dest, temp2); + masm.shrl(gpr6, 29); + masm.addsd(temp1, temp3); + masm.mulsd(temp3, temp2); + masm.addl(gpr6, gpr5); + masm.xorl(gpr6, gpr5); + masm.mulsd(temp6, temp1); + masm.movl(gpr1, gpr6); + masm.addsd(temp6, temp3); + masm.movdqu(temp2, dest); + masm.addsd(dest, temp6); + masm.subsd(temp2, dest); + masm.addsd(temp6, temp2); + + /* bb12: table-driven evaluation for inputs reduced at bb1; dest is the reduced argument */ + /* and temp6 its low-order part (both are zero when coming from bb8). */ + masm.bind(bb12); + masm.movdq(temp1, externalAddress(piThirtyTwoInvPtr)); // 0x6dc9c883, + // 0x40245f30 + masm.mulsd(temp1, dest); + masm.movdq(temp5, externalAddress(oneHalfPtr)); // 0x00000000, + // 0x3fe00000, + // 0x00000000, + // 0x3fe00000 + masm.movdq(temp4, externalAddress(signMaskPtr)); // 0x00000000, + // 0x80000000 + masm.pand(temp4, dest); + masm.por(temp5, temp4); + masm.addpd(temp1, temp5); + masm.cvttsd2sil(gpr4, temp1); + masm.cvtsi2sdl(temp1, gpr4); + masm.movdq(temp3, externalAddress(pOnePtr)); // 0x54400000, + // 0x3fb921fb + masm.movdqu(temp2, externalAddress(pTwoPtr)); // 0x1a600000, + // 0x3d90b461, + // 0x1a600000, + // 0x3d90b461 + masm.mulsd(temp3, temp1); + masm.unpcklpd(temp1, temp1); + masm.shll(gpr1, 3); + masm.addl(gpr4, 1865216); + masm.movdqu(temp4, dest); + masm.addl(gpr4, gpr1); + masm.andl(gpr4, 63); + masm.movdqu(temp5, externalAddress(scFourPtr)); // 0xa556c734, + // 0x3ec71de3, + // 0x1a01a01a, + // 0x3efa01a0 + masm.leaq(gpr1, externalAddress(cTablePtr)); + masm.shll(gpr4, 5); + masm.addq(gpr1, gpr4); + masm.movdqu(temp8, new AMD64Address(gpr1, 0)); + masm.mulpd(temp2, temp1); + masm.subsd(dest, temp3); + masm.mulsd(temp1, externalAddress(pThreePtr)); // 0x2e037073, + // 0x3b63198a + masm.subsd(temp4, temp3); + masm.unpcklpd(dest, dest); + masm.movdqu(temp3, temp4); + masm.subsd(temp4,
temp2); + masm.mulpd(temp5, dest); + masm.subpd(dest, temp2); + masm.pshufd(temp7, temp8, 0xE); + masm.movdqu(temp9, temp7); + masm.mulsd(temp7, temp4); + masm.subsd(temp3, temp4); + masm.mulpd(temp5, dest); + masm.mulpd(dest, dest); + masm.subsd(temp3, temp2); + masm.movdqu(temp2, temp8); + masm.subsd(temp1, temp3); + masm.movdq(temp3, new AMD64Address(gpr1, 24)); + masm.addsd(temp2, temp3); + masm.subsd(temp7, temp2); + masm.subsd(temp1, temp6); + masm.movdqu(temp6, externalAddress(scTwoPtr)); // 0x11111111, + // 0x3f811111, + // 0x55555555, + // 0x3fa55555 + masm.mulsd(temp2, temp4); + masm.mulpd(temp6, dest); + masm.mulsd(temp3, temp4); + masm.mulpd(temp2, dest); + masm.mulpd(dest, dest); + masm.addpd(temp5, externalAddress(scThreePtr)); // 0x1a01a01a, + // 0xbf2a01a0, + // 0x16c16c17, + // 0xbf56c16c + masm.mulsd(temp4, temp8); + masm.addpd(temp6, externalAddress(scOnePtr)); // 0x55555555, + // 0xbfc55555, + // 0x00000000, + // 0xbfe00000 + masm.mulpd(temp5, dest); + masm.movdqu(dest, temp3); + masm.addsd(temp3, temp9); + masm.mulpd(temp1, temp7); + masm.movdqu(temp7, temp4); + masm.addsd(temp4, temp3); + masm.addpd(temp6, temp5); + masm.subsd(temp9, temp3); + masm.subsd(temp3, temp4); + masm.addsd(temp1, new AMD64Address(gpr1, 16)); + masm.mulpd(temp6, temp2); + masm.addsd(temp9, dest); + masm.addsd(temp3, temp7); + masm.addsd(temp1, temp9); + masm.addsd(temp1, temp3); + masm.addsd(temp1, temp6); + masm.unpckhpd(temp6, temp6); + masm.movdqu(dest, temp4); + masm.addsd(temp1, temp6); + masm.addsd(dest, temp1); + masm.jmp(bb15); + + /* bb8: the top word gpr8 is zero: move the lower words up by 64 bits (at most twice); if */ + /* they are all zero the reduced argument is 0. */ + masm.bind(bb8); + masm.addl(gpr4, 64); + masm.movq(gpr8, gpr9); + masm.movq(gpr9, gpr7); + masm.movl(gpr7, 0); + masm.cmpq(gpr8, 0); + masm.jcc(ConditionFlag.NotEqual, bb9); + + masm.addl(gpr4, 64); + masm.movq(gpr8, gpr9); + masm.movq(gpr9, gpr7); + masm.cmpq(gpr8, 0); + masm.jcc(ConditionFlag.NotEqual, bb9); + + masm.xorpd(dest, dest); + masm.xorpd(temp6, temp6); + masm.jmp(bb12); + + masm.bind(bb10); +
masm.jcc(ConditionFlag.Equal, bb11); + + masm.negl(gpr3); + masm.shrq(gpr9); + masm.movq(gpr1, gpr8); + masm.shrq(gpr8); + masm.subl(gpr4, gpr3); + masm.negl(gpr3); + masm.addl(gpr3, 64); + masm.shlq(gpr1); + masm.orq(gpr9, gpr1); + masm.jmp(bb11); + + masm.bind(bb4); + masm.negl(gpr3); + masm.shlq(gpr8, 32); + masm.orq(gpr8, gpr10); + masm.shlq(gpr8); + masm.movq(gpr6, gpr8); + masm.testl(gpr8, Integer.MIN_VALUE); + masm.jcc(ConditionFlag.NotEqual, bb13); + + masm.shrl(gpr8); + masm.movl(gpr2, 0); + masm.shrq(gpr6, 3); + masm.jmp(bb6); + + masm.bind(bb5); + masm.shrl(gpr8); + masm.movl(gpr2, 536870912); + masm.shrl(gpr2); + masm.shlq(gpr8, 32); + masm.orq(gpr8, gpr10); + masm.shlq(gpr2, 32); + masm.addl(gpr6, 536870912); + masm.movl(gpr3, 0); + masm.movl(gpr10, 0); + masm.subq(gpr3, gpr7); + masm.sbbq(gpr10, gpr9); + masm.sbbq(gpr2, gpr8); + masm.movq(gpr7, gpr3); + masm.movq(gpr9, gpr10); + masm.movq(gpr8, gpr2); + masm.movl(gpr2, 32768); + masm.jmp(bb6); + + masm.bind(bb13); + masm.shrl(gpr8); + masm.movq(gpr2, 0x100000000L); + masm.shrq(gpr2); + masm.movl(gpr3, 0); + masm.movl(gpr10, 0); + masm.subq(gpr3, gpr7); + masm.sbbq(gpr10, gpr9); + masm.sbbq(gpr2, gpr8); + masm.movq(gpr7, gpr3); + masm.movq(gpr9, gpr10); + masm.movq(gpr8, gpr2); + masm.movl(gpr2, 32768); + masm.shrq(gpr6, 3); + masm.addl(gpr6, 536870912); + masm.jmp(bb6); + + masm.bind(bb15); + } + + /* + * Copyright (c) 2014, 2016, Intel Corporation. All rights reserved. Intel Math Library (LIBM) + * Source Code + * + * ALGORITHM DESCRIPTION - COS() --------------------- + * + * 1. RANGE REDUCTION + * + * We perform an initial range reduction from X to r with + * + * X =~= N * pi/32 + r + * + * so that |r| <= pi/64 + epsilon. We restrict inputs to those where |N| <= 932560. Beyond this, + * the range reduction is insufficiently accurate. For extremely small inputs, denormalization + * can occur internally, impacting performance.
This means that the main path is actually only + * taken for 2^-252 <= |X| < 90112. + * + * To avoid branches, we perform the range reduction to full accuracy each time. + * + * X - N * (P_1 + P_2 + P_3) + * + * where P_1 and P_2 are 32-bit numbers (so multiplication by N is exact) and P_3 is a 53-bit + * number. Together, these approximate pi/32 well enough for all cases in the restricted range. + * + * The main reduction sequence is: + * + * y = 32/pi * x; N = integer(y) (computed by adding and subtracting off SHIFTER) + * + * m_1 = N * P_1; m_2 = N * P_2; r_1 = x - m_1; r = r_1 - m_2 (this r can be used for most of + * the calculation) + * + * c_1 = r_1 - r; m_3 = N * P_3; c_2 = c_1 - m_2; c = c_2 - m_3 + * + * 2. MAIN ALGORITHM + * + * The algorithm uses a table lookup based on B = M * pi / 32 where M = N mod 64. The stored + * values are: sigma, the closest power of 2 to cos(B); C_hl, the 53-bit value of cos(B) - sigma; + * and S_hi + S_lo, a 2 * 53-bit representation of sin(B). + * + * The computation is organized as follows: + * + * sin(B + r + c) = [sin(B) + sigma * r] + r * (cos(B) - sigma) + sin(B) * [cos(r + c) - 1] + + * cos(B) * [sin(r + c) - r] + * + * which is approximately: + * + * [S_hi + sigma * r] + C_hl * r + S_lo + S_hi * [(cos(r) - 1) - r * c] + (C_hl + sigma) * + * [(sin(r) - r) + c] + * + * and this is what is actually computed. We separate this sum into four parts: + * + * hi + med + pols + corr + * + * where + * + * hi = S_hi + sigma * r; med = C_hl * r; + * pols = S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r); + * corr = S_lo + c * ((C_hl + sigma) - S_hi * r) + * + * 3. POLYNOMIAL + * + * The polynomial S_hi * (cos(r) - 1) + (C_hl + sigma) * (sin(r) - r) can be rearranged freely, + * since it is quite small, so we exploit parallelism to the fullest. + * + * psc4 = SC_4 * r_1; msc4 = psc4 * r; r2 = r * r; msc2 = SC_2 * r2; r4 = r2 * r2; + * psc3 = SC_3 + msc4; psc1 = SC_1 + msc2; msc3 = r4 * psc3; sincospols = psc1 + msc3; + * pols = sincospols * [S_hi * r^2 | (C_hl + sigma) * r^3] (a packed pair, multiplied lane by + * lane) + * + * 4.
CORRECTION TERM + * + * This is where the "c" component of the range reduction is taken into account; recall that + * just "r" is used for most of the calculation. + * + * -c = m_3 - c_2 -d = S_hi * r - (C_hl + sigma) corr = -c * -d + S_lo + * + * 5. COMPENSATED SUMMATIONS + * + * The two successive compensated summations add up the high and medium parts, leaving just the + * low parts to add up at the end. + * + * rs = sigma * r res_int = S_hi + rs k_0 = S_hi - res_int k_2 = k_0 + rs med = C_hl * r res_hi + * = res_int + med k_1 = res_int - res_hi k_3 = k_1 + med + * + * 6. FINAL SUMMATION + * + * We now add up all the small parts: + * + * res_lo = pols(hi) + pols(lo) + corr + k_1 + k_3 + * + * Now the overall result is just: + * + * res_hi + res_lo + * + * 7. SMALL ARGUMENTS + * + * Inputs with |X| < 2^-252 are treated specially as 1 - |x|. + * + * Special cases: cos(NaN) = quiet NaN, and raise invalid exception cos(INF) = NaN and raise + * invalid exception cos(0) = 1 + * + */ + /* Bit pattern of the double 1.0 as {low dword, high dword}; loaded through onePtr in cosIntrinsic. */ + public int[] one = { + 0x00000000, 0x3ff00000 + }; + /** + * Emits the code sequence for the COS intrinsic described in the block comment above. + * + * @param dest XMM register that receives the result; it is also used as a working register + * @param value XMM register holding the argument; it is spilled to the stack slot and copied into + * {@code dest} when the two registers differ + * @param crb compilation result builder used to resolve {@code stackTemp} and passed to + * {@code setCrb} + * @param masm assembler that receives the emitted instructions + */ + public void cosIntrinsic(Register dest, Register value, CompilationResultBuilder crb, AMD64MacroAssembler masm) { + ArrayDataPointerConstant oneHalfPtr = new ArrayDataPointerConstant(oneHalf, 16); + ArrayDataPointerConstant pTwoPtr = new ArrayDataPointerConstant(pTwo, 16); + ArrayDataPointerConstant scFourPtr = new ArrayDataPointerConstant(scFour, 16); + ArrayDataPointerConstant cTablePtr = new ArrayDataPointerConstant(cTable, 16); + ArrayDataPointerConstant scTwoPtr = new ArrayDataPointerConstant(scTwo, 16); + ArrayDataPointerConstant scThreePtr = new ArrayDataPointerConstant(scThree, 16); + ArrayDataPointerConstant scOnePtr = new ArrayDataPointerConstant(scOne, 16); + ArrayDataPointerConstant piInvTablePtr = new ArrayDataPointerConstant(piInvTable, 16); + ArrayDataPointerConstant piFourPtr = new ArrayDataPointerConstant(piFour, 16); + ArrayDataPointerConstant piThirtyTwoInvPtr = new ArrayDataPointerConstant(piThirtyTwoInv, 8); + 
ArrayDataPointerConstant signMaskPtr = new ArrayDataPointerConstant(signMask, 8); + ArrayDataPointerConstant pThreePtr = new ArrayDataPointerConstant(pThree, 8); + ArrayDataPointerConstant pOnePtr = new ArrayDataPointerConstant(pOne, 8); + ArrayDataPointerConstant onePtr = new ArrayDataPointerConstant(one, 8); + /* Branch targets; they are bound further down in emission order, which is not numeric order. */ + Label bb0 = new Label(); + Label bb1 = new Label(); + Label bb3 = new Label(); + Label bb4 = new Label(); + Label bb5 = new Label(); + Label bb6 = new Label(); + Label bb7 = new Label(); + Label bb8 = new Label(); + Label bb9 = new Label(); + Label bb10 = new Label(); + Label bb11 = new Label(); + Label bb12 = new Label(); + Label bb13 = new Label(); + Label bb14 = new Label(); + /* + * Scratch general-purpose registers taken from the temp operands. gpr3 comes from rcxTemp. + * NOTE(review): presumably because the shift forms without an immediate count used below + * (e.g. shll(gpr8)) shift by CL - confirm against AMD64MacroAssembler. + */ + Register gpr1 = asRegister(gpr1Temp, AMD64Kind.QWORD); + Register gpr2 = asRegister(gpr2Temp, AMD64Kind.QWORD); + Register gpr3 = asRegister(rcxTemp, AMD64Kind.QWORD); + Register gpr4 = asRegister(gpr4Temp, AMD64Kind.QWORD); + Register gpr5 = asRegister(gpr5Temp, AMD64Kind.QWORD); + Register gpr6 = asRegister(gpr6Temp, AMD64Kind.QWORD); + Register gpr7 = asRegister(gpr7Temp, AMD64Kind.QWORD); + Register gpr8 = asRegister(gpr8Temp, AMD64Kind.QWORD); + Register gpr9 = asRegister(gpr9Temp, AMD64Kind.QWORD); + Register gpr10 = asRegister(gpr10Temp, AMD64Kind.QWORD); + /* Scratch XMM registers, also taken from the temp operands. */ + Register temp1 = asRegister(xmm1Temp, AMD64Kind.DOUBLE); + Register temp2 = asRegister(xmm2Temp, AMD64Kind.DOUBLE); + Register temp3 = asRegister(xmm3Temp, AMD64Kind.DOUBLE); + Register temp4 = asRegister(xmm4Temp, AMD64Kind.DOUBLE); + Register temp5 = asRegister(xmm5Temp, AMD64Kind.DOUBLE); + Register temp6 = asRegister(xmm6Temp, AMD64Kind.DOUBLE); + Register temp7 = asRegister(xmm7Temp, AMD64Kind.DOUBLE); + Register temp8 = asRegister(xmm8Temp, AMD64Kind.DOUBLE); + Register temp9 = asRegister(xmm9Temp, AMD64Kind.DOUBLE); + /* Stack slot through which the argument's bits are read back with an integer load. */ + AMD64Address stackSlot = (AMD64Address) crb.asAddress(stackTemp); + /* Spill the argument to the stack slot and make sure dest holds a copy of it. */ + setCrb(crb); + masm.movdq(stackSlot, value); + if (dest.encoding != value.encoding) { + masm.movdqu(dest, value); + } + /* gpr1 = bits 32..63 of the argument: sign, exponent and the upper mantissa bits. */ + 
masm.leaq(gpr1, stackSlot); + masm.movl(gpr1, new AMD64Address(gpr1, 4)); + masm.movdq(temp1, externalAddress(piThirtyTwoInvPtr)); // 0x6dc9c883, + // 0x40245f30 + /* + * Range check on the exponent and the top four mantissa bits: 2147418112 = 0x7FFF0000, + * 808452096 = 0x30300000, 281346048 = 0x10C50000. The unsigned compare sends every argument + * outside the main-path range given in the header comment (2^-252 <= |X| < 90112) to bb0. + */ + masm.andl(gpr1, 2147418112); + masm.subl(gpr1, 808452096); + masm.cmpl(gpr1, 281346048); + masm.jcc(ConditionFlag.Above, bb0); + /* + * Main path. N is formed by multiplying by the piThirtyTwoInv constant, adding 0.5 with the + * sign of the argument and truncating; N plus 1865232, masked with 63, then selects a 32-byte + * entry of cTable. 1865232 is congruent to 16 modulo 64. + */ + masm.mulsd(temp1, dest); + masm.movdqu(temp5, externalAddress(oneHalfPtr)); // 0x00000000, + // 0x3fe00000, + // 0x00000000, + // 0x3fe00000 + masm.movdq(temp4, externalAddress(signMaskPtr)); // 0x00000000, + // 0x80000000 + masm.pand(temp4, dest); + masm.por(temp5, temp4); + masm.addpd(temp1, temp5); + masm.cvttsd2sil(gpr4, temp1); + masm.cvtsi2sdl(temp1, gpr4); + masm.movdqu(temp2, externalAddress(pTwoPtr)); // 0x1a600000, + // 0x3d90b461, + // 0x1a600000, + // 0x3d90b461 + masm.movdq(temp3, externalAddress(pOnePtr)); // 0x54400000, + // 0x3fb921fb + masm.mulsd(temp3, temp1); + masm.unpcklpd(temp1, temp1); + masm.addq(gpr4, 1865232); + masm.movdqu(temp4, dest); + masm.andq(gpr4, 63); + masm.movdqu(temp5, externalAddress(scFourPtr)); // 0xa556c734, + // 0x3ec71de3, + // 0x1a01a01a, + // 0x3efa01a0 + masm.leaq(gpr1, externalAddress(cTablePtr)); + masm.shlq(gpr4, 5); + masm.addq(gpr1, gpr4); + masm.movdqu(temp8, new AMD64Address(gpr1, 0)); + masm.mulpd(temp2, temp1); + masm.subsd(dest, temp3); + masm.mulsd(temp1, externalAddress(pThreePtr)); // 0x2e037073, + // 0x3b63198a + masm.subsd(temp4, temp3); + masm.unpcklpd(dest, dest); + masm.movdqu(temp3, temp4); + masm.subsd(temp4, temp2); + masm.mulpd(temp5, dest); + masm.subpd(dest, temp2); + masm.pshufd(temp7, temp8, 0xE); + masm.movdqu(temp6, externalAddress(scTwoPtr)); // 0x11111111, + // 0x3f811111, + // 0x55555555, + // 0x3fa55555 + masm.mulsd(temp7, temp4); + masm.subsd(temp3, temp4); + masm.mulpd(temp5, dest); + masm.mulpd(dest, dest); + masm.subsd(temp3, temp2); + masm.movdqu(temp2, temp8); + masm.subsd(temp1, temp3); + masm.movdq(temp3, new AMD64Address(gpr1, 24)); + masm.addsd(temp2, temp3); + masm.subsd(temp7, temp2); 
+ masm.mulsd(temp2, temp4); + masm.mulpd(temp6, dest); + masm.mulsd(temp3, temp4); + masm.mulpd(temp2, dest); + masm.mulpd(dest, dest); + masm.addpd(temp5, externalAddress(scThreePtr)); // 0x1a01a01a, + // 0xbf2a01a0, + // 0x16c16c17, + // 0xbf56c16c + masm.mulsd(temp4, temp8); + masm.pshufd(temp9, temp8, 0xE); + masm.addpd(temp6, externalAddress(scOnePtr)); // 0x55555555, + // 0xbfc55555, + // 0x00000000, + // 0xbfe00000 + masm.mulpd(temp5, dest); + masm.movdqu(dest, temp3); + masm.addsd(temp3, temp9); + masm.mulpd(temp1, temp7); + masm.movdqu(temp7, temp4); + masm.addsd(temp4, temp3); + masm.addpd(temp6, temp5); + masm.subsd(temp9, temp3); + masm.subsd(temp3, temp4); + masm.addsd(temp1, new AMD64Address(gpr1, 16)); + masm.mulpd(temp6, temp2); + masm.addsd(dest, temp9); + masm.addsd(temp3, temp7); + masm.addsd(dest, temp1); + masm.addsd(dest, temp3); + masm.addsd(dest, temp6); + masm.unpckhpd(temp6, temp6); + masm.addsd(dest, temp6); + masm.addsd(dest, temp4); + masm.jmp(bb13); + /* Inf or NaN argument (reached from bb1): the result is 0.0 / 0.0. */ + masm.bind(bb14); + masm.xorpd(temp1, temp1); + masm.xorpd(dest, dest); + masm.divsd(dest, temp1); + masm.jmp(bb13); + /* + * Outside the main-path range. The flags of the cmpl above are still live: signed "greater" + * means a large argument (bb1), anything else is a tiny argument. + */ + masm.bind(bb0); + masm.jcc(ConditionFlag.Greater, bb1); + /* Tiny argument: clear the sign bit (32767 = 0x7FFF on the top word) and return 1.0 - |x|. */ + masm.pextrw(gpr1, dest, 3); + masm.andl(gpr1, 32767); + masm.pinsrw(dest, gpr1, 3); + masm.movdq(temp1, externalAddress(onePtr)); // 0x00000000, + // 0x3ff00000 + masm.subsd(temp1, dest); + masm.movdqu(dest, temp1); + masm.jmp(bb13); + /* Large argument: an all-ones exponent field (32752 = 0x7FF0) goes to bb14. */ + masm.bind(bb1); + masm.pextrw(gpr3, dest, 3); + masm.andl(gpr3, 32752); + masm.cmpl(gpr3, 32752); + masm.jcc(ConditionFlag.Equal, bb14); + /* + * Finite large argument: select a window of 32-bit words of piInvTable from the exponent and + * multiply it, word by word, by the significand, which is split into gpr4 (low 32 bits) and + * gpr1 (upper 20 mantissa bits with the implicit leading one set). + */ + masm.subl(gpr3, 16224); + masm.shrl(gpr3, 7); + masm.andl(gpr3, 65532); + masm.leaq(gpr10, externalAddress(piInvTablePtr)); + masm.addq(gpr3, gpr10); + masm.movdq(gpr1, dest); + masm.movl(gpr9, new AMD64Address(gpr3, 20)); + masm.movl(gpr7, new AMD64Address(gpr3, 24)); + masm.movl(gpr4, gpr1); + masm.shrq(gpr1, 21); + masm.orl(gpr1, Integer.MIN_VALUE); + masm.shrl(gpr1, 11); + masm.movl(gpr8, gpr9); + masm.imulq(gpr9, gpr4); + 
masm.imulq(gpr8, gpr1); + masm.imulq(gpr7, gpr1); + masm.movl(gpr5, new AMD64Address(gpr3, 16)); + masm.movl(gpr6, new AMD64Address(gpr3, 12)); + masm.movl(gpr10, gpr9); + masm.shrq(gpr9, 32); + masm.addq(gpr8, gpr9); + masm.addq(gpr10, gpr7); + masm.movl(gpr7, gpr10); + masm.shrq(gpr10, 32); + masm.addq(gpr8, gpr10); + masm.movl(gpr9, gpr5); + masm.imulq(gpr5, gpr4); + masm.imulq(gpr9, gpr1); + masm.movl(gpr10, gpr6); + masm.imulq(gpr6, gpr4); + masm.movl(gpr2, gpr5); + masm.shrq(gpr5, 32); + masm.addq(gpr8, gpr2); + masm.movl(gpr2, gpr8); + masm.shrq(gpr8, 32); + masm.addq(gpr9, gpr5); + masm.addq(gpr9, gpr8); + masm.shlq(gpr2, 32); + masm.orq(gpr7, gpr2); + masm.imulq(gpr10, gpr1); + masm.movl(gpr8, new AMD64Address(gpr3, 8)); + masm.movl(gpr5, new AMD64Address(gpr3, 4)); + masm.movl(gpr2, gpr6); + masm.shrq(gpr6, 32); + masm.addq(gpr9, gpr2); + masm.movl(gpr2, gpr9); + masm.shrq(gpr9, 32); + masm.addq(gpr10, gpr6); + masm.addq(gpr10, gpr9); + masm.movq(gpr6, gpr8); + masm.imulq(gpr8, gpr4); + masm.imulq(gpr6, gpr1); + masm.movl(gpr9, gpr8); + masm.shrq(gpr8, 32); + masm.addq(gpr10, gpr9); + masm.movl(gpr9, gpr10); + masm.shrq(gpr10, 32); + masm.addq(gpr6, gpr8); + masm.addq(gpr6, gpr10); + masm.movq(gpr8, gpr5); + masm.imulq(gpr5, gpr4); + masm.imulq(gpr8, gpr1); + masm.shlq(gpr9, 32); + masm.orq(gpr9, gpr2); + masm.movl(gpr1, new AMD64Address(gpr3, 0)); + masm.movl(gpr10, gpr5); + masm.shrq(gpr5, 32); + masm.addq(gpr6, gpr10); + masm.movl(gpr10, gpr6); + masm.shrq(gpr6, 32); + masm.addq(gpr8, gpr5); + masm.addq(gpr8, gpr6); + masm.imulq(gpr4, gpr1); + /* From here on: gpr5 = sign bit of the argument's top word, gpr2 = unbiased exponent. */ masm.pextrw(gpr2, dest, 3); + masm.leaq(gpr6, externalAddress(piInvTablePtr)); + masm.subq(gpr3, gpr6); + masm.addl(gpr3, gpr3); + masm.addl(gpr3, gpr3); + masm.addl(gpr3, gpr3); + masm.addl(gpr3, 19); + masm.movl(gpr5, 32768); + masm.andl(gpr5, gpr2); + masm.shrl(gpr2, 4); + masm.andl(gpr2, 2047); + masm.subl(gpr2, 1023); + masm.subl(gpr3, gpr2); + masm.addq(gpr8, gpr4); + masm.movl(gpr4, gpr3); + 
masm.addl(gpr4, 32); + masm.cmpl(gpr3, 1); + masm.jcc(ConditionFlag.Less, bb3); + + masm.negl(gpr3); + masm.addl(gpr3, 29); + masm.shll(gpr8); + masm.movl(gpr6, gpr8); + masm.andl(gpr8, 536870911); + masm.testl(gpr8, 268435456); + masm.jcc(ConditionFlag.NotEqual, bb4); + + masm.shrl(gpr8); + masm.movl(gpr2, 0); + masm.shlq(gpr8, 32); + masm.orq(gpr8, gpr10); + + masm.bind(bb5); + /* bb5 and bb6 are bound at the same position. */ + masm.bind(bb6); + masm.cmpq(gpr8, 0); + masm.jcc(ConditionFlag.Equal, bb7); + /* + * Normalization: gpr3 = 29 - (index of the highest set bit of gpr8). A positive value takes + * the left-shift sequence that follows; zero or negative goes to bb9. + */ + masm.bind(bb8); + masm.bsrq(gpr10, gpr8); + masm.movl(gpr3, 29); + masm.subl(gpr3, gpr10); + masm.jcc(ConditionFlag.LessEqual, bb9); + + masm.shlq(gpr8); + masm.movq(gpr1, gpr9); + masm.shlq(gpr9); + masm.addl(gpr4, gpr3); + masm.negl(gpr3); + masm.addl(gpr3, 64); + masm.shrq(gpr1); + masm.shrq(gpr7); + masm.orq(gpr8, gpr1); + masm.orq(gpr9, gpr7); + /* + * Convert the two upper words to doubles, scale them with factors whose exponent words are + * built in gpr4 and inserted with pinsrw, and multiply by the two-part constant from piFour. + */ + masm.bind(bb10); + masm.cvtsi2sdq(dest, gpr8); + masm.shrq(gpr9, 1); + masm.cvtsi2sdq(temp3, gpr9); + masm.xorpd(temp4, temp4); + masm.shll(gpr4, 4); + masm.negl(gpr4); + masm.addl(gpr4, 16368); + masm.orl(gpr4, gpr5); + masm.xorl(gpr4, gpr2); + masm.pinsrw(temp4, gpr4, 3); + masm.leaq(gpr2, externalAddress(piFourPtr)); + masm.movdqu(temp2, new AMD64Address(gpr2, 0)); // 0x40000000, + // 0x3fe921fb, + // 0x18469899, + // 0x3e64442d + masm.xorpd(temp5, temp5); + masm.subl(gpr4, 1008); + masm.pinsrw(temp5, gpr4, 3); + masm.mulsd(dest, temp4); + masm.shll(gpr5, 16); + masm.sarl(gpr5, 31); + masm.mulsd(temp3, temp5); + masm.movdqu(temp1, dest); + masm.mulsd(dest, temp2); + masm.pshufd(temp6, temp2, 0xE); + masm.shrl(gpr6, 29); + masm.addsd(temp1, temp3); + masm.mulsd(temp3, temp2); + masm.addl(gpr6, gpr5); + masm.xorl(gpr6, gpr5); + masm.mulsd(temp6, temp1); + masm.movl(gpr1, gpr6); + masm.addsd(temp6, temp3); + masm.movdqu(temp2, dest); + masm.addsd(dest, temp6); + masm.subsd(temp2, dest); + masm.addsd(temp6, temp2); + /* + * Final reduction of the value now in dest. The sequence mirrors the main path, with + * 8 * gpr1 added to the table index before it is masked with 63. + */ + masm.bind(bb11); + masm.movq(temp1, externalAddress(piThirtyTwoInvPtr)); // 0x6dc9c883, + // 0x40245f30 + masm.mulsd(temp1, dest); + 
masm.movdq(temp5, externalAddress(oneHalfPtr)); // 0x00000000, + // 0x3fe00000, + // 0x00000000, + // 0x3fe00000 + masm.movdq(temp4, externalAddress(signMaskPtr)); // 0x00000000, + // 0x80000000 + masm.pand(temp4, dest); + masm.por(temp5, temp4); + masm.addpd(temp1, temp5); + masm.cvttsd2siq(gpr4, temp1); + masm.cvtsi2sdq(temp1, gpr4); + masm.movdq(temp3, externalAddress(pOnePtr)); // 0x54400000, + // 0x3fb921fb + masm.movdqu(temp2, externalAddress(pTwoPtr)); // 0x1a600000, + // 0x3d90b461, + // 0x1a600000, + // 0x3d90b461 + masm.mulsd(temp3, temp1); + masm.unpcklpd(temp1, temp1); + masm.shll(gpr1, 3); + masm.addl(gpr4, 1865232); + masm.movdqu(temp4, dest); + masm.addl(gpr4, gpr1); + masm.andl(gpr4, 63); + masm.movdqu(temp5, externalAddress(scFourPtr)); // 0xa556c734, + // 0x3ec71de3, + // 0x1a01a01a, + // 0x3efa01a0 + masm.leaq(gpr1, externalAddress(cTablePtr)); + masm.shll(gpr4, 5); + masm.addq(gpr1, gpr4); + masm.movdqu(temp8, new AMD64Address(gpr1, 0)); + masm.mulpd(temp2, temp1); + masm.subsd(dest, temp3); + masm.mulsd(temp1, externalAddress(pThreePtr)); // 0x2e037073, + // 0x3b63198a + masm.subsd(temp4, temp3); + masm.unpcklpd(dest, dest); + masm.movdqu(temp3, temp4); + masm.subsd(temp4, temp2); + masm.mulpd(temp5, dest); + masm.pshufd(temp7, temp8, 0xE); + masm.movdqu(temp9, temp7); + masm.subpd(dest, temp2); + masm.mulsd(temp7, temp4); + masm.subsd(temp3, temp4); + masm.mulpd(temp5, dest); + masm.mulpd(dest, dest); + masm.subsd(temp3, temp2); + masm.movdqu(temp2, temp8); + masm.subsd(temp1, temp3); + masm.movdq(temp3, new AMD64Address(gpr1, 24)); + masm.addsd(temp2, temp3); + masm.subsd(temp7, temp2); + masm.subsd(temp1, temp6); + masm.movdqu(temp6, externalAddress(scTwoPtr)); // 0x11111111, + // 0x3f811111, + // 0x55555555, + // 0x3fa55555 + masm.mulsd(temp2, temp4); + masm.mulpd(temp6, dest); + masm.mulsd(temp3, temp4); + masm.mulpd(temp2, dest); + masm.mulpd(dest, dest); + masm.addpd(temp5, externalAddress(scThreePtr)); // 0x1a01a01a, + // 0xbf2a01a0, + 
// 0x16c16c17, + // 0xbf56c16c + masm.mulsd(temp4, temp8); + masm.addpd(temp6, externalAddress(scOnePtr)); // 0x55555555, + // 0xbfc55555, + // 0x00000000, + // 0xbfe00000 + masm.mulpd(temp5, dest); + masm.movdqu(dest, temp3); + masm.addsd(temp3, temp9); + masm.mulpd(temp1, temp7); + masm.movdqu(temp7, temp4); + masm.addsd(temp4, temp3); + masm.addpd(temp6, temp5); + masm.subsd(temp9, temp3); + masm.subsd(temp3, temp4); + masm.addsd(temp1, new AMD64Address(gpr1, 16)); + masm.mulpd(temp6, temp2); + masm.addsd(temp9, dest); + masm.addsd(temp3, temp7); + masm.addsd(temp1, temp9); + masm.addsd(temp1, temp3); + masm.addsd(temp1, temp6); + masm.unpckhpd(temp6, temp6); + masm.movdqu(dest, temp4); + masm.addsd(temp1, temp6); + masm.addsd(dest, temp1); + masm.jmp(bb13); + /* + * gpr8 is zero: move the lower words up by 64 bits (adding 64 to gpr4), at most twice. If + * gpr8 is still zero, continue at bb11 with dest and temp6 cleared. + */ + masm.bind(bb7); + masm.addl(gpr4, 64); + masm.movq(gpr8, gpr9); + masm.movq(gpr9, gpr7); + masm.movl(gpr7, 0); + masm.cmpq(gpr8, 0); + masm.jcc(ConditionFlag.NotEqual, bb8); + + masm.addl(gpr4, 64); + masm.movq(gpr8, gpr9); + masm.movq(gpr9, gpr7); + masm.cmpq(gpr8, 0); + masm.jcc(ConditionFlag.NotEqual, bb8); + + masm.xorpd(dest, dest); + masm.xorpd(temp6, temp6); + masm.jmp(bb11); + /* + * gpr3 is zero or negative here (flags of the subl in bb8): zero needs no shift and goes to + * bb10; otherwise the value is shifted right, presumably by -gpr3 bits. + */ + masm.bind(bb9); + masm.jcc(ConditionFlag.Equal, bb10); + + masm.negl(gpr3); + masm.shrq(gpr9); + masm.movq(gpr1, gpr8); + masm.shrq(gpr8); + masm.subl(gpr4, gpr3); + masm.negl(gpr3); + masm.addl(gpr3, 64); + masm.shlq(gpr1); + masm.orq(gpr9, gpr1); + masm.jmp(bb10); + + masm.bind(bb3); + masm.negl(gpr3); + masm.shlq(gpr8, 32); + masm.orq(gpr8, gpr10); + masm.shlq(gpr8); + masm.movq(gpr6, gpr8); + masm.testl(gpr8, Integer.MIN_VALUE); + masm.jcc(ConditionFlag.NotEqual, bb12); + + masm.shrl(gpr8); + masm.movl(gpr2, 0); + masm.shrq(gpr6, 3); + masm.jmp(bb6); + /* + * Replace the three-word value gpr8:gpr9:gpr7 by (gpr2:0:0) - value with a subq/sbbq borrow + * chain, then set gpr2 to 32768, which is XORed into the exponent word built at bb10. + */ + masm.bind(bb4); + masm.shrl(gpr8); + masm.movl(gpr2, 536870912); + masm.shrl(gpr2); + masm.shlq(gpr8, 32); + masm.orq(gpr8, gpr10); + masm.shlq(gpr2, 32); + masm.addl(gpr6, 536870912); + masm.movl(gpr3, 0); + masm.movl(gpr10, 0); + masm.subq(gpr3, gpr7); + 
masm.sbbq(gpr10, gpr9); + masm.sbbq(gpr2, gpr8); + masm.movq(gpr7, gpr3); + masm.movq(gpr9, gpr10); + masm.movq(gpr8, gpr2); + masm.movl(gpr2, 32768); + masm.jmp(bb5); + /* Same borrow-chain subtraction as in bb4, with a different constant in gpr2; continues at bb6. */ + masm.bind(bb12); + masm.shrl(gpr8); + masm.movq(gpr2, 0x100000000L); + masm.shrq(gpr2); + masm.movl(gpr3, 0); + masm.movl(gpr10, 0); + masm.subq(gpr3, gpr7); + masm.sbbq(gpr10, gpr9); + masm.sbbq(gpr2, gpr8); + masm.movq(gpr7, gpr3); + masm.movq(gpr9, gpr10); + masm.movq(gpr8, gpr2); + masm.movl(gpr2, 32768); + masm.shrq(gpr6, 3); + masm.addl(gpr6, 536870912); + masm.jmp(bb6); + /* Common exit: every path arrives here with the result in dest. */ + masm.bind(bb13); + } + + /* + * Copyright (c) 2014, 2016, Intel Corporation. All rights reserved. Intel Math Library (LIBM) + * Source Code + * + * ALGORITHM DESCRIPTION - TAN() --------------------- + * + * Polynomial coefficients and other constants. + * + * Note that in this algorithm, there is a different polynomial for each breakpoint, so there + * are 32 sets of polynomial coefficients as well as 32 instances of the other constants. + * + * The polynomial coefficients and constants are offset from the start of the main block as + * follows: + * + * 0: c8 | c0; 16: c9 | c1; 32: c10 | c2; 48: c11 | c3; 64: c12 | c4; 80: c13 | c5; 96: c14 | c6; 112: + * c15 | c7; 128: T_hi; 136: T_lo; 144: Sigma; 152: T_hl; 160: Tau; 168: Mask; 176: (end of block) + * + * The total table size is therefore 5632 bytes. + * + * Note that c0 and c1 are always zero. We could try storing other constants here, and just + * loading the low part of the SIMD register in these cases, after ensuring the high part is + * zero. + * + * The higher terms of the polynomial are computed in the *low* part of the SIMD register. This + * is so we can overlap the multiplication by r^8 and the unpacking of the other part. 
+ * + * The constants are: T_hi + T_lo = accurate constant term in power series Sigma + T_hl = + * accurate coefficient of r in power series (Sigma=1 bit) Tau = multiplier for the reciprocal, + * always -1 or 0 + * + * The basic reconstruction formula using these constants is: + * + * High = tau * recip_hi + t_hi Med = (sgn * r + t_hl * r)_hi Low = (sgn * r + t_hl * r)_lo + + * tau * recip_lo + T_lo + (T_hl + sigma) * c + pol + * + * where pol = c0 + c1 * r + c2 * r^2 + ... + c15 * r^15 + * + * (c0 = c1 = 0, but using them keeps SIMD regularity) + * + * We then do a compensated sum High + Med, add the low parts together and then do the final + * sum. + * + * Here recip_hi + recip_lo is an accurate reciprocal of the remainder modulo pi/2 + * + * Special cases: tan(NaN) = quiet NaN, and raise invalid exception tan(INF) = NaN and raise + * invalid exception tan(+/-0) = +/-0 + * + */ + + private static int[] oneHalfTan = { + 0x00000000, 0x3fe00000, 0x00000000, 0x3fe00000 + }; + + private static int[] mulSixteen = { + 0x00000000, 0x40300000, 0x00000000, 0x3ff00000 + }; + + private static int[] signMaskTan = { + 0x00000000, 0x80000000, 0x00000000, 0x80000000 + }; + + private static int[] piThirtyTwoInvTan = { + 0x6dc9c883, 0x3fe45f30, 0x6dc9c883, 0x40245f30 + }; + + private static int[] pOneTan = { + 0x54444000, 0x3fb921fb, 0x54440000, 0x3fb921fb + }; + + private static int[] pTwoTan = { + 0x67674000, 0xbd32e7b9, 0x4c4c0000, 0x3d468c23 + }; + + private static int[] pThreeTan = { + 0x3707344a, 0x3aa8a2e0, 0x03707345, 0x3ae98a2e + }; + + private static int[] cTableTan = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x882c10fa, + 0x3f9664f4, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x55e6c23d, 0x3f8226e3, 0x55555555, + 0x3fd55555, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x0e157de0, 0x3f6d6d3d, 0x11111111, 0x3fc11111, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x452b75e3, 0x3f57da36, + 0x1ba1ba1c, 0x3faba1ba, 
0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x3ff00000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x4e435f9b, + 0x3f953f83, 0x00000000, 0x00000000, 0x3c6e8e46, 0x3f9b74ea, + 0x00000000, 0x00000000, 0xda5b7511, 0x3f85ad63, 0xdc230b9b, + 0x3fb97558, 0x26cb3788, 0x3f881308, 0x76fc4985, 0x3fd62ac9, + 0x77bb08ba, 0x3f757c85, 0xb6247521, 0x3fb1381e, 0x5922170c, + 0x3f754e95, 0x8746482d, 0x3fc27f83, 0x11055b30, 0x3f64e391, + 0x3e666320, 0x3fa3e609, 0x0de9dae3, 0x3f6301df, 0x1f1dca06, + 0x3fafa8ae, 0x8c5b2da2, 0x3fb936bb, 0x4e88f7a5, 0x3c587d05, + 0x00000000, 0x3ff00000, 0xa8935dd9, 0x3f83dde2, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x5a279ea3, 0x3faa3407, + 0x00000000, 0x00000000, 0x432d65fa, 0x3fa70153, 0x00000000, + 0x00000000, 0x891a4602, 0x3f9d03ef, 0xd62ca5f8, 0x3fca77d9, + 0xb35f4628, 0x3f97a265, 0x433258fa, 0x3fd8cf51, 0xb58fd909, + 0x3f8f88e3, 0x01771cea, 0x3fc2b154, 0xf3562f8e, 0x3f888f57, + 0xc028a723, 0x3fc7370f, 0x20b7f9f0, 0x3f80f44c, 0x214368e9, + 0x3fb6dfaa, 0x28891863, 0x3f79b4b6, 0x172dbbf0, 0x3fb6cb8e, + 0xe0553158, 0x3fc975f5, 0x593fe814, 0x3c2ef5d3, 0x00000000, + 0x3ff00000, 0x03dec550, 0x3fa44203, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x9314533e, 0x3fbb8ec5, 0x00000000, + 0x00000000, 0x09aa36d0, 0x3fb6d3f4, 0x00000000, 0x00000000, + 0xdcb427fd, 0x3fb13950, 0xd87ab0bb, 0x3fd5335e, 0xce0ae8a5, + 0x3fabb382, 0x79143126, 0x3fddba41, 0x5f2b28d4, 0x3fa552f1, + 0x59f21a6d, 0x3fd015ab, 0x22c27d95, 0x3fa0e984, 0xe19fc6aa, + 0x3fd0576c, 0x8f2c2950, 0x3f9a4898, 0xc0b3f22c, 0x3fc59462, + 0x1883a4b8, 0x3f94b61c, 0x3f838640, 0x3fc30eb8, 0x355c63dc, + 0x3fd36a08, 0x1dce993d, 0xbc6d704d, 0x00000000, 0x3ff00000, + 0x2b82ab63, 0x3fb78e92, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x56f37042, 0x3fccfc56, 0x00000000, 0x00000000, + 0xaa563951, 0x3fc90125, 0x00000000, 0x00000000, 0x3d0e7c5d, + 0x3fc50533, 0x9bed9b2e, 0x3fdf0ed9, 0x5fe7c47c, 0x3fc1f250, + 0x96c125e5, 0x3fe2edd9, 0x5a02bbd8, 
0x3fbe5c71, 0x86362c20, + 0x3fda08b7, 0x4b4435ed, 0x3fb9d342, 0x4b494091, 0x3fd911bd, + 0xb56658be, 0x3fb5e4c7, 0x93a2fd76, 0x3fd3c092, 0xda271794, + 0x3fb29910, 0x3303df2b, 0x3fd189be, 0x99fcef32, 0x3fda8279, + 0xb68c1467, 0x3c708b2f, 0x00000000, 0x3ff00000, 0x980c4337, + 0x3fc5f619, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xcc03e501, 0x3fdff10f, 0x00000000, 0x00000000, 0x44a4e845, + 0x3fddb63b, 0x00000000, 0x00000000, 0x3768ad9f, 0x3fdb72a4, + 0x3dd01cca, 0x3fe5fdb9, 0xa61d2811, 0x3fd972b2, 0x5645ad0b, + 0x3fe977f9, 0xd013b3ab, 0x3fd78ca3, 0xbf0bf914, 0x3fe4f192, + 0x4d53e730, 0x3fd5d060, 0x3f8b9000, 0x3fe49933, 0xe2b82f08, + 0x3fd4322a, 0x5936a835, 0x3fe27ae1, 0xb1c61c9b, 0x3fd2b3fb, + 0xef478605, 0x3fe1659e, 0x190834ec, 0x3fe11ab7, 0xcdb625ea, + 0xbc8e564b, 0x00000000, 0x3ff00000, 0xb07217e3, 0x3fd248f1, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x2b2c49d0, + 0x3ff2de9c, 0x00000000, 0x00000000, 0x2655bc98, 0x3ff33e58, + 0x00000000, 0x00000000, 0xff691fa2, 0x3ff3972e, 0xe93463bd, + 0x3feeed87, 0x070e10a0, 0x3ff3f5b2, 0xf4d790a4, 0x3ff20c10, + 0xa04e8ea3, 0x3ff4541a, 0x386accd3, 0x3ff1369e, 0x222a66dd, + 0x3ff4b521, 0x22a9777e, 0x3ff20817, 0x52a04a6e, 0x3ff5178f, + 0xddaa0031, 0x3ff22137, 0x4447d47c, 0x3ff57c01, 0x1e9c7f1d, + 0x3ff29311, 0x2ab7f990, 0x3fe561b8, 0x209c7df1, 0x3c87a8c5, + 0x00000000, 0x3ff00000, 0x4170bcc6, 0x3fdc92d8, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xc7ab4d5a, 0x40085e24, + 0x00000000, 0x00000000, 0xe93ea75d, 0x400b963d, 0x00000000, + 0x00000000, 0x94a7f25a, 0x400f37e2, 0x4b6261cb, 0x3ff5f984, + 0x5a9dd812, 0x4011aab0, 0x74c30018, 0x3ffaf5a5, 0x7f2ce8e3, + 0x4013fe8b, 0xfe8e54fa, 0x3ffd7334, 0x670d618d, 0x4016a10c, + 0x4db97058, 0x4000e012, 0x24df44dd, 0x40199c5f, 0x697d6ece, + 0x4003006e, 0x83298b82, 0x401cfc4d, 0x19d490d6, 0x40058c19, + 0x2ae42850, 0x3fea4300, 0x118e20e6, 0xbc7a6db8, 0x00000000, + 0x40000000, 0xe33345b8, 0xbfd4e526, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x65965966, 0x40219659, 
0x00000000, + 0x00000000, 0x882c10fa, 0x402664f4, 0x00000000, 0x00000000, + 0x83cd3723, 0x402c8342, 0x00000000, 0x40000000, 0x55e6c23d, + 0x403226e3, 0x55555555, 0x40055555, 0x34451939, 0x40371c96, + 0xaaaaaaab, 0x400aaaaa, 0x0e157de0, 0x403d6d3d, 0x11111111, + 0x40111111, 0xa738201f, 0x4042bbce, 0x05b05b06, 0x4015b05b, + 0x452b75e3, 0x4047da36, 0x1ba1ba1c, 0x401ba1ba, 0x00000000, + 0x3ff00000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x4f48b8d3, 0xbf33eaf9, 0x00000000, 0x00000000, + 0x0cf7586f, 0x3f20b8ea, 0x00000000, 0x00000000, 0xd0258911, + 0xbf0abaf3, 0x23e49fe9, 0xbfab5a8c, 0x2d53222e, 0x3ef60d15, + 0x21169451, 0x3fa172b2, 0xbb254dbc, 0xbee1d3b5, 0xdbf93b8e, + 0xbf84c7db, 0x05b4630b, 0x3ecd3364, 0xee9aada7, 0x3f743924, + 0x794a8297, 0xbeb7b7b9, 0xe015f797, 0xbf5d41f5, 0xe41a4a56, + 0x3ea35dfb, 0xe4c2a251, 0x3f49a2ab, 0x5af9e000, 0xbfce49ce, + 0x8c743719, 0x3d1eb860, 0x00000000, 0x00000000, 0x1b4863cf, + 0x3fd78294, 0x00000000, 0x3ff00000, 0x00000000, 0xfffffff8, + 0x535ad890, 0xbf2b9320, 0x00000000, 0x00000000, 0x018fdf1f, + 0x3f16d61d, 0x00000000, 0x00000000, 0x0359f1be, 0xbf0139e4, + 0xa4317c6d, 0xbfa67e17, 0x82672d0f, 0x3eebb405, 0x2f1b621e, + 0x3f9f455b, 0x51ccf238, 0xbed55317, 0xf437b9ac, 0xbf804bee, + 0xc791a2b5, 0x3ec0e993, 0x919a1db2, 0x3f7080c2, 0x336a5b0e, + 0xbeaa48a2, 0x0a268358, 0xbf55a443, 0xdfd978e4, 0x3e94b61f, + 0xd7767a58, 0x3f431806, 0x2aea0000, 0xbfc9bbe8, 0x7723ea61, + 0xbd3a2369, 0x00000000, 0x00000000, 0xdf7796ff, 0x3fd6e642, + 0x00000000, 0x3ff00000, 0x00000000, 0xfffffff8, 0xb9ff07ce, + 0xbf231c78, 0x00000000, 0x00000000, 0xa5517182, 0x3f0ff0e0, + 0x00000000, 0x00000000, 0x790b4cbc, 0xbef66191, 0x848a46c6, + 0xbfa21ac0, 0xb16435fa, 0x3ee1d3ec, 0x2a1aa832, 0x3f9c71ea, + 0xfdd299ef, 0xbec9dd1a, 0x3f8dbaaf, 0xbf793363, 0x309fc6ea, + 0x3eb415d6, 0xbee60471, 0x3f6b83ba, 0x94a0a697, 0xbe9dae11, + 0x3e5c67b3, 0xbf4fd07b, 0x9a8f3e3e, 0x3e86bd75, 0xa4beb7a4, + 
0x3f3d1eb1, 0x29cfc000, 0xbfc549ce, 0xbf159358, 0xbd397b33, + 0x00000000, 0x00000000, 0x871fee6c, 0x3fd666f0, 0x00000000, + 0x3ff00000, 0x00000000, 0xfffffff8, 0x7d98a556, 0xbf1a3958, + 0x00000000, 0x00000000, 0x9d88dc01, 0x3f0704c2, 0x00000000, + 0x00000000, 0x73742a2b, 0xbeed054a, 0x58844587, 0xbf9c2a13, + 0x55688a79, 0x3ed7a326, 0xee33f1d6, 0x3f9a48f4, 0xa8dc9888, + 0xbebf8939, 0xaad4b5b8, 0xbf72f746, 0x9102efa1, 0x3ea88f82, + 0xdabc29cf, 0x3f678228, 0x9289afb8, 0xbe90f456, 0x741fb4ed, + 0xbf46f3a3, 0xa97f6663, 0x3e79b4bf, 0xca89ff3f, 0x3f36db70, + 0xa8a2a000, 0xbfc0ee13, 0x3da24be1, 0xbd338b9f, 0x00000000, + 0x00000000, 0x11cd6c69, 0x3fd601fd, 0x00000000, 0x3ff00000, + 0x00000000, 0xfffffff8, 0x1a154b97, 0xbf116b01, 0x00000000, + 0x00000000, 0x2d427630, 0x3f0147bf, 0x00000000, 0x00000000, + 0xb93820c8, 0xbee264d4, 0xbb6cbb18, 0xbf94ab8c, 0x888d4d92, + 0x3ed0568b, 0x60730f7c, 0x3f98b19b, 0xe4b1fb11, 0xbeb2f950, + 0x22cf9f74, 0xbf6b21cd, 0x4a3ff0a6, 0x3e9f499e, 0xfd2b83ce, + 0x3f64aad7, 0x637b73af, 0xbe83487c, 0xe522591a, 0xbf3fc092, + 0xa158e8bc, 0x3e6e3aae, 0xe5e82ffa, 0x3f329d2f, 0xd636a000, + 0xbfb9477f, 0xc2c2d2bc, 0xbd135ef9, 0x00000000, 0x00000000, + 0xf2fdb123, 0x3fd5b566, 0x00000000, 0x3ff00000, 0x00000000, + 0xfffffff8, 0xc41acb64, 0xbf05448d, 0x00000000, 0x00000000, + 0xdbb03d6f, 0x3efb7ad2, 0x00000000, 0x00000000, 0x9e42962d, + 0xbed5aea5, 0x2579f8ef, 0xbf8b2398, 0x288a1ed9, 0x3ec81441, + 0xb0198dc5, 0x3f979a3a, 0x2fdfe253, 0xbea57cd3, 0x5766336f, + 0xbf617caa, 0x600944c3, 0x3e954ed6, 0xa4e0aaf8, 0x3f62c646, + 0x6b8fb29c, 0xbe74e3a3, 0xdc4c0409, 0xbf33f952, 0x9bffe365, + 0x3e6301ec, 0xb8869e44, 0x3f2fc566, 0xe1e04000, 0xbfb0cc62, + 0x016b907f, 0xbd119cbc, 0x00000000, 0x00000000, 0xe6b9d8fa, + 0x3fd57fb3, 0x00000000, 0x3ff00000, 0x00000000, 0xfffffff8, + 0x5daf22a6, 0xbef429d7, 0x00000000, 0x00000000, 0x06bca545, + 0x3ef7a27d, 0x00000000, 0x00000000, 0x7211c19a, 0xbec41c3e, + 0x956ed53e, 0xbf7ae3f4, 0xee750e72, 0x3ec3901b, 0x91d443f5, + 0x3f96f713, 
0x36661e6c, 0xbe936e09, 0x506f9381, 0xbf5122e8, + 0xcb6dd43f, 0x3e9041b9, 0x6698b2ff, 0x3f61b0c7, 0x576bf12b, + 0xbe625a8a, 0xe5a0e9dc, 0xbf23499d, 0x110384dd, 0x3e5b1c2c, + 0x68d43db6, 0x3f2cb899, 0x6ecac000, 0xbfa0c414, 0xcd7dd58c, + 0x3d13500f, 0x00000000, 0x00000000, 0x85a2c8fb, 0x3fd55fe0, + 0x00000000, 0x3ff00000, 0x00000000, 0xfffffff8, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x2bf70ebe, 0x3ef66a8f, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0xd644267f, 0x3ec22805, 0x16c16c17, 0x3f96c16c, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0xc4e09162, + 0x3e8d6db2, 0xbc011567, 0x3f61566a, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x1f79955c, 0x3e57da4e, 0x9334ef0b, + 0x3f2bbd77, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x55555555, 0x3fd55555, 0x00000000, + 0x3ff00000, 0x00000000, 0xfffffff8, 0x5daf22a6, 0x3ef429d7, + 0x00000000, 0x00000000, 0x06bca545, 0x3ef7a27d, 0x00000000, + 0x00000000, 0x7211c19a, 0x3ec41c3e, 0x956ed53e, 0x3f7ae3f4, + 0xee750e72, 0x3ec3901b, 0x91d443f5, 0x3f96f713, 0x36661e6c, + 0x3e936e09, 0x506f9381, 0x3f5122e8, 0xcb6dd43f, 0x3e9041b9, + 0x6698b2ff, 0x3f61b0c7, 0x576bf12b, 0x3e625a8a, 0xe5a0e9dc, + 0x3f23499d, 0x110384dd, 0x3e5b1c2c, 0x68d43db6, 0x3f2cb899, + 0x6ecac000, 0x3fa0c414, 0xcd7dd58c, 0xbd13500f, 0x00000000, + 0x00000000, 0x85a2c8fb, 0x3fd55fe0, 0x00000000, 0x3ff00000, + 0x00000000, 0xfffffff8, 0xc41acb64, 0x3f05448d, 0x00000000, + 0x00000000, 0xdbb03d6f, 0x3efb7ad2, 0x00000000, 0x00000000, + 0x9e42962d, 0x3ed5aea5, 0x2579f8ef, 0x3f8b2398, 0x288a1ed9, + 0x3ec81441, 0xb0198dc5, 0x3f979a3a, 0x2fdfe253, 0x3ea57cd3, + 0x5766336f, 0x3f617caa, 0x600944c3, 0x3e954ed6, 0xa4e0aaf8, + 0x3f62c646, 0x6b8fb29c, 0x3e74e3a3, 0xdc4c0409, 0x3f33f952, + 0x9bffe365, 0x3e6301ec, 0xb8869e44, 0x3f2fc566, 0xe1e04000, + 0x3fb0cc62, 0x016b907f, 0x3d119cbc, 0x00000000, 0x00000000, + 0xe6b9d8fa, 0x3fd57fb3, 0x00000000, 0x3ff00000, 0x00000000, + 0xfffffff8, 0x1a154b97, 
0x3f116b01, 0x00000000, 0x00000000, + 0x2d427630, 0x3f0147bf, 0x00000000, 0x00000000, 0xb93820c8, + 0x3ee264d4, 0xbb6cbb18, 0x3f94ab8c, 0x888d4d92, 0x3ed0568b, + 0x60730f7c, 0x3f98b19b, 0xe4b1fb11, 0x3eb2f950, 0x22cf9f74, + 0x3f6b21cd, 0x4a3ff0a6, 0x3e9f499e, 0xfd2b83ce, 0x3f64aad7, + 0x637b73af, 0x3e83487c, 0xe522591a, 0x3f3fc092, 0xa158e8bc, + 0x3e6e3aae, 0xe5e82ffa, 0x3f329d2f, 0xd636a000, 0x3fb9477f, + 0xc2c2d2bc, 0x3d135ef9, 0x00000000, 0x00000000, 0xf2fdb123, + 0x3fd5b566, 0x00000000, 0x3ff00000, 0x00000000, 0xfffffff8, + 0x7d98a556, 0x3f1a3958, 0x00000000, 0x00000000, 0x9d88dc01, + 0x3f0704c2, 0x00000000, 0x00000000, 0x73742a2b, 0x3eed054a, + 0x58844587, 0x3f9c2a13, 0x55688a79, 0x3ed7a326, 0xee33f1d6, + 0x3f9a48f4, 0xa8dc9888, 0x3ebf8939, 0xaad4b5b8, 0x3f72f746, + 0x9102efa1, 0x3ea88f82, 0xdabc29cf, 0x3f678228, 0x9289afb8, + 0x3e90f456, 0x741fb4ed, 0x3f46f3a3, 0xa97f6663, 0x3e79b4bf, + 0xca89ff3f, 0x3f36db70, 0xa8a2a000, 0x3fc0ee13, 0x3da24be1, + 0x3d338b9f, 0x00000000, 0x00000000, 0x11cd6c69, 0x3fd601fd, + 0x00000000, 0x3ff00000, 0x00000000, 0xfffffff8, 0xb9ff07ce, + 0x3f231c78, 0x00000000, 0x00000000, 0xa5517182, 0x3f0ff0e0, + 0x00000000, 0x00000000, 0x790b4cbc, 0x3ef66191, 0x848a46c6, + 0x3fa21ac0, 0xb16435fa, 0x3ee1d3ec, 0x2a1aa832, 0x3f9c71ea, + 0xfdd299ef, 0x3ec9dd1a, 0x3f8dbaaf, 0x3f793363, 0x309fc6ea, + 0x3eb415d6, 0xbee60471, 0x3f6b83ba, 0x94a0a697, 0x3e9dae11, + 0x3e5c67b3, 0x3f4fd07b, 0x9a8f3e3e, 0x3e86bd75, 0xa4beb7a4, + 0x3f3d1eb1, 0x29cfc000, 0x3fc549ce, 0xbf159358, 0x3d397b33, + 0x00000000, 0x00000000, 0x871fee6c, 0x3fd666f0, 0x00000000, + 0x3ff00000, 0x00000000, 0xfffffff8, 0x535ad890, 0x3f2b9320, + 0x00000000, 0x00000000, 0x018fdf1f, 0x3f16d61d, 0x00000000, + 0x00000000, 0x0359f1be, 0x3f0139e4, 0xa4317c6d, 0x3fa67e17, + 0x82672d0f, 0x3eebb405, 0x2f1b621e, 0x3f9f455b, 0x51ccf238, + 0x3ed55317, 0xf437b9ac, 0x3f804bee, 0xc791a2b5, 0x3ec0e993, + 0x919a1db2, 0x3f7080c2, 0x336a5b0e, 0x3eaa48a2, 0x0a268358, + 0x3f55a443, 0xdfd978e4, 0x3e94b61f, 
0xd7767a58, 0x3f431806, + 0x2aea0000, 0x3fc9bbe8, 0x7723ea61, 0x3d3a2369, 0x00000000, + 0x00000000, 0xdf7796ff, 0x3fd6e642, 0x00000000, 0x3ff00000, + 0x00000000, 0xfffffff8, 0x4f48b8d3, 0x3f33eaf9, 0x00000000, + 0x00000000, 0x0cf7586f, 0x3f20b8ea, 0x00000000, 0x00000000, + 0xd0258911, 0x3f0abaf3, 0x23e49fe9, 0x3fab5a8c, 0x2d53222e, + 0x3ef60d15, 0x21169451, 0x3fa172b2, 0xbb254dbc, 0x3ee1d3b5, + 0xdbf93b8e, 0x3f84c7db, 0x05b4630b, 0x3ecd3364, 0xee9aada7, + 0x3f743924, 0x794a8297, 0x3eb7b7b9, 0xe015f797, 0x3f5d41f5, + 0xe41a4a56, 0x3ea35dfb, 0xe4c2a251, 0x3f49a2ab, 0x5af9e000, + 0x3fce49ce, 0x8c743719, 0xbd1eb860, 0x00000000, 0x00000000, + 0x1b4863cf, 0x3fd78294, 0x00000000, 0x3ff00000, 0x00000000, + 0xfffffff8, 0x65965966, 0xc0219659, 0x00000000, 0x00000000, + 0x882c10fa, 0x402664f4, 0x00000000, 0x00000000, 0x83cd3723, + 0xc02c8342, 0x00000000, 0xc0000000, 0x55e6c23d, 0x403226e3, + 0x55555555, 0x40055555, 0x34451939, 0xc0371c96, 0xaaaaaaab, + 0xc00aaaaa, 0x0e157de0, 0x403d6d3d, 0x11111111, 0x40111111, + 0xa738201f, 0xc042bbce, 0x05b05b06, 0xc015b05b, 0x452b75e3, + 0x4047da36, 0x1ba1ba1c, 0x401ba1ba, 0x00000000, 0xbff00000, + 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0xc7ab4d5a, 0xc0085e24, 0x00000000, 0x00000000, 0xe93ea75d, + 0x400b963d, 0x00000000, 0x00000000, 0x94a7f25a, 0xc00f37e2, + 0x4b6261cb, 0xbff5f984, 0x5a9dd812, 0x4011aab0, 0x74c30018, + 0x3ffaf5a5, 0x7f2ce8e3, 0xc013fe8b, 0xfe8e54fa, 0xbffd7334, + 0x670d618d, 0x4016a10c, 0x4db97058, 0x4000e012, 0x24df44dd, + 0xc0199c5f, 0x697d6ece, 0xc003006e, 0x83298b82, 0x401cfc4d, + 0x19d490d6, 0x40058c19, 0x2ae42850, 0xbfea4300, 0x118e20e6, + 0x3c7a6db8, 0x00000000, 0x40000000, 0xe33345b8, 0xbfd4e526, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x2b2c49d0, + 0xbff2de9c, 0x00000000, 0x00000000, 0x2655bc98, 0x3ff33e58, + 0x00000000, 0x00000000, 0xff691fa2, 0xbff3972e, 0xe93463bd, + 0xbfeeed87, 0x070e10a0, 0x3ff3f5b2, 0xf4d790a4, 
0x3ff20c10, + 0xa04e8ea3, 0xbff4541a, 0x386accd3, 0xbff1369e, 0x222a66dd, + 0x3ff4b521, 0x22a9777e, 0x3ff20817, 0x52a04a6e, 0xbff5178f, + 0xddaa0031, 0xbff22137, 0x4447d47c, 0x3ff57c01, 0x1e9c7f1d, + 0x3ff29311, 0x2ab7f990, 0xbfe561b8, 0x209c7df1, 0xbc87a8c5, + 0x00000000, 0x3ff00000, 0x4170bcc6, 0x3fdc92d8, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0xcc03e501, 0xbfdff10f, + 0x00000000, 0x00000000, 0x44a4e845, 0x3fddb63b, 0x00000000, + 0x00000000, 0x3768ad9f, 0xbfdb72a4, 0x3dd01cca, 0xbfe5fdb9, + 0xa61d2811, 0x3fd972b2, 0x5645ad0b, 0x3fe977f9, 0xd013b3ab, + 0xbfd78ca3, 0xbf0bf914, 0xbfe4f192, 0x4d53e730, 0x3fd5d060, + 0x3f8b9000, 0x3fe49933, 0xe2b82f08, 0xbfd4322a, 0x5936a835, + 0xbfe27ae1, 0xb1c61c9b, 0x3fd2b3fb, 0xef478605, 0x3fe1659e, + 0x190834ec, 0xbfe11ab7, 0xcdb625ea, 0x3c8e564b, 0x00000000, + 0x3ff00000, 0xb07217e3, 0x3fd248f1, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x56f37042, 0xbfccfc56, 0x00000000, + 0x00000000, 0xaa563951, 0x3fc90125, 0x00000000, 0x00000000, + 0x3d0e7c5d, 0xbfc50533, 0x9bed9b2e, 0xbfdf0ed9, 0x5fe7c47c, + 0x3fc1f250, 0x96c125e5, 0x3fe2edd9, 0x5a02bbd8, 0xbfbe5c71, + 0x86362c20, 0xbfda08b7, 0x4b4435ed, 0x3fb9d342, 0x4b494091, + 0x3fd911bd, 0xb56658be, 0xbfb5e4c7, 0x93a2fd76, 0xbfd3c092, + 0xda271794, 0x3fb29910, 0x3303df2b, 0x3fd189be, 0x99fcef32, + 0xbfda8279, 0xb68c1467, 0xbc708b2f, 0x00000000, 0x3ff00000, + 0x980c4337, 0x3fc5f619, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x9314533e, 0xbfbb8ec5, 0x00000000, 0x00000000, + 0x09aa36d0, 0x3fb6d3f4, 0x00000000, 0x00000000, 0xdcb427fd, + 0xbfb13950, 0xd87ab0bb, 0xbfd5335e, 0xce0ae8a5, 0x3fabb382, + 0x79143126, 0x3fddba41, 0x5f2b28d4, 0xbfa552f1, 0x59f21a6d, + 0xbfd015ab, 0x22c27d95, 0x3fa0e984, 0xe19fc6aa, 0x3fd0576c, + 0x8f2c2950, 0xbf9a4898, 0xc0b3f22c, 0xbfc59462, 0x1883a4b8, + 0x3f94b61c, 0x3f838640, 0x3fc30eb8, 0x355c63dc, 0xbfd36a08, + 0x1dce993d, 0x3c6d704d, 0x00000000, 0x3ff00000, 0x2b82ab63, + 0x3fb78e92, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 
0x5a279ea3, 0xbfaa3407, 0x00000000, 0x00000000, 0x432d65fa, + 0x3fa70153, 0x00000000, 0x00000000, 0x891a4602, 0xbf9d03ef, + 0xd62ca5f8, 0xbfca77d9, 0xb35f4628, 0x3f97a265, 0x433258fa, + 0x3fd8cf51, 0xb58fd909, 0xbf8f88e3, 0x01771cea, 0xbfc2b154, + 0xf3562f8e, 0x3f888f57, 0xc028a723, 0x3fc7370f, 0x20b7f9f0, + 0xbf80f44c, 0x214368e9, 0xbfb6dfaa, 0x28891863, 0x3f79b4b6, + 0x172dbbf0, 0x3fb6cb8e, 0xe0553158, 0xbfc975f5, 0x593fe814, + 0xbc2ef5d3, 0x00000000, 0x3ff00000, 0x03dec550, 0x3fa44203, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x4e435f9b, + 0xbf953f83, 0x00000000, 0x00000000, 0x3c6e8e46, 0x3f9b74ea, + 0x00000000, 0x00000000, 0xda5b7511, 0xbf85ad63, 0xdc230b9b, + 0xbfb97558, 0x26cb3788, 0x3f881308, 0x76fc4985, 0x3fd62ac9, + 0x77bb08ba, 0xbf757c85, 0xb6247521, 0xbfb1381e, 0x5922170c, + 0x3f754e95, 0x8746482d, 0x3fc27f83, 0x11055b30, 0xbf64e391, + 0x3e666320, 0xbfa3e609, 0x0de9dae3, 0x3f6301df, 0x1f1dca06, + 0x3fafa8ae, 0x8c5b2da2, 0xbfb936bb, 0x4e88f7a5, 0xbc587d05, + 0x00000000, 0x3ff00000, 0xa8935dd9, 0x3f83dde2, 0x00000000, + 0x00000000, 0x00000000, 0x00000000 + }; + + private static int[] maskThirtyFiveTan = { + 0xfffc0000, 0xffffffff, 0x00000000, 0x00000000 + }; + + private static int[] qElevenTan = { + 0xb8fe4d77, 0x3f82609a + }; + + private static int[] qNineTan = { + 0xbf847a43, 0x3f9664a0 + }; + + private static int[] qSevenTan = { + 0x52c4c8ab, 0x3faba1ba + }; + + private static int[] qFiveTan = { + 0x11092746, 0x3fc11111 + }; + + private static int[] qThreeTan = { + 0x55555612, 0x3fd55555 + }; + + private static int[] piInvTableTan = { + 0x00000000, 0x00000000, 0xa2f9836e, 0x4e441529, 0xfc2757d1, + 0xf534ddc0, 0xdb629599, 0x3c439041, 0xfe5163ab, 0xdebbc561, + 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c, 0xfe1deb1c, + 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41, + 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, + 0xde05980f, 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, + 0x4f463f66, 0x9e5fea2d, 0x7527bac7, 
0xebe5f17b, 0x3d0739f7, + 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08, 0x56033046, 0xfc7b6bab, + 0xf0cfbc21 + }; + + /* Four ints read by tanIntrinsic as two 8-byte values at offsets 0 and 8 (see the bb12 section). */ private static int[] piFourTan = { + 0x00000000, 0x3fe921fb, 0x4611a626, 0x3e85110b + }; + + private static int[] qqTwoTan = { + 0x676733af, 0x3d32e7b9 + }; + + /* Read as a little-endian 64-bit pattern (ints in array order) this is the double 2^55. */ private static int[] twoPowFiftyFiveTan = { + 0x00000000, 0x43600000 + }; + + /* Read as a little-endian 64-bit pattern (ints in array order) this is the double 2^-55. */ private static int[] twoPowMFiftyFiveTan = { + 0x00000000, 0x3c800000 + }; + + /** Emits the AMD64 instruction sequence of the tan intrinsic into {@code masm}. The argument is first copied from {@code value} to {@code dest} when the two registers differ; {@code dest} is then used as the working register and every path ends with a jump to the common exit label bb15, so the value in {@code dest} at that label is presumably the result. Constant tables are wrapped in ArrayDataPointerConstant objects and addressed through externalAddress; scratch registers come from this instruction's gpr/xmm temp operands. The paths are: a main table-driven path (cTableTan), bb0 for arguments below the main range (polynomial, scaling via 2^55, or exponent-zero handling), and bb1 for arguments above it (integer multiplication against piInvTableTan words followed by the same table-driven evaluation). @param dest XMM register that receives the argument copy and is updated in place @param value XMM register holding the argument @param crb compilation result builder, installed with setCrb before any instruction is emitted @param masm assembler receiving the instructions */ public void tanIntrinsic(Register dest, Register value, CompilationResultBuilder crb, AMD64MacroAssembler masm) { + ArrayDataPointerConstant oneHalfTanPtr = new ArrayDataPointerConstant(oneHalfTan, 16); + ArrayDataPointerConstant mulSixteenPtr = new ArrayDataPointerConstant(mulSixteen, 16); + ArrayDataPointerConstant signMaskTanPtr = new ArrayDataPointerConstant(signMaskTan, 16); + ArrayDataPointerConstant piThirtyTwoInvTanPtr = new ArrayDataPointerConstant(piThirtyTwoInvTan, 16); + ArrayDataPointerConstant pOneTanPtr = new ArrayDataPointerConstant(pOneTan, 16); + ArrayDataPointerConstant pTwoTanPtr = new ArrayDataPointerConstant(pTwoTan, 16); + ArrayDataPointerConstant pThreeTanPtr = new ArrayDataPointerConstant(pThreeTan, 16); + ArrayDataPointerConstant cTableTanPtr = new ArrayDataPointerConstant(cTableTan, 16); + ArrayDataPointerConstant maskThirtyFiveTanPtr = new ArrayDataPointerConstant(maskThirtyFiveTan, 16); + ArrayDataPointerConstant qElevenTanPtr = new ArrayDataPointerConstant(qElevenTan, 16); + ArrayDataPointerConstant qNineTanPtr = new ArrayDataPointerConstant(qNineTan, 16); + ArrayDataPointerConstant qSevenTanPtr = new ArrayDataPointerConstant(qSevenTan, 8); + ArrayDataPointerConstant qFiveTanPtr = new ArrayDataPointerConstant(qFiveTan, 16); + ArrayDataPointerConstant qThreeTanPtr = new ArrayDataPointerConstant(qThreeTan, 16); + ArrayDataPointerConstant piInvTableTanPtr = new ArrayDataPointerConstant(piInvTableTan, 16); + ArrayDataPointerConstant piFourTanPtr = new ArrayDataPointerConstant(piFourTan, 8); + ArrayDataPointerConstant
qqTwoTanPtr = new ArrayDataPointerConstant(qqTwoTan, 8); + ArrayDataPointerConstant onePtr = new ArrayDataPointerConstant(one, 8); + ArrayDataPointerConstant twoPowFiftyFiveTanPtr = new ArrayDataPointerConstant(twoPowFiftyFiveTan, 8); + ArrayDataPointerConstant twoPowMFiftyFiveTanPtr = new ArrayDataPointerConstant(twoPowMFiftyFiveTan, 8); + + /* Labels keep the bbN numbering used throughout the method; bb15 is the common exit. */ Label bb0 = new Label(); + Label bb1 = new Label(); + Label bb2 = new Label(); + Label bb3 = new Label(); + Label bb5 = new Label(); + Label bb6 = new Label(); + Label bb8 = new Label(); + Label bb9 = new Label(); + Label bb10 = new Label(); + Label bb11 = new Label(); + Label bb12 = new Label(); + Label bb13 = new Label(); + Label bb14 = new Label(); + Label bb15 = new Label(); + + Register gpr1 = asRegister(gpr1Temp, AMD64Kind.QWORD); + Register gpr2 = asRegister(gpr2Temp, AMD64Kind.QWORD); + /* NOTE(review): gpr3 is the only GPR bound to a fixed temp (rcxTemp). The one-operand shifts used below (shlq/shrq/shll/shrl with no count) presumably take their count from rcx, which would explain this binding - confirm against AMD64Assembler. */ Register gpr3 = asRegister(rcxTemp, AMD64Kind.QWORD); + Register gpr4 = asRegister(gpr4Temp, AMD64Kind.QWORD); + Register gpr5 = asRegister(gpr5Temp, AMD64Kind.QWORD); + Register gpr6 = asRegister(gpr6Temp, AMD64Kind.QWORD); + Register gpr7 = asRegister(gpr7Temp, AMD64Kind.QWORD); + Register gpr8 = asRegister(gpr8Temp, AMD64Kind.QWORD); + Register gpr9 = asRegister(gpr9Temp, AMD64Kind.QWORD); + Register gpr10 = asRegister(gpr10Temp, AMD64Kind.QWORD); + + Register temp1 = asRegister(xmm1Temp, AMD64Kind.DOUBLE); + Register temp2 = asRegister(xmm2Temp, AMD64Kind.DOUBLE); + Register temp3 = asRegister(xmm3Temp, AMD64Kind.DOUBLE); + Register temp4 = asRegister(xmm4Temp, AMD64Kind.DOUBLE); + Register temp5 = asRegister(xmm5Temp, AMD64Kind.DOUBLE); + Register temp6 = asRegister(xmm6Temp, AMD64Kind.DOUBLE); + Register temp7 = asRegister(xmm7Temp, AMD64Kind.DOUBLE); + + setCrb(crb); + if (dest.encoding != value.encoding) { + masm.movdqu(dest, value); + } + + /* Range dispatch: take 16-bit word 3 of dest, drop the sign bit (32767 = 0x7fff) and subtract 16314. An unsigned result above 270 leaves the main path for bb0; the flags of this cmpl are reused there. */ masm.pextrw(gpr1, dest, 3); + masm.andl(gpr1, 32767); + masm.subl(gpr1, 16314); + masm.cmpl(gpr1, 270); + masm.jcc(ConditionFlag.Above, bb0); + + /* Main path: table-driven evaluation using cTableTan. */ masm.movdqu(temp5,
externalAddress(oneHalfTanPtr)); // 0x00000000, + // 0x3fe00000, + // 0x00000000, + // 0x3fe00000 + masm.movdqu(temp6, externalAddress(mulSixteenPtr)); // 0x00000000, + // 0x40300000, + // 0x00000000, + // 0x3ff00000 + masm.unpcklpd(dest, dest); + masm.movdqu(temp4, externalAddress(signMaskTanPtr)); // 0x00000000, + // 0x80000000, + // 0x00000000, + // 0x80000000 + masm.andpd(temp4, dest); + masm.movdqu(temp1, externalAddress(piThirtyTwoInvTanPtr)); // 0x6dc9c883, + // 0x3fe45f30, + // 0x6dc9c883, + // 0x40245f30 + masm.mulpd(temp1, dest); + masm.por(temp5, temp4); + masm.addpd(temp1, temp5); + masm.movdqu(temp7, temp1); + masm.unpckhpd(temp7, temp7); + masm.cvttsd2sil(gpr4, temp7); + masm.cvttpd2dq(temp1, temp1); + masm.cvtdq2pd(temp1, temp1); + masm.mulpd(temp1, temp6); + masm.movdqu(temp3, externalAddress(pOneTanPtr)); // 0x54444000, + // 0x3fb921fb, + // 0x54440000, + // 0x3fb921fb + masm.movdq(temp5, externalAddress(qqTwoTanPtr)); // 0x676733af, + // 0x3d32e7b9 + masm.addq(gpr4, 469248); + masm.movdqu(temp4, externalAddress(pTwoTanPtr)); // 0x67674000, + // 0xbd32e7b9, + // 0x4c4c0000, + // 0x3d468c23 + masm.mulpd(temp3, temp1); + masm.andq(gpr4, 31); + masm.mulsd(temp5, temp1); + masm.movq(gpr3, gpr4); + masm.mulpd(temp4, temp1); + masm.shlq(gpr3, 1); + masm.subpd(dest, temp3); + masm.mulpd(temp1, externalAddress(pThreeTanPtr)); // 0x3707344a, + // 0x3aa8a2e0, + // 0x03707345, + // 0x3ae98a2e + masm.addq(gpr4, gpr3); + masm.shlq(gpr3, 2); + masm.addq(gpr4, gpr3); + masm.addsd(temp5, dest); + masm.movdqu(temp2, dest); + masm.subpd(dest, temp4); + masm.movdq(temp6, externalAddress(onePtr)); // 0x00000000, + // 0x3ff00000 + /* Table offset: gpr4 was masked to 0..31, turned into 11 * index by the shift/add steps above (x + 2x + 8x), and is now shifted left by 4, i.e. 176 bytes per index; the loads below use offsets 0..168 from gpr1. */ masm.shlq(gpr4, 4); + masm.leaq(gpr1, externalAddress(cTableTanPtr)); + masm.andpd(temp5, externalAddress(maskThirtyFiveTanPtr)); // 0xfffc0000, + // 0xffffffff, + // 0x00000000, + // 0x00000000 + masm.movdqu(temp3, dest); + masm.addq(gpr1, gpr4); + masm.subpd(temp2, dest); + masm.unpckhpd(dest, dest); + masm.divsd(temp6, temp5); +
masm.subpd(temp2, temp4); + masm.movdqu(temp7, new AMD64Address(gpr1, 16)); + masm.subsd(temp3, temp5); + masm.mulpd(temp7, dest); + masm.subpd(temp2, temp1); + masm.movdqu(temp1, new AMD64Address(gpr1, 48)); + masm.mulpd(temp1, dest); + masm.movdqu(temp4, new AMD64Address(gpr1, 96)); + masm.mulpd(temp4, dest); + masm.addsd(temp2, temp3); + masm.movdqu(temp3, dest); + masm.mulpd(dest, dest); + masm.addpd(temp7, new AMD64Address(gpr1, 0)); + masm.addpd(temp1, new AMD64Address(gpr1, 32)); + masm.mulpd(temp1, dest); + masm.addpd(temp4, new AMD64Address(gpr1, 80)); + masm.addpd(temp7, temp1); + masm.movdqu(temp1, new AMD64Address(gpr1, 112)); + masm.mulpd(temp1, dest); + masm.mulpd(dest, dest); + masm.addpd(temp4, temp1); + masm.movdqu(temp1, new AMD64Address(gpr1, 64)); + masm.mulpd(temp1, dest); + masm.addpd(temp7, temp1); + masm.movdqu(temp1, temp3); + masm.mulpd(temp3, dest); + masm.mulsd(dest, dest); + masm.mulpd(temp1, new AMD64Address(gpr1, 144)); + masm.mulpd(temp4, temp3); + masm.movdqu(temp3, temp1); + masm.addpd(temp7, temp4); + masm.movdqu(temp4, temp1); + masm.mulsd(dest, temp7); + masm.unpckhpd(temp7, temp7); + masm.addsd(dest, temp7); + masm.unpckhpd(temp1, temp1); + masm.addsd(temp3, temp1); + masm.subsd(temp4, temp3); + masm.addsd(temp1, temp4); + masm.movdqu(temp4, temp2); + masm.movdq(temp7, new AMD64Address(gpr1, 144)); + masm.unpckhpd(temp2, temp2); + masm.addsd(temp7, new AMD64Address(gpr1, 152)); + masm.mulsd(temp7, temp2); + masm.addsd(temp7, new AMD64Address(gpr1, 136)); + masm.addsd(temp7, temp1); + masm.addsd(dest, temp7); + masm.movdq(temp7, externalAddress(onePtr)); // 0x00000000, + // 0x3ff00000 + masm.mulsd(temp4, temp6); + masm.movdq(temp2, new AMD64Address(gpr1, 168)); + masm.andpd(temp2, temp6); + masm.mulsd(temp5, temp2); + masm.mulsd(temp6, new AMD64Address(gpr1, 160)); + masm.subsd(temp7, temp5); + masm.subsd(temp2, new AMD64Address(gpr1, 128)); + masm.subsd(temp7, temp4); + masm.mulsd(temp7, temp6); + masm.movdqu(temp4, temp3); +
masm.subsd(temp3, temp2); + masm.addsd(temp2, temp3); + masm.subsd(temp4, temp2); + masm.addsd(dest, temp4); + masm.subsd(dest, temp7); + masm.addsd(dest, temp3); + masm.jmp(bb15); + + /* bb0: the range check failed. The flags of that cmpl are still live: a signed Greater means the masked word was above the main range and goes to bb1; otherwise it was below 16314 and is handled here. */ masm.bind(bb0); + masm.jcc(ConditionFlag.Greater, bb1); + + /* All bits selected by 32752 (0x7ff0) are zero: go to bb2. */ masm.pextrw(gpr1, dest, 3); + masm.movl(gpr4, gpr1); + masm.andl(gpr1, 32752); + masm.jcc(ConditionFlag.Equal, bb2); + + masm.andl(gpr4, 32767); + masm.cmpl(gpr4, 15904); + masm.jcc(ConditionFlag.Below, bb3); + + /* Odd polynomial, with x the value in dest: dest = x + x^3 * (q3 + x^2 * (q5 + x^2 * (q7 + x^2 * (q9 + x^2 * q11)))), the q constants coming from qThreeTan .. qElevenTan. */ masm.movdqu(temp2, dest); + masm.movdqu(temp3, dest); + masm.movdq(temp1, externalAddress(qElevenTanPtr)); // 0xb8fe4d77, + // 0x3f82609a + masm.mulsd(temp2, dest); + masm.mulsd(temp3, temp2); + masm.mulsd(temp1, temp2); + masm.addsd(temp1, externalAddress(qNineTanPtr)); // 0xbf847a43, + // 0x3f9664a0 + masm.mulsd(temp1, temp2); + masm.addsd(temp1, externalAddress(qSevenTanPtr)); // 0x52c4c8ab, + // 0x3faba1ba + masm.mulsd(temp1, temp2); + masm.addsd(temp1, externalAddress(qFiveTanPtr)); // 0x11092746, + // 0x3fc11111 + masm.mulsd(temp1, temp2); + masm.addsd(temp1, externalAddress(qThreeTanPtr)); // 0x55555612, + // 0x3fd55555 + masm.mulsd(temp1, temp3); + masm.addsd(dest, temp1); + masm.jmp(bb15); + + /* bb3: masked word below 15904: dest = (dest + dest * twoPowFiftyFiveTan) * twoPowMFiftyFiveTan. */ masm.bind(bb3); + masm.movdq(temp3, externalAddress(twoPowFiftyFiveTanPtr)); // 0x00000000, + // 0x43600000 + masm.mulsd(temp3, dest); + masm.addsd(dest, temp3); + masm.mulsd(dest, externalAddress(twoPowMFiftyFiveTanPtr)); // 0x00000000, + // 0x3c800000 + masm.jmp(bb15); + + /* bb14: reached from bb1 when the bits selected by 32752 are all ones; zeroes temp1 and dest and divides them (0.0 / 0.0) into dest. */ masm.bind(bb14); + masm.xorpd(temp1, temp1); + masm.xorpd(dest, dest); + masm.divsd(dest, temp1); + masm.jmp(bb15); + + /* bb2: squares a copy of dest in temp1; dest itself is left unchanged. */ masm.bind(bb2); + masm.movdqu(temp1, dest); + masm.mulsd(temp1, temp1); + masm.jmp(bb15); + + /* bb1: arguments above the main range. After the all-ones check, a 4-byte-aligned offset into piInvTableTan is derived from the exponent bits and what looks like a multi-word integer multiplication of the argument bits against consecutive table words follows (32-bit loads at offsets 0..24, imulq partial products, carries folded in with shrq/addq). */ masm.bind(bb1); + masm.pextrw(gpr3, dest, 3); + masm.andl(gpr3, 32752); + masm.cmpl(gpr3, 32752); + masm.jcc(ConditionFlag.Equal, bb14); + + masm.subl(gpr3, 16224); + masm.shrl(gpr3, 7); + masm.andl(gpr3, 65532); + masm.leaq(gpr10, externalAddress(piInvTableTanPtr)); + masm.addq(gpr3, gpr10); + masm.movdq(gpr1, dest); + masm.movl(gpr9,
new AMD64Address(gpr3, 20)); + masm.movl(gpr7, new AMD64Address(gpr3, 24)); + masm.movl(gpr4, gpr1); + masm.shrq(gpr1, 21); + masm.orl(gpr1, Integer.MIN_VALUE); + masm.shrl(gpr1, 11); + masm.movl(gpr8, gpr9); + masm.imulq(gpr9, gpr4); + masm.imulq(gpr8, gpr1); + masm.imulq(gpr7, gpr1); + masm.movl(gpr5, new AMD64Address(gpr3, 16)); + masm.movl(gpr6, new AMD64Address(gpr3, 12)); + masm.movl(gpr10, gpr9); + masm.shrq(gpr9, 32); + masm.addq(gpr8, gpr9); + masm.addq(gpr10, gpr7); + masm.movl(gpr7, gpr10); + masm.shrq(gpr10, 32); + masm.addq(gpr8, gpr10); + masm.movl(gpr9, gpr5); + masm.imulq(gpr5, gpr4); + masm.imulq(gpr9, gpr1); + masm.movl(gpr10, gpr6); + masm.imulq(gpr6, gpr4); + masm.movl(gpr2, gpr5); + masm.shrq(gpr5, 32); + masm.addq(gpr8, gpr2); + masm.movl(gpr2, gpr8); + masm.shrq(gpr8, 32); + masm.addq(gpr9, gpr5); + masm.addq(gpr9, gpr8); + masm.shlq(gpr2, 32); + masm.orq(gpr7, gpr2); + masm.imulq(gpr10, gpr1); + masm.movl(gpr8, new AMD64Address(gpr3, 8)); + masm.movl(gpr5, new AMD64Address(gpr3, 4)); + masm.movl(gpr2, gpr6); + masm.shrq(gpr6, 32); + masm.addq(gpr9, gpr2); + masm.movl(gpr2, gpr9); + masm.shrq(gpr9, 32); + masm.addq(gpr10, gpr6); + masm.addq(gpr10, gpr9); + masm.movq(gpr6, gpr8); + masm.imulq(gpr8, gpr4); + masm.imulq(gpr6, gpr1); + masm.movl(gpr9, gpr8); + masm.shrq(gpr8, 32); + masm.addq(gpr10, gpr9); + masm.movl(gpr9, gpr10); + masm.shrq(gpr10, 32); + masm.addq(gpr6, gpr8); + masm.addq(gpr6, gpr10); + masm.movq(gpr8, gpr5); + masm.imulq(gpr5, gpr4); + masm.imulq(gpr8, gpr1); + masm.shlq(gpr9, 32); + masm.orq(gpr9, gpr2); + masm.movl(gpr1, new AMD64Address(gpr3, 0)); + masm.movl(gpr10, gpr5); + masm.shrq(gpr5, 32); + masm.addq(gpr6, gpr10); + masm.movl(gpr10, gpr6); + masm.shrq(gpr6, 32); + masm.addq(gpr8, gpr5); + masm.addq(gpr8, gpr6); + masm.imulq(gpr4, gpr1); + masm.pextrw(gpr2, dest, 3); + masm.leaq(gpr6, externalAddress(piInvTableTanPtr)); + masm.subq(gpr3, gpr6); + masm.addl(gpr3, gpr3); + masm.addl(gpr3, gpr3); + masm.addl(gpr3,
gpr3); + masm.addl(gpr3, 19); + masm.movl(gpr5, 32768); + masm.andl(gpr5, gpr2); + masm.shrl(gpr2, 4); + masm.andl(gpr2, 2047); + masm.subl(gpr2, 1023); + masm.subl(gpr3, gpr2); + masm.addq(gpr8, gpr4); + masm.movl(gpr4, gpr3); + masm.addl(gpr4, 32); + masm.cmpl(gpr3, 0); + masm.jcc(ConditionFlag.Less, bb5); + + masm.negl(gpr3); + masm.addl(gpr3, 29); + masm.shll(gpr8); + masm.movl(gpr6, gpr8); + masm.andl(gpr8, 1073741823); + masm.testl(gpr8, 536870912); + masm.jcc(ConditionFlag.NotEqual, bb6); + + masm.shrl(gpr8); + masm.movl(gpr2, 0); + masm.shlq(gpr8, 32); + masm.orq(gpr8, gpr10); + + /* bb8: a zero top word (gpr8) is handled in bb9; otherwise fall through to bb10. */ masm.bind(bb8); + masm.cmpq(gpr8, 0); + masm.jcc(ConditionFlag.Equal, bb9); + + /* bb10: gpr3 = 29 - (index of the highest set bit of gpr8). A positive count is handled here with left shifts across gpr8/gpr9/gpr7; zero or negative counts go to bb11. */ masm.bind(bb10); + masm.bsrq(gpr10, gpr8); + masm.movl(gpr3, 29); + masm.subl(gpr3, gpr10); + masm.jcc(ConditionFlag.LessEqual, bb11); + + masm.shlq(gpr8); + masm.movq(gpr1, gpr9); + masm.shlq(gpr9); + masm.addl(gpr4, gpr3); + masm.negl(gpr3); + masm.addl(gpr3, 64); + masm.shrq(gpr1); + masm.shrq(gpr7); + masm.orq(gpr8, gpr1); + masm.orq(gpr9, gpr7); + + /* bb12: converts the integer words gpr8 and gpr9 (the latter shifted right by one) to doubles in dest/temp3, builds two scale factors by inserting exponent words derived from gpr4 into temp4/temp5, and multiplies by the two piFourTan values. The rest of the section mirrors the main path's table-driven evaluation, with the low-order term in temp7 added into temp2. */ masm.bind(bb12); + masm.cvtsi2sdq(dest, gpr8); + masm.shrq(gpr9, 1); + masm.cvtsi2sdq(temp3, gpr9); + masm.xorpd(temp4, temp4); + masm.shll(gpr4, 4); + masm.negl(gpr4); + masm.addl(gpr4, 16368); + masm.orl(gpr4, gpr5); + masm.xorl(gpr4, gpr2); + masm.pinsrw(temp4, gpr4, 3); + masm.leaq(gpr1, externalAddress(piFourTanPtr)); + masm.movdq(temp2, new AMD64Address(gpr1, 0)); // 0x00000000, + // 0x3fe921fb, + masm.movdq(temp7, new AMD64Address(gpr1, 8)); // 0x4611a626, + // 0x3e85110b + masm.xorpd(temp5, temp5); + masm.subl(gpr4, 1008); + masm.pinsrw(temp5, gpr4, 3); + masm.mulsd(dest, temp4); + masm.shll(gpr5, 16); + masm.sarl(gpr5, 31); + masm.mulsd(temp3, temp5); + masm.movdqu(temp1, dest); + masm.mulsd(dest, temp2); + masm.shrl(gpr6, 30); + masm.addsd(temp1, temp3); + masm.mulsd(temp3, temp2); + masm.addl(gpr6, gpr5); + masm.xorl(gpr6, gpr5); + masm.mulsd(temp7, temp1); + masm.movl(gpr1, gpr6); + masm.addsd(temp7, temp3); + masm.movdqu(temp2, dest); +
masm.addsd(dest, temp7); + masm.subsd(temp2, dest); + masm.addsd(temp7, temp2); + masm.movdqu(temp1, externalAddress(piThirtyTwoInvTanPtr)); // 0x6dc9c883, + // 0x3fe45f30, + // 0x6dc9c883, + // 0x40245f30 + if (masm.supports(CPUFeature.SSE3)) { + masm.movddup(dest, dest); + } else { + masm.movlhps(dest, dest); + } + masm.movdqu(temp4, externalAddress(signMaskTanPtr)); // 0x00000000, + // 0x80000000, + // 0x00000000, + // 0x80000000 + masm.andpd(temp4, dest); + masm.mulpd(temp1, dest); + if (masm.supports(CPUFeature.SSE3)) { + masm.movddup(temp7, temp7); + } else { + masm.movlhps(temp7, temp7); + } + masm.movdqu(temp5, externalAddress(oneHalfTanPtr)); // 0x00000000, + // 0x3fe00000, + // 0x00000000, + // 0x3fe00000 + masm.movdqu(temp6, externalAddress(mulSixteenPtr)); // 0x00000000, + // 0x40300000, + // 0x00000000, + // 0x3ff00000 + masm.por(temp5, temp4); + masm.addpd(temp1, temp5); + masm.movdqu(temp5, temp1); + masm.unpckhpd(temp5, temp5); + masm.cvttsd2sil(gpr4, temp5); + masm.cvttpd2dq(temp1, temp1); + masm.cvtdq2pd(temp1, temp1); + masm.mulpd(temp1, temp6); + masm.movdqu(temp3, externalAddress(pOneTanPtr)); // 0x54444000, + // 0x3fb921fb, + // 0x54440000, + // 0x3fb921fb + masm.movdq(temp5, externalAddress(qqTwoTanPtr)); // 0x676733af, + // 0x3d32e7b9 + masm.shll(gpr1, 4); + masm.addl(gpr4, 469248); + masm.movdqu(temp4, externalAddress(pTwoTanPtr)); // 0x67674000, + // 0xbd32e7b9, + // 0x4c4c0000, + // 0x3d468c23 + masm.mulpd(temp3, temp1); + masm.addl(gpr4, gpr1); + masm.andl(gpr4, 31); + masm.mulsd(temp5, temp1); + masm.movl(gpr3, gpr4); + masm.mulpd(temp4, temp1); + masm.shll(gpr3, 1); + masm.subpd(dest, temp3); + masm.mulpd(temp1, externalAddress(pThreeTanPtr)); // 0x3707344a, + // 0x3aa8a2e0, + // 0x03707345, + // 0x3ae98a2e + masm.addl(gpr4, gpr3); + masm.shll(gpr3, 2); + masm.addl(gpr4, gpr3); + masm.addsd(temp5, dest); + masm.movdqu(temp2, dest); + masm.subpd(dest, temp4); + masm.movdq(temp6, externalAddress(onePtr)); // 0x00000000, + // 0x3ff00000 +
masm.shll(gpr4, 4); + masm.leaq(gpr1, externalAddress(cTableTanPtr)); + masm.andpd(temp5, externalAddress(maskThirtyFiveTanPtr)); // 0xfffc0000, + // 0xffffffff, + // 0x00000000, + // 0x00000000 + masm.movdqu(temp3, dest); + masm.addq(gpr1, gpr4); + masm.subpd(temp2, dest); + masm.unpckhpd(dest, dest); + masm.divsd(temp6, temp5); + masm.subpd(temp2, temp4); + masm.subsd(temp3, temp5); + masm.subpd(temp2, temp1); + masm.movdqu(temp1, new AMD64Address(gpr1, 48)); + masm.addpd(temp2, temp7); + masm.movdqu(temp7, new AMD64Address(gpr1, 16)); + masm.mulpd(temp7, dest); + masm.movdqu(temp4, new AMD64Address(gpr1, 96)); + masm.mulpd(temp1, dest); + masm.mulpd(temp4, dest); + masm.addsd(temp2, temp3); + masm.movdqu(temp3, dest); + masm.mulpd(dest, dest); + masm.addpd(temp7, new AMD64Address(gpr1, 0)); + masm.addpd(temp1, new AMD64Address(gpr1, 32)); + masm.mulpd(temp1, dest); + masm.addpd(temp4, new AMD64Address(gpr1, 80)); + masm.addpd(temp7, temp1); + masm.movdqu(temp1, new AMD64Address(gpr1, 112)); + masm.mulpd(temp1, dest); + masm.mulpd(dest, dest); + masm.addpd(temp4, temp1); + masm.movdqu(temp1, new AMD64Address(gpr1, 64)); + masm.mulpd(temp1, dest); + masm.addpd(temp7, temp1); + masm.movdqu(temp1, temp3); + masm.mulpd(temp3, dest); + masm.mulsd(dest, dest); + masm.mulpd(temp1, new AMD64Address(gpr1, 144)); + masm.mulpd(temp4, temp3); + masm.movdqu(temp3, temp1); + masm.addpd(temp7, temp4); + masm.movdqu(temp4, temp1); + masm.mulsd(dest, temp7); + masm.unpckhpd(temp7, temp7); + masm.addsd(dest, temp7); + masm.unpckhpd(temp1, temp1); + masm.addsd(temp3, temp1); + masm.subsd(temp4, temp3); + masm.addsd(temp1, temp4); + masm.movdqu(temp4, temp2); + masm.movdq(temp7, new AMD64Address(gpr1, 144)); + masm.unpckhpd(temp2, temp2); + masm.addsd(temp7, new AMD64Address(gpr1, 152)); + masm.mulsd(temp7, temp2); + masm.addsd(temp7, new AMD64Address(gpr1, 136)); + masm.addsd(temp7, temp1); + masm.addsd(dest, temp7); + masm.movdq(temp7, externalAddress(onePtr)); // 0x00000000, + //
0x3ff00000 + masm.mulsd(temp4, temp6); + masm.movdq(temp2, new AMD64Address(gpr1, 168)); + masm.andpd(temp2, temp6); + masm.mulsd(temp5, temp2); + masm.mulsd(temp6, new AMD64Address(gpr1, 160)); + masm.subsd(temp7, temp5); + masm.subsd(temp2, new AMD64Address(gpr1, 128)); + masm.subsd(temp7, temp4); + masm.mulsd(temp7, temp6); + masm.movdqu(temp4, temp3); + masm.subsd(temp3, temp2); + masm.addsd(temp2, temp3); + masm.subsd(temp4, temp2); + masm.addsd(dest, temp4); + masm.subsd(dest, temp7); + masm.addsd(dest, temp3); + masm.jmp(bb15); + + /* bb9: the top word gpr8 is zero, so the words move up by one (gpr8 takes gpr9, gpr9 takes gpr7) and gpr4 grows by 64. This is attempted twice; a non-zero gpr8 returns to bb10, otherwise control continues at bb12. */ masm.bind(bb9); + masm.addl(gpr4, 64); + masm.movq(gpr8, gpr9); + masm.movq(gpr9, gpr7); + masm.movl(gpr7, 0); + masm.cmpq(gpr8, 0); + masm.jcc(ConditionFlag.NotEqual, bb10); + + masm.addl(gpr4, 64); + masm.movq(gpr8, gpr9); + masm.movq(gpr9, gpr7); + masm.cmpq(gpr8, 0); + masm.jcc(ConditionFlag.NotEqual, bb10); + + masm.jmp(bb12); + + /* bb11: still using the flags of the subl in bb10: a zero count goes straight to bb12, a negative count is negated and applied as right shifts of gpr8/gpr9. */ masm.bind(bb11); + masm.jcc(ConditionFlag.Equal, bb12); + + masm.negl(gpr3); + masm.shrq(gpr9); + masm.movq(gpr1, gpr8); + masm.shrq(gpr8); + masm.subl(gpr4, gpr3); + masm.negl(gpr3); + masm.addl(gpr3, 64); + masm.shlq(gpr1); + masm.orq(gpr9, gpr1); + masm.jmp(bb12); + + /* bb5: taken when gpr3 was negative at the cmpl against 0 in bb1; merges gpr8 and gpr10 into one 64-bit word before testing bit 31. */ masm.bind(bb5); + masm.notl(gpr3); + masm.shlq(gpr8, 32); + masm.orq(gpr8, gpr10); + masm.shlq(gpr8); + masm.movq(gpr6, gpr8); + masm.testl(gpr8, Integer.MIN_VALUE); + masm.jcc(ConditionFlag.NotEqual, bb13); + + masm.shrl(gpr8); + masm.movl(gpr2, 0); + masm.shrq(gpr6, 2); + masm.jmp(bb8); + + /* bb6: subtracts the three-word value gpr8:gpr9:gpr7 from gpr2:0:0 with borrow propagation (subq/sbbq/sbbq), stores the difference back into gpr8:gpr9:gpr7 and sets gpr2 to 32768 before rejoining bb8. */ masm.bind(bb6); + masm.shrl(gpr8); + masm.movl(gpr2, 1073741824); + masm.shrl(gpr2); + masm.shlq(gpr8, 32); + masm.orq(gpr8, gpr10); + masm.shlq(gpr2, 32); + masm.addl(gpr6, 1073741824); + masm.movl(gpr3, 0); + masm.movl(gpr10, 0); + masm.subq(gpr3, gpr7); + masm.sbbq(gpr10, gpr9); + masm.sbbq(gpr2, gpr8); + masm.movq(gpr7, gpr3); + masm.movq(gpr9, gpr10); + masm.movq(gpr8, gpr2); + masm.movl(gpr2, 32768); + masm.jmp(bb8); + + /* bb13: same three-word subtraction as bb6, entered from bb5, with gpr2 seeded from 0x100000000L. */ masm.bind(bb13); + masm.shrl(gpr8); + masm.movq(gpr2, 0x100000000L); + masm.shrq(gpr2); + masm.movl(gpr3, 0); +
masm.movl(gpr10, 0); + masm.subq(gpr3, gpr7); + masm.sbbq(gpr10, gpr9); + masm.sbbq(gpr2, gpr8); + masm.movq(gpr7, gpr3); + masm.movq(gpr9, gpr10); + masm.movq(gpr8, gpr2); + masm.movl(gpr2, 32768); + masm.shrq(gpr6, 2); + masm.addl(gpr6, 1073741824); + masm.jmp(bb8); + + /* bb15: common exit of every path. */ masm.bind(bb15); + } + + /* + * Copyright (c) 2014, 2016, Intel Corporation. All rights reserved. Intel Math Library (LIBM) + * Source Code + * + * ALGORITHM DESCRIPTION - EXP() --------------------- + * + * Description: Let K = 64 (table size). e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)), where x = + * m*log(2)/K + y, y in [-log(2)/K..log(2)/K]; m = n*K + j, with m, n, j signed integers and j in + * [-K/2..K/2]. The values of 2^(j/K) are tabulated as T[j] = T_hi[j] * (1 + T_lo[j]). + * + * P(y) is a minimax polynomial approximation of exp(x)-1 on the small interval + * [-log(2)/K..log(2)/K] (its coefficients were calculated by Maple V). + * + * To avoid problems with arithmetic overflow and underflow, the value of 2^n is safely + * computed as 2^n1 * 2^n2, where n1 is in [-BIAS/2..BIAS/2] and BIAS is the exponent bias.
+ * + * Special cases: exp(NaN) = NaN exp(+INF) = +INF exp(-INF) = 0 exp(x) = 1 for subnormals for + * finite argument, only exp(0)=1 is exact For IEEE double if x > 709.782712893383973096 then + * exp(x) overflow if x < -745.133219101941108420 then exp(x) underflow + * + */ + + private static int[] cvExp = { + 0x652b82fe, 0x40571547, 0x652b82fe, 0x40571547, 0xfefa0000, + 0x3f862e42, 0xfefa0000, 0x3f862e42, 0xbc9e3b3a, 0x3d1cf79a, + 0xbc9e3b3a, 0x3d1cf79a, 0xfffffffe, 0x3fdfffff, 0xfffffffe, + 0x3fdfffff, 0xe3289860, 0x3f56c15c, 0x555b9e25, 0x3fa55555, + 0xc090cf0f, 0x3f811115, 0x55548ba1, 0x3fc55555 + }; + + private static int[] shifterExp = { + 0x00000000, 0x43380000, 0x00000000, 0x43380000 + }; + + private static int[] mMaskExp = { + 0xffffffc0, 0x00000000, 0xffffffc0, 0x00000000 + }; + + private static int[] biasExp = { + 0x0000ffc0, 0x00000000, 0x0000ffc0, 0x00000000 + }; + + private static int[] tblAddrExp = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x0e03754d, + 0x3cad7bbf, 0x3e778060, 0x00002c9a, 0x3567f613, 0x3c8cd252, + 0xd3158574, 0x000059b0, 0x61e6c861, 0x3c60f74e, 0x18759bc8, + 0x00008745, 0x5d837b6c, 0x3c979aa6, 0x6cf9890f, 0x0000b558, + 0x702f9cd1, 0x3c3ebe3d, 0x32d3d1a2, 0x0000e3ec, 0x1e63bcd8, + 0x3ca3516e, 0xd0125b50, 0x00011301, 0x26f0387b, 0x3ca4c554, + 0xaea92ddf, 0x0001429a, 0x62523fb6, 0x3ca95153, 0x3c7d517a, + 0x000172b8, 0x3f1353bf, 0x3c8b898c, 0xeb6fcb75, 0x0001a35b, + 0x3e3a2f5f, 0x3c9aecf7, 0x3168b9aa, 0x0001d487, 0x44a6c38d, + 0x3c8a6f41, 0x88628cd6, 0x0002063b, 0xe3a8a894, 0x3c968efd, + 0x6e756238, 0x0002387a, 0x981fe7f2, 0x3c80472b, 0x65e27cdd, + 0x00026b45, 0x6d09ab31, 0x3c82f7e1, 0xf51fdee1, 0x00029e9d, + 0x720c0ab3, 0x3c8b3782, 0xa6e4030b, 0x0002d285, 0x4db0abb6, + 0x3c834d75, 0x0a31b715, 0x000306fe, 0x5dd3f84a, 0x3c8fdd39, + 0xb26416ff, 0x00033c08, 0xcc187d29, 0x3ca12f8c, 0x373aa9ca, + 0x000371a7, 0x738b5e8b, 0x3ca7d229, 0x34e59ff6, 0x0003a7db, + 0xa72a4c6d, 0x3c859f48, 0x4c123422, 0x0003dea6, 0x259d9205, + 0x3ca8b846, 
0x21f72e29, 0x0004160a, 0x60c2ac12, 0x3c4363ed, + 0x6061892d, 0x00044e08, 0xdaa10379, 0x3c6ecce1, 0xb5c13cd0, + 0x000486a2, 0xbb7aafb0, 0x3c7690ce, 0xd5362a27, 0x0004bfda, + 0x9b282a09, 0x3ca083cc, 0x769d2ca6, 0x0004f9b2, 0xc1aae707, + 0x3ca509b0, 0x569d4f81, 0x0005342b, 0x18fdd78e, 0x3c933505, + 0x36b527da, 0x00056f47, 0xe21c5409, 0x3c9063e1, 0xdd485429, + 0x0005ab07, 0x2b64c035, 0x3c9432e6, 0x15ad2148, 0x0005e76f, + 0x99f08c0a, 0x3ca01284, 0xb03a5584, 0x0006247e, 0x0073dc06, + 0x3c99f087, 0x82552224, 0x00066238, 0x0da05571, 0x3c998d4d, + 0x667f3bcc, 0x0006a09e, 0x86ce4786, 0x3ca52bb9, 0x3c651a2e, + 0x0006dfb2, 0x206f0dab, 0x3ca32092, 0xe8ec5f73, 0x00071f75, + 0x8e17a7a6, 0x3ca06122, 0x564267c8, 0x00075feb, 0x461e9f86, + 0x3ca244ac, 0x73eb0186, 0x0007a114, 0xabd66c55, 0x3c65ebe1, + 0x36cf4e62, 0x0007e2f3, 0xbbff67d0, 0x3c96fe9f, 0x994cce12, + 0x00082589, 0x14c801df, 0x3c951f14, 0x9b4492ec, 0x000868d9, + 0xc1f0eab4, 0x3c8db72f, 0x422aa0db, 0x0008ace5, 0x59f35f44, + 0x3c7bf683, 0x99157736, 0x0008f1ae, 0x9c06283c, 0x3ca360ba, + 0xb0cdc5e4, 0x00093737, 0x20f962aa, 0x3c95e8d1, 0x9fde4e4f, + 0x00097d82, 0x2b91ce27, 0x3c71affc, 0x82a3f090, 0x0009c491, + 0x589a2ebd, 0x3c9b6d34, 0x7b5de564, 0x000a0c66, 0x9ab89880, + 0x3c95277c, 0xb23e255c, 0x000a5503, 0x6e735ab3, 0x3c846984, + 0x5579fdbf, 0x000a9e6b, 0x92cb3387, 0x3c8c1a77, 0x995ad3ad, + 0x000ae89f, 0xdc2d1d96, 0x3ca22466, 0xb84f15fa, 0x000b33a2, + 0xb19505ae, 0x3ca1112e, 0xf2fb5e46, 0x000b7f76, 0x0a5fddcd, + 0x3c74ffd7, 0x904bc1d2, 0x000bcc1e, 0x30af0cb3, 0x3c736eae, + 0xdd85529c, 0x000c199b, 0xd10959ac, 0x3c84e08f, 0x2e57d14b, + 0x000c67f1, 0x6c921968, 0x3c676b2c, 0xdcef9069, 0x000cb720, + 0x36df99b3, 0x3c937009, 0x4a07897b, 0x000d072d, 0xa63d07a7, + 0x3c74a385, 0xdcfba487, 0x000d5818, 0xd5c192ac, 0x3c8e5a50, + 0x03db3285, 0x000da9e6, 0x1c4a9792, 0x3c98bb73, 0x337b9b5e, + 0x000dfc97, 0x603a88d3, 0x3c74b604, 0xe78b3ff6, 0x000e502e, + 0x92094926, 0x3c916f27, 0xa2a490d9, 0x000ea4af, 0x41aa2008, + 0x3c8ec3bc, 0xee615a27, 
0x000efa1b, 0x31d185ee, 0x3c8a64a9, + 0x5b6e4540, 0x000f5076, 0x4d91cd9d, 0x3c77893b, 0x819e90d8, + 0x000fa7c1 + }; + + private static int[] allOnesExp = { + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff + }; + + private static int[] expBias = { + 0x00000000, 0x3ff00000, 0x00000000, 0x3ff00000 + }; + + private static int[] xMaxExp = { + 0xffffffff, 0x7fefffff + }; + + private static int[] xMinExp = { + 0x00000000, 0x00100000 + }; + + private static int[] infExp = { + 0x00000000, 0x7ff00000 + }; + + private static int[] zeroExp = { + 0x00000000, 0x00000000 + }; + + public void expIntrinsic(Register dest, Register value, CompilationResultBuilder crb, AMD64MacroAssembler masm) { + ArrayDataPointerConstant onePtr = new ArrayDataPointerConstant(one, 16); + ArrayDataPointerConstant cvExpPtr = new ArrayDataPointerConstant(cvExp, 16); + ArrayDataPointerConstant shifterExpPtr = new ArrayDataPointerConstant(shifterExp, 8); + ArrayDataPointerConstant mMaskExpPtr = new ArrayDataPointerConstant(mMaskExp, 16); + ArrayDataPointerConstant biasExpPtr = new ArrayDataPointerConstant(biasExp, 16); + ArrayDataPointerConstant tblAddrExpPtr = new ArrayDataPointerConstant(tblAddrExp, 16); + ArrayDataPointerConstant expBiasPtr = new ArrayDataPointerConstant(expBias, 8); + ArrayDataPointerConstant xMaxExpPtr = new ArrayDataPointerConstant(xMaxExp, 8); + ArrayDataPointerConstant xMinExpPtr = new ArrayDataPointerConstant(xMinExp, 8); + ArrayDataPointerConstant infExpPtr = new ArrayDataPointerConstant(infExp, 8); + ArrayDataPointerConstant zeroExpPtr = new ArrayDataPointerConstant(zeroExp, 8); + ArrayDataPointerConstant allOnesExpPtr = new ArrayDataPointerConstant(allOnesExp, 8); + + Label bb0 = new Label(); + Label bb1 = new Label(); + Label bb2 = new Label(); + Label bb3 = new Label(); + Label bb4 = new Label(); + Label bb5 = new Label(); + Label bb7 = new Label(); + Label bb8 = new Label(); + Label bb9 = new Label(); + Label bb10 = new Label(); + Label bb11 = new Label(); + Label bb12 = 
new Label(); + Label bb14 = new Label(); + + Register gpr1 = asRegister(gpr1Temp, AMD64Kind.QWORD); + Register gpr2 = asRegister(gpr2Temp, AMD64Kind.QWORD); + Register gpr3 = asRegister(rcxTemp, AMD64Kind.QWORD); + Register gpr4 = asRegister(gpr4Temp, AMD64Kind.QWORD); + Register gpr5 = asRegister(gpr5Temp, AMD64Kind.QWORD); + + Register temp1 = asRegister(xmm1Temp, AMD64Kind.DOUBLE); + Register temp2 = asRegister(xmm2Temp, AMD64Kind.DOUBLE); + Register temp3 = asRegister(xmm3Temp, AMD64Kind.DOUBLE); + Register temp4 = asRegister(xmm4Temp, AMD64Kind.DOUBLE); + Register temp5 = asRegister(xmm5Temp, AMD64Kind.DOUBLE); + Register temp6 = asRegister(xmm6Temp, AMD64Kind.DOUBLE); + Register temp7 = asRegister(xmm7Temp, AMD64Kind.DOUBLE); + Register temp8 = asRegister(xmm8Temp, AMD64Kind.DOUBLE); + Register temp9 = asRegister(xmm9Temp, AMD64Kind.DOUBLE); + Register temp10 = asRegister(xmm10Temp, AMD64Kind.DOUBLE); + + AMD64Address stackSlot = (AMD64Address) crb.asAddress(stackTemp); + + setCrb(crb); + masm.movsd(stackSlot, value); + if (dest.encoding != value.encoding) { + masm.movdqu(dest, value); + } + + masm.movdqu(temp9, externalAddress(mMaskExpPtr)); // 0xffffffc0, + // 0x00000000, + // 0xffffffc0, + // 0x00000000 + masm.movdqu(temp10, externalAddress(biasExpPtr)); // 0x0000ffc0, + // 0x00000000, + // 0x0000ffc0, + // 0x00000000 + masm.unpcklpd(dest, dest); + masm.leaq(gpr5, stackSlot); + masm.leaq(gpr2, externalAddress(cvExpPtr)); + masm.movdqu(temp1, new AMD64Address(gpr2, 0)); // 0x652b82fe, + // 0x40571547, + // 0x652b82fe, + // 0x40571547 + masm.movdqu(temp6, externalAddress(shifterExpPtr)); // 0x00000000, + // 0x43380000, + // 0x00000000, + // 0x43380000 + masm.movdqu(temp2, new AMD64Address(gpr2, 16)); // 0xfefa0000, + // 0x3f862e42, + // 0xfefa0000, + // 0x3f862e42 + masm.movdqu(temp3, new AMD64Address(gpr2, 32)); // 0xbc9e3b3a, + // 0x3d1cf79a, + // 0xbc9e3b3a, + // 0x3d1cf79a + masm.pextrw(gpr1, dest, 3); + masm.andl(gpr1, 32767); + masm.movl(gpr4, 16527); 
+ masm.subl(gpr4, gpr1); + masm.subl(gpr1, 15504); + masm.orl(gpr4, gpr1); + masm.cmpl(gpr4, Integer.MIN_VALUE); + masm.jcc(ConditionFlag.AboveEqual, bb0); + + masm.leaq(gpr4, externalAddress(tblAddrExpPtr)); + masm.movdqu(temp8, new AMD64Address(gpr2, 48)); // 0xfffffffe, + // 0x3fdfffff, + // 0xfffffffe, + // 0x3fdfffff + masm.movdqu(temp4, new AMD64Address(gpr2, 64)); // 0xe3289860, + // 0x3f56c15c, + // 0x555b9e25, + // 0x3fa55555 + masm.movdqu(temp5, new AMD64Address(gpr2, 80)); // 0xc090cf0f, + // 0x3f811115, + // 0x55548ba1, + // 0x3fc55555 + masm.mulpd(temp1, dest); + masm.addpd(temp1, temp6); + masm.movapd(temp7, temp1); + masm.movdl(gpr1, temp1); + masm.pand(temp7, temp9); + masm.subpd(temp1, temp6); + masm.mulpd(temp2, temp1); + masm.mulpd(temp3, temp1); + masm.paddq(temp7, temp10); + masm.subpd(dest, temp2); + masm.movl(gpr3, gpr1); + masm.andl(gpr3, 63); + masm.shll(gpr3, 4); + masm.movdqu(temp2, new AMD64Address(gpr3, gpr4, Scale.Times1, 0)); + masm.sarl(gpr1, 6); + masm.psllq(temp7, 46); + masm.subpd(dest, temp3); + masm.mulpd(temp4, dest); + masm.movl(gpr4, gpr1); + masm.movapd(temp6, dest); + masm.movapd(temp1, dest); + masm.mulpd(temp6, temp6); + masm.mulpd(dest, temp6); + masm.addpd(temp5, temp4); + masm.mulsd(dest, temp6); + masm.mulpd(temp6, temp8); + masm.addsd(temp1, temp2); + masm.unpckhpd(temp2, temp2); + masm.mulpd(dest, temp5); + masm.addsd(temp1, dest); + masm.por(temp2, temp7); + masm.unpckhpd(dest, dest); + masm.addsd(dest, temp1); + masm.addsd(dest, temp6); + masm.addl(gpr4, 894); + masm.cmpl(gpr4, 1916); + masm.jcc(ConditionFlag.Above, bb1); + + masm.mulsd(dest, temp2); + masm.addsd(dest, temp2); + masm.jmp(bb14); + + masm.bind(bb1); + masm.movdqu(temp6, externalAddress(expBiasPtr)); // 0x00000000, + // 0x3ff00000, + // 0x00000000, + // 0x3ff00000 + masm.xorpd(temp3, temp3); + masm.movdqu(temp4, externalAddress(allOnesExpPtr)); // 0xffffffff, + // 0xffffffff, + // 0xffffffff, + // 0xffffffff + masm.movl(gpr4, -1022); + 
masm.subl(gpr4, gpr1); + masm.movdl(temp5, gpr4); + masm.psllq(temp4, temp5); + masm.movl(gpr3, gpr1); + masm.sarl(gpr1, 1); + masm.pinsrw(temp3, gpr1, 3); + masm.psllq(temp3, 4); + masm.psubd(temp2, temp3); + masm.mulsd(dest, temp2); + masm.cmpl(gpr4, 52); + masm.jcc(ConditionFlag.Greater, bb2); + + masm.pand(temp4, temp2); + masm.paddd(temp3, temp6); + masm.subsd(temp2, temp4); + masm.addsd(dest, temp2); + masm.cmpl(gpr3, 1023); + masm.jcc(ConditionFlag.GreaterEqual, bb3); + + masm.pextrw(gpr3, dest, 3); + masm.andl(gpr3, 32768); + masm.orl(gpr4, gpr3); + masm.cmpl(gpr4, 0); + masm.jcc(ConditionFlag.Equal, bb4); + + masm.movapd(temp6, dest); + masm.addsd(dest, temp4); + masm.mulsd(dest, temp3); + masm.pextrw(gpr3, dest, 3); + masm.andl(gpr3, 32752); + masm.cmpl(gpr3, 0); + masm.jcc(ConditionFlag.Equal, bb5); + + masm.jmp(bb14); + + masm.bind(bb5); + masm.mulsd(temp6, temp3); + masm.mulsd(temp4, temp3); + masm.movdqu(dest, temp6); + masm.pxor(temp6, temp4); + masm.psrad(temp6, 31); + masm.pshufd(temp6, temp6, 85); + masm.psllq(dest, 1); + masm.psrlq(dest, 1); + masm.pxor(dest, temp6); + masm.psrlq(temp6, 63); + masm.paddq(dest, temp6); + masm.paddq(dest, temp4); + masm.jmp(bb14); + + masm.bind(bb4); + masm.addsd(dest, temp4); + masm.mulsd(dest, temp3); + masm.jmp(bb14); + + masm.bind(bb3); + masm.addsd(dest, temp4); + masm.mulsd(dest, temp3); + masm.pextrw(gpr3, dest, 3); + masm.andl(gpr3, 32752); + masm.cmpl(gpr3, 32752); + masm.jcc(ConditionFlag.AboveEqual, bb7); + + masm.jmp(bb14); + + masm.bind(bb2); + masm.paddd(temp3, temp6); + masm.addpd(dest, temp2); + masm.mulsd(dest, temp3); + masm.jmp(bb14); + + masm.bind(bb8); + masm.movsd(dest, externalAddress(xMaxExpPtr)); // 0xffffffff, + // 0x7fefffff + masm.movsd(temp8, externalAddress(xMinExpPtr)); // 0x00000000, + // 0x00100000 + masm.cmpl(gpr1, 2146435072); + masm.jcc(ConditionFlag.AboveEqual, bb9); + + masm.movl(gpr1, new AMD64Address(gpr5, 4)); + masm.cmpl(gpr1, Integer.MIN_VALUE); + 
masm.jcc(ConditionFlag.AboveEqual, bb10); + + masm.mulsd(dest, dest); + + masm.bind(bb7); + masm.jmp(bb14); + + masm.bind(bb10); + masm.mulsd(dest, temp8); + masm.jmp(bb14); + + masm.bind(bb9); + masm.movl(gpr4, stackSlot); + masm.cmpl(gpr1, 2146435072); + masm.jcc(ConditionFlag.Above, bb11); + + masm.cmpl(gpr4, 0); + masm.jcc(ConditionFlag.NotEqual, bb11); + + masm.movl(gpr1, new AMD64Address(gpr5, 4)); + masm.cmpl(gpr1, 2146435072); + masm.jcc(ConditionFlag.NotEqual, bb12); + + masm.movsd(dest, externalAddress(infExpPtr)); // 0x00000000, + // 0x7ff00000 + masm.jmp(bb14); + + masm.bind(bb12); + masm.movsd(dest, externalAddress(zeroExpPtr)); // 0x00000000, + // 0x00000000 + masm.jmp(bb14); + + masm.bind(bb11); + masm.movsd(dest, stackSlot); + masm.addsd(dest, dest); + masm.jmp(bb14); + + masm.bind(bb0); + masm.movl(gpr1, new AMD64Address(gpr5, 4)); + masm.andl(gpr1, 2147483647); + masm.cmpl(gpr1, 1083179008); + masm.jcc(ConditionFlag.AboveEqual, bb8); + + masm.addsd(dest, externalAddress(onePtr)); // 0x00000000, + // 0x3ff00000 + masm.bind(bb14); + } +}