< prev index next >

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Print this page
8248238: Adding Windows support to OpenJDK on AArch64

Summary: LP64 vs LLP64 changes to add Windows support

Contributed-by: Monica Beckwith <monica.beckwith@microsoft.com>, Ludovic Henry <luhenry@microsoft.com>
Reviewed-by:
8248238: Adding Windows support to OpenJDK on AArch64

Summary: Adding Windows support for AArch64

Contributed-by: Ludovic Henry <luhenry@microsoft.com>, Monica Beckwith <monica.beckwith@microsoft.com>
Reviewed-by:

*** 563,575 **** __ cbnz(c_rarg2, error); } #endif // Check if the oop is in the right area of memory ! __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); __ andr(c_rarg2, r0, c_rarg3); ! __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); // Compare c_rarg2 and c_rarg3. We don't use a compare // instruction here because the flags register is live. __ eor(c_rarg2, c_rarg2, c_rarg3); __ cbnz(c_rarg2, error); --- 563,577 ---- __ cbnz(c_rarg2, error); } #endif // Check if the oop is in the right area of memory ! // Make sure we cast to `address` or it ends up calling the wrong `mov` ! // with MSVC, leading to a crash. ! __ mov(c_rarg3, (address) Universe::verify_oop_mask()); __ andr(c_rarg2, r0, c_rarg3); ! __ mov(c_rarg3, (address) Universe::verify_oop_bits()); // Compare c_rarg2 and c_rarg3. We don't use a compare // instruction here because the flags register is live. __ eor(c_rarg2, c_rarg2, c_rarg3); __ cbnz(c_rarg2, error);
*** 695,705 **** void generate_copy_longs(Label &start, Register s, Register d, Register count, copy_direction direction) { int unit = wordSize * direction; int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; - int offset; const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, t4 = r7, t5 = r10, t6 = r11, t7 = r12; const Register stride = r13; assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); --- 697,706 ----
*** 1086,1096 **** // <= 96 bytes do inline. Direction doesn't matter because we always // load all the data before writing anything Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; ! const Register send = r17, dend = r18; if (PrefetchCopyIntervalInBytes > 0) __ prfm(Address(s, 0), PLDL1KEEP); __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity)); __ br(Assembler::HI, copy_big); --- 1087,1097 ---- // <= 96 bytes do inline. Direction doesn't matter because we always // load all the data before writing anything Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; ! const Register send = r17, dend = r16; if (PrefetchCopyIntervalInBytes > 0) __ prfm(Address(s, 0), PLDL1KEEP); __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity)); __ br(Assembler::HI, copy_big);
*** 1278,1290 **** void clobber_registers() { #ifdef ASSERT __ mov(rscratch1, (uint64_t)0xdeadbeef); __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); ! for (Register r = r3; r <= r18; r++) if (r != rscratch1) __ mov(r, rscratch1); #endif } // Scan over array at a for count oops, verifying each one. // Preserves a and count, clobbers rscratch1 and rscratch2. void verify_oop_array (size_t size, Register a, Register count, Register temp) { --- 1279,1292 ---- void clobber_registers() { #ifdef ASSERT __ mov(rscratch1, (uint64_t)0xdeadbeef); __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); ! for (Register r = r3; r <= NOT_WIN64(r18) WIN64_ONLY(r17); r++) if (r != rscratch1) __ mov(r, rscratch1); #endif + } // Scan over array at a for count oops, verifying each one. // Preserves a and count, clobbers rscratch1 and rscratch2. void verify_oop_array (size_t size, Register a, Register count, Register temp) {
*** 1713,1726 **** const Register ckval = c_rarg4; // super_klass RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); RegSet wb_post_saved_regs = RegSet::of(count); ! // Registers used as temps (r18, r19, r20 are save-on-entry) const Register count_save = r21; // orig elementscount const Register start_to = r20; // destination array start address - const Register copied_oop = r18; // actual oop copied const Register r19_klass = r19; // oop._klass //--------------------------------------------------------------- // Assembler stub will be used for this call to arraycopy // if the two arrays are subtypes of Object[] but the --- 1715,1728 ---- const Register ckval = c_rarg4; // super_klass RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); RegSet wb_post_saved_regs = RegSet::of(count); ! // Registers used as temps (r19, r20, r21, r22 are save-on-entry) ! const Register copied_oop = r22; // actual oop copied const Register count_save = r21; // orig elementscount const Register start_to = r20; // destination array start address const Register r19_klass = r19; // oop._klass //--------------------------------------------------------------- // Assembler stub will be used for this call to arraycopy // if the two arrays are subtypes of Object[] but the
*** 1753,1764 **** BLOCK_COMMENT("Entry:"); } // Empty array: Nothing to do. __ cbz(count, L_done); ! ! __ push(RegSet::of(r18, r19, r20, r21), sp); #ifdef ASSERT BLOCK_COMMENT("assert consistent ckoff/ckval"); // The ckoff and ckval must be mutually consistent, // even though caller generates both. --- 1755,1765 ---- BLOCK_COMMENT("Entry:"); } // Empty array: Nothing to do. __ cbz(count, L_done); ! __ push(RegSet::of(r19, r20, r21, r22), sp); #ifdef ASSERT BLOCK_COMMENT("assert consistent ckoff/ckval"); // The ckoff and ckval must be mutually consistent, // even though caller generates both.
*** 1823,1833 **** __ BIND(L_do_card_marks); bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); __ bind(L_done_pop); ! __ pop(RegSet::of(r18, r19, r20, r21), sp); inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); __ bind(L_done); __ mov(r0, count); __ leave(); --- 1824,1834 ---- __ BIND(L_do_card_marks); bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); __ bind(L_done_pop); ! __ pop(RegSet::of(r19, r20, r21, r22), sp); inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); __ bind(L_done); __ mov(r0, count); __ leave();
*** 2000,2010 **** __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set // registers used as temp const Register scratch_length = r16; // elements count to copy const Register scratch_src_klass = r17; // array klass ! const Register lh = r18; // layout helper // if (length < 0) return -1; __ movw(scratch_length, length); // length (elements count, 32-bits value) __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set --- 2001,2011 ---- __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set // registers used as temp const Register scratch_length = r16; // elements count to copy const Register scratch_src_klass = r17; // array klass ! const Register lh = r15; // layout helper // if (length < 0) return -1; __ movw(scratch_length, length); // length (elements count, 32-bits value) __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
*** 2071,2081 **** // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); // const Register rscratch1_offset = rscratch1; // array offset ! const Register r18_elsize = lh; // element size __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, exact_log2(Klass::_lh_header_size_mask+1)); // array_offset __ add(src, src, rscratch1_offset); // src array offset __ add(dst, dst, rscratch1_offset); // dst array offset --- 2072,2082 ---- // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); // const Register rscratch1_offset = rscratch1; // array offset ! const Register r15_elsize = lh; // element size __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, exact_log2(Klass::_lh_header_size_mask+1)); // array_offset __ add(src, src, rscratch1_offset); // src array offset __ add(dst, dst, rscratch1_offset); // dst array offset
*** 2092,2103 **** assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); // The possible values of elsize are 0-3, i.e. exact_log2(element // size in bytes). We do a simple bitwise binary search. __ BIND(L_copy_bytes); ! __ tbnz(r18_elsize, 1, L_copy_ints); ! __ tbnz(r18_elsize, 0, L_copy_shorts); __ lea(from, Address(src, src_pos));// src_addr __ lea(to, Address(dst, dst_pos));// dst_addr __ movw(count, scratch_length); // length __ b(RuntimeAddress(byte_copy_entry)); --- 2093,2104 ---- assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); // The possible values of elsize are 0-3, i.e. exact_log2(element // size in bytes). We do a simple bitwise binary search. __ BIND(L_copy_bytes); ! __ tbnz(r15_elsize, 1, L_copy_ints); ! __ tbnz(r15_elsize, 0, L_copy_shorts); __ lea(from, Address(src, src_pos));// src_addr __ lea(to, Address(dst, dst_pos));// dst_addr __ movw(count, scratch_length); // length __ b(RuntimeAddress(byte_copy_entry));
*** 2106,2128 **** __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr __ movw(count, scratch_length); // length __ b(RuntimeAddress(short_copy_entry)); __ BIND(L_copy_ints); ! __ tbnz(r18_elsize, 0, L_copy_longs); __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr __ movw(count, scratch_length); // length __ b(RuntimeAddress(int_copy_entry)); __ BIND(L_copy_longs); #ifdef ASSERT { BLOCK_COMMENT("assert long copy {"); Label L; ! __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize ! __ cmpw(r18_elsize, LogBytesPerLong); __ br(Assembler::EQ, L); __ stop("must be long copy, but elsize is wrong"); __ bind(L); BLOCK_COMMENT("} assert long copy done"); } --- 2107,2129 ---- __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr __ movw(count, scratch_length); // length __ b(RuntimeAddress(short_copy_entry)); __ BIND(L_copy_ints); ! __ tbnz(r15_elsize, 0, L_copy_longs); __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr __ movw(count, scratch_length); // length __ b(RuntimeAddress(int_copy_entry)); __ BIND(L_copy_longs); #ifdef ASSERT { BLOCK_COMMENT("assert long copy {"); Label L; ! __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize ! __ cmpw(r15_elsize, LogBytesPerLong); __ br(Assembler::EQ, L); __ stop("must be long copy, but elsize is wrong"); __ bind(L); BLOCK_COMMENT("} assert long copy done"); }
*** 2136,2147 **** __ BIND(L_objArray); // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] Label L_plain_copy, L_checkcast_copy; // test array classes for subtyping ! __ load_klass(r18, dst); ! __ cmp(scratch_src_klass, r18); // usual case is exact equality __ br(Assembler::NE, L_checkcast_copy); // Identically typed arrays can be copied without element-wise checks. arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, rscratch2, L_failed); --- 2137,2148 ---- __ BIND(L_objArray); // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] Label L_plain_copy, L_checkcast_copy; // test array classes for subtyping ! __ load_klass(r15, dst); ! __ cmp(scratch_src_klass, r15); // usual case is exact equality __ br(Assembler::NE, L_checkcast_copy); // Identically typed arrays can be copied without element-wise checks. arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, rscratch2, L_failed);
*** 2153,2173 **** __ movw(count, scratch_length); // length __ BIND(L_plain_copy); __ b(RuntimeAddress(oop_copy_entry)); __ BIND(L_checkcast_copy); ! // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) { // Before looking at dst.length, make sure dst is also an objArray. ! __ ldrw(rscratch1, Address(r18, lh_offset)); __ movw(rscratch2, objArray_lh); __ eorw(rscratch1, rscratch1, rscratch2); __ cbnzw(rscratch1, L_failed); // It is safe to examine both src.length and dst.length. arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, ! r18, L_failed); __ load_klass(dst_klass, dst); // reload // Marshal the base address arguments now, freeing registers. __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); --- 2154,2174 ---- __ movw(count, scratch_length); // length __ BIND(L_plain_copy); __ b(RuntimeAddress(oop_copy_entry)); __ BIND(L_checkcast_copy); ! // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) { // Before looking at dst.length, make sure dst is also an objArray. ! __ ldrw(rscratch1, Address(r15, lh_offset)); __ movw(rscratch2, objArray_lh); __ eorw(rscratch1, rscratch1, rscratch2); __ cbnzw(rscratch1, L_failed); // It is safe to examine both src.length and dst.length. arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, ! r15, L_failed); __ load_klass(dst_klass, dst); // reload // Marshal the base address arguments now, freeing registers. __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
*** 3281,3292 **** FloatRegister vs2acc = v2; FloatRegister vtable = v3; // Max number of bytes we can process before having to take the mod // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 ! unsigned long BASE = 0xfff1; ! unsigned long NMAX = 0x15B0; __ mov(base, BASE); __ mov(nmax, NMAX); // Load accumulation coefficients for the upper 16 bits --- 3282,3293 ---- FloatRegister vs2acc = v2; FloatRegister vtable = v3; // Max number of bytes we can process before having to take the mod // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 ! uint64_t BASE = 0xfff1; ! uint64_t NMAX = 0x15B0; __ mov(base, BASE); __ mov(nmax, NMAX); // Load accumulation coefficients for the upper 16 bits
*** 4059,4069 **** Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; RegSet spilled_regs = RegSet::of(tmp3, tmp4); ! int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2); __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); // cnt2 == amount of characters left to compare // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) __ zip1(vtmp, __ T8B, vtmp, vtmpZ); --- 4060,4070 ---- Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; RegSet spilled_regs = RegSet::of(tmp3, tmp4); ! int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); // cnt2 == amount of characters left to compare // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
*** 4217,4227 **** Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, DIFF_LAST_POSITION, DIFF_LAST_POSITION2; // exit from large loop when less than 64 bytes left to read or we're about // to prefetch memory behind array border ! int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used // update cnt2 counter with already loaded 8 bytes __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); // update pointers, because of previous read __ add(str1, str1, wordSize); --- 4218,4228 ---- Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, DIFF_LAST_POSITION, DIFF_LAST_POSITION2; // exit from large loop when less than 64 bytes left to read or we're about // to prefetch memory behind array border ! int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used // update cnt2 counter with already loaded 8 bytes __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); // update pointers, because of previous read __ add(str1, str1, wordSize);
*** 4643,4653 **** __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); address entry = __ pc(); Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; Register src = r0, dst = r1, len = r2, octetCounter = r3; ! const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; // do one more 8-byte read to have address 16-byte aligned in most cases // also use single store instruction __ ldrd(v2, __ post(src, 8)); __ sub(octetCounter, octetCounter, 2); --- 4644,4654 ---- __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); address entry = __ pc(); Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; Register src = r0, dst = r1, len = r2, octetCounter = r3; ! const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; // do one more 8-byte read to have address 16-byte aligned in most cases // also use single store instruction __ ldrd(v2, __ post(src, 8)); __ sub(octetCounter, octetCounter, 2);
*** 4893,4936 **** Register reg = c_rarg0; Pa_base = reg; // Argument registers if (squaring) Pb_base = Pa_base; else ! Pb_base = ++reg; ! Pn_base = ++reg; ! Rlen= ++reg; ! inv = ++reg; ! Pm_base = ++reg; // Working registers: ! Ra = ++reg; // The current digit of a, b, n, and m. ! Rb = ++reg; ! Rm = ++reg; ! Rn = ++reg; ! ! Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. ! Pb = ++reg; ! Pm = ++reg; ! Pn = ++reg; ! ! t0 = ++reg; // Three registers which form a ! t1 = ++reg; // triple-precision accumuator. ! t2 = ++reg; ! ! Ri = ++reg; // Inner and outer loop indexes. ! Rj = ++reg; ! ! Rhi_ab = ++reg; // Product registers: low and high parts ! Rlo_ab = ++reg; // of a*b and m*n. ! Rhi_mn = ++reg; ! Rlo_mn = ++reg; // r19 and up are callee-saved. _toSave = RegSet::range(r19, reg) + Pm_base; } private: void save_regs() { push(_toSave, sp); } void restore_regs() { --- 4894,4946 ---- Register reg = c_rarg0; Pa_base = reg; // Argument registers if (squaring) Pb_base = Pa_base; else ! Pb_base = next_reg(reg); ! Pn_base = next_reg(reg); ! Rlen= next_reg(reg); ! inv = next_reg(reg); ! Pm_base = next_reg(reg); // Working registers: ! Ra = next_reg(reg); // The current digit of a, b, n, and m. ! Rb = next_reg(reg); ! Rm = next_reg(reg); ! Rn = next_reg(reg); ! ! Pa = next_reg(reg); // Pointers to the current/next digit of a, b, n, and m. ! Pb = next_reg(reg); ! Pm = next_reg(reg); ! Pn = next_reg(reg); ! ! t0 = next_reg(reg); // Three registers which form a ! t1 = next_reg(reg); // triple-precision accumulator. ! t2 = next_reg(reg); ! ! Ri = next_reg(reg); // Inner and outer loop indexes. ! Rj = next_reg(reg); ! ! Rhi_ab = next_reg(reg); // Product registers: low and high parts ! Rlo_ab = next_reg(reg); // of a*b and m*n. ! Rhi_mn = next_reg(reg); ! Rlo_mn = next_reg(reg); // r19 and up are callee-saved. 
_toSave = RegSet::range(r19, reg) + Pm_base; } private: + Register next_reg(Register &reg) { + #ifdef _WIN64 + // skip r18 on Windows, it's used by native TLS + return ++reg == r18 ? ++reg : reg; + #else + return ++reg; + #endif + } + void save_regs() { push(_toSave, sp); } void restore_regs() {
*** 5379,5394 **** return entry; } // In C, approximately: // void ! // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], ! // unsigned long Pn_base[], unsigned long Pm_base[], ! // unsigned long inv, int len) { ! // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator ! // unsigned long *Pa, *Pb, *Pn, *Pm; ! // unsigned long Ra, Rb, Rn, Rm; // int i; // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); --- 5389,5404 ---- return entry; } // In C, approximately: // void ! // montgomery_multiply(uint64_t Pa_base[], uint64_t Pb_base[], ! // uint64_t Pn_base[], uint64_t Pm_base[], ! // uint64_t inv, int len) { ! // uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator ! // uint64_t *Pa, *Pb, *Pn, *Pm; ! // uint64_t Ra, Rb, Rn, Rm; // int i; // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
*** 5592,5606 **** return entry; } // In C, approximately: // void ! // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], ! // unsigned long Pm_base[], unsigned long inv, int len) { ! // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator ! // unsigned long *Pa, *Pb, *Pn, *Pm; ! // unsigned long Ra, Rb, Rn, Rm; // int i; // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); --- 5602,5616 ---- return entry; } // In C, approximately: // void ! // montgomery_square(uint64_t Pa_base[], uint64_t Pn_base[], ! // uint64_t Pm_base[], uint64_t inv, int len) { ! // uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator ! // uint64_t *Pa, *Pb, *Pn, *Pm; ! // uint64_t Ra, Rb, Rn, Rm; // int i; // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
< prev index next >