// NOTE(review): this span is a side-by-side diff of HotSpot's copy_bytes_backward
// (x86_64 stub generator) collapsed into two text lines; the '|' separates the
// left (pre-fix) and right (post-fix) columns, and the 1422..1463 numbers are the
// original file's line numbers fused into the text. It is not directly compilable,
// and the function body is truncated (it ends mid-body at the addptr/"add(8) and
// sub(4)" line), so only annotations are added here.
//
// Concrete defect the diff captures: in the UseAVX > 2 path the LEFT column issues
// the 512-bit evmovdqul load/store at displacement 32
// (Address(from, qword_count, Address::times_8, 32)). But each iteration of this
// backward-copy loop handles the 64-byte window at displacements 0..63 relative to
// (base + qword_count*8): the AVX2 path below uses two 32-byte vmovdqu ops at
// displacements 32 and 0, and the SSE path uses four 16-byte movdqu ops at
// 48/32/16/0. A single 64-byte ZMM access must therefore start at displacement 0;
// the RIGHT column carries that fix (displacement 0). With displacement 32 the
// copy reads/writes a 64-byte window shifted past the intended chunk.
//
// Other grounded observations (both columns):
// - DEBUG_ONLY(__ stop(...)) enforces that callers jump to L_copy_bytes, never
//   fall into the loop body directly (L_copy_bytes does subptr(qword_count, 8)
//   then conditionally re-enters L_loop).
// - The UseAVX > 2 prologue loads 0xffff into 'to' and kmovql's it into mask
//   register k1 before the loop (k1 set up as an all-ones 16-lane mask).
// - 'to' is otherwise a pure scratch register, per the header comment.
1422 // qword_count - 64-bits element count 1423 // to - scratch 1424 // L_copy_bytes - entry label 1425 // L_copy_8_bytes - exit label 1426 // 1427 void copy_bytes_backward(Register from, Register dest, 1428 Register qword_count, Register to, 1429 Label& L_copy_bytes, Label& L_copy_8_bytes) { 1430 DEBUG_ONLY(__ stop("enter at entry label, not here")); 1431 Label L_loop; 1432 __ align(OptoLoopAlignment); 1433 if (UseUnalignedLoadStores) { 1434 Label L_end; 1435 if (UseAVX > 2) { 1436 __ movl(to, 0xffff); 1437 __ kmovql(k1, to); 1438 } 1439 // Copy 64-bytes per iteration 1440 __ BIND(L_loop); 1441 if (UseAVX > 2) { 1442 __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 32), Assembler::AVX_512bit); 1443 __ evmovdqul(Address(dest, qword_count, Address::times_8, 32), xmm0, Assembler::AVX_512bit); 1444 } else if (UseAVX == 2) { 1445 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); 1446 __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); 1447 __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); 1448 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); 1449 } else { 1450 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); 1451 __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); 1452 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); 1453 __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); 1454 __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); 1455 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); 1456 __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); 1457 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); 1458 } 1459 __ BIND(L_copy_bytes); 1460 __ subptr(qword_count, 8); 1461 __ jcc(Assembler::greaterEqual, L_loop); 1462 1463 __ addptr(qword_count, 4); // add(8) and sub(4) | 1422 // qword_count - 64-bits element count 1423 // to - scratch 1424 // L_copy_bytes - entry label 
1425 // L_copy_8_bytes - exit label 1426 // 1427 void copy_bytes_backward(Register from, Register dest, 1428 Register qword_count, Register to, 1429 Label& L_copy_bytes, Label& L_copy_8_bytes) { 1430 DEBUG_ONLY(__ stop("enter at entry label, not here")); 1431 Label L_loop; 1432 __ align(OptoLoopAlignment); 1433 if (UseUnalignedLoadStores) { 1434 Label L_end; 1435 if (UseAVX > 2) { 1436 __ movl(to, 0xffff); 1437 __ kmovql(k1, to); 1438 } 1439 // Copy 64-bytes per iteration 1440 __ BIND(L_loop); 1441 if (UseAVX > 2) { 1442 __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit); 1443 __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit); 1444 } else if (UseAVX == 2) { 1445 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); 1446 __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); 1447 __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); 1448 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); 1449 } else { 1450 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); 1451 __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); 1452 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); 1453 __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); 1454 __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); 1455 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); 1456 __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); 1457 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); 1458 } 1459 __ BIND(L_copy_bytes); 1460 __ subptr(qword_count, 8); 1461 __ jcc(Assembler::greaterEqual, L_loop); 1462 1463 __ addptr(qword_count, 4); // add(8) and sub(4) |