/*
 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012, 2014 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// ...
  // There are unaligned data accesses using integer load/store
  // instructions in this stub. POWER allows such accesses.
  //
  // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
  // Chapter 2: Effect of Operand Placement on Performance) unaligned
  // integer load/stores have good performance. Only unaligned
  // floating point load/stores can have poor performance.
  //
  // TODO:
  //
  // 1. check if aligning the backbranch target of loops is beneficial
  //
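  // Arguments of the generated stub (inferred from the register usage below):
  //   R3_ARG1: from  - source array address
  //   R4_ARG2: to    - destination array address
  //   R5_ARG3: count - number of 2-byte (short) elements to copy
  //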
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R9_ARG7;

    address start = __ function_entry();

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
    // don't try anything fancy if arrays don't have many elements
    __ li(tmp3, 0);
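    // tmp3 is zeroed here so it can serve as the zero index register for the
    // indexed stwx store in the alignment code below.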
    __ cmpwi(CCR0, R5_ARG3, 9);
    __ ble(CCR0, l_6); // copy 2 at a time

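    // If the caller could not guarantee alignment, try to give from and to a
    // common 4-byte alignment before using the wider copies.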
    if (!aligned) {
      __ xorr(tmp1, R3_ARG1, R4_ARG2);
      __ andi_(tmp1, tmp1, 3);
      __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy

      // At this point it is guaranteed that both from and to have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi_(tmp1, R3_ARG1, 3);
      __ beq(CCR0, l_2);

      __ lhz(tmp2, 0, R3_ARG1);
      __ addi(R3_ARG1, R3_ARG1, 2);
      __ sth(tmp2, 0, R4_ARG2);
      __ addi(R4_ARG2, R4_ARG2, 2);

      // ...
      __ stwx(tmp2, R4_ARG2, tmp3);
      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }
    }

    __ bind(l_7);

    // Copy 4 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.

    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 15);
      __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain

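      // CTR gets the number of 16-element iterations; the low 4 bits of
      // R5_ARG3 are kept as the remaining tail count.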
      __ srdi(tmp1, R5_ARG3, 4);
      __ andi_(R5_ARG3, R5_ARG3, 15);
      __ mtctr(tmp1);

      __ bind(l_8);
      // Use unrolled version for mass copying (copy 16 elements at a time).
      // A load feeding a store gets zero latency on Power6, but not on Power5.
      // Therefore, the following sequence is made for the good of both.
      __ ld(tmp1, 0, R3_ARG1);
      __ ld(tmp2, 8, R3_ARG1);
      __ ld(tmp3, 16, R3_ARG1);
      __ ld(tmp4, 24, R3_ARG1);
      __ std(tmp1, 0, R4_ARG2);
      __ std(tmp2, 8, R4_ARG2);
      __ std(tmp3, 16, R4_ARG2);
      __ std(tmp4, 24, R4_ARG2);
      __ addi(R3_ARG1, R3_ARG1, 32);
      __ addi(R4_ARG2, R4_ARG2, 32);
      __ bdnz(l_8);
    }
    __ bind(l_6);

    // copy 2 elements at a time
    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 2);
      __ blt(CCR0, l_1);
      __ srdi(tmp1, R5_ARG3, 1);
      __ andi_(R5_ARG3, R5_ARG3, 1);

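      // Bias both pointers by -4 so the update forms lwzu/stwu can
      // pre-increment by 4 on each iteration.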
      __ addi(R3_ARG1, R3_ARG1, -4);
      __ addi(R4_ARG2, R4_ARG2, -4);
      __ mtctr(tmp1);

      __ bind(l_3);
      __ lwzu(tmp2, 4, R3_ARG1);
      __ stwu(tmp2, 4, R4_ARG2);
      __ bdnz(l_3);

      __ addi(R3_ARG1, R3_ARG1, 4);
      __ addi(R4_ARG2, R4_ARG2, 4);

      // ...


// The same stub in a later revision of the file, where a VSX-based copy loop
// was added for processors that support it:

/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012, 2018 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// ...
  // There are unaligned data accesses using integer load/store
  // instructions in this stub. POWER allows such accesses.
  //
  // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
  // Chapter 2: Effect of Operand Placement on Performance) unaligned
  // integer load/stores have good performance. Only unaligned
  // floating point load/stores can have poor performance.
  //
  // TODO:
  //
  // 1. check if aligning the backbranch target of loops is beneficial
  //
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R9_ARG7;

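    // VSX scratch registers for the vector copy loop used on processors that
    // support VSX (see below).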
    VectorSRegister tmp_vsr1 = VSR1;
    VectorSRegister tmp_vsr2 = VSR2;

    address start = __ function_entry();

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
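    // l_9 is the backbranch target of the VSX copy loop introduced below.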

    // don't try anything fancy if arrays don't have many elements
    __ li(tmp3, 0);
    __ cmpwi(CCR0, R5_ARG3, 9);
    __ ble(CCR0, l_6); // copy 2 at a time

    if (!aligned) {
      __ xorr(tmp1, R3_ARG1, R4_ARG2);
      __ andi_(tmp1, tmp1, 3);
      __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy

      // At this point it is guaranteed that both from and to have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi_(tmp1, R3_ARG1, 3);
      __ beq(CCR0, l_2);

      __ lhz(tmp2, 0, R3_ARG1);
      __ addi(R3_ARG1, R3_ARG1, 2);
      __ sth(tmp2, 0, R4_ARG2);
      __ addi(R4_ARG2, R4_ARG2, 2);

      // ...

      __ stwx(tmp2, R4_ARG2, tmp3);
      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }
    }

    __ bind(l_7);

    // Copy 4 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.

    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 15);
      __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain

      __ srdi(tmp1, R5_ARG3, 4);
      __ andi_(R5_ARG3, R5_ARG3, 15);
      __ mtctr(tmp1);

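      // CTR now holds the number of 16-element (32-byte) iterations and the
      // tail count (count mod 16) stays in R5_ARG3; pick the scalar or the
      // VSX copy loop.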
      if (!VM_Version::has_vsx()) {

        __ bind(l_8);
        // Use unrolled version for mass copying (copy 16 elements at a time).
        // A load feeding a store gets zero latency on Power6, but not on Power5.
        // Therefore, the following sequence is made for the good of both.
        __ ld(tmp1, 0, R3_ARG1);
        __ ld(tmp2, 8, R3_ARG1);
        __ ld(tmp3, 16, R3_ARG1);
        __ ld(tmp4, 24, R3_ARG1);
        __ std(tmp1, 0, R4_ARG2);
        __ std(tmp2, 8, R4_ARG2);
        __ std(tmp3, 16, R4_ARG2);
        __ std(tmp4, 24, R4_ARG2);
        __ addi(R3_ARG1, R3_ARG1, 32);
        __ addi(R4_ARG2, R4_ARG2, 32);
        __ bdnz(l_8);

      } else { // Processor supports VSX, so use it to mass copy.

        // Prefetch src data into L2 cache.
        __ dcbt(R3_ARG1, 0);

        // If supported, set the DSCR prefetch depth to deepest.
        if (VM_Version::has_mfdscr()) {
          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
          __ mtdscr(tmp2);
        }
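        // tmp1 holds the fixed byte offset (16) used by the second
        // lxvd2x/stxvd2x pair in the loop below.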
        __ li(tmp1, 16);

        // Backbranch target aligned to 32 bytes. It is not merely 16-byte
        // aligned, as the loop contains fewer than 8 instructions that fit
        // inside a single i-cache sector.
        __ align(32);

        __ bind(l_9);
        // Use a loop with VSX load/store instructions to
        // copy 16 elements at a time.
        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src += 32.
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst += 32.
        __ bdnz(l_9);                        // Decrement CTR and loop if not zero.

        // Restore the DSCR prefetch value.
        if (VM_Version::has_mfdscr()) {
          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
          __ mtdscr(tmp2);
        }

      }
    } // FasterArrayCopy
    __ bind(l_6);

    // copy 2 elements at a time
    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 2);
      __ blt(CCR0, l_1);
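      // tmp1 = count / 2 word copies; bit 0 of R5_ARG3 flags a trailing odd
      // element.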
      __ srdi(tmp1, R5_ARG3, 1);
      __ andi_(R5_ARG3, R5_ARG3, 1);

      __ addi(R3_ARG1, R3_ARG1, -4);
      __ addi(R4_ARG2, R4_ARG2, -4);
      __ mtctr(tmp1);

      __ bind(l_3);
      __ lwzu(tmp2, 4, R3_ARG1);
      __ stwu(tmp2, 4, R4_ARG2);
      __ bdnz(l_3);

      __ addi(R3_ARG1, R3_ARG1, 4);
      __ addi(R4_ARG2, R4_ARG2, 4);