/*
 * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012, 2014 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// ...
  // There are unaligned data accesses using integer load/store
  // instructions in this stub. POWER allows such accesses.
  //
  // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
  // Chapter 2: Effect of Operand Placement on Performance) unaligned
  // integer load/stores have good performance. Only unaligned
  // floating point load/stores can have poor performance.
  //
  // TODO:
  //
  // 1. check if aligning the backbranch target of loops is beneficial
  //
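  // Arguments of the generated stub (inferred from the register usage below):
  //   R3_ARG1: from  - source array address
  //   R4_ARG2: to    - destination array address
  //   R5_ARG3: count - number of 2-byte (short) elements to copy
  //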
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R9_ARG7;

    address start = __ function_entry();

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
    // don't try anything fancy if arrays don't have many elements
    __ li(tmp3, 0);
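    // tmp3 is zeroed here so it can serve as the zero index register for the
    // indexed stwx store in the alignment code below.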
    __ cmpwi(CCR0, R5_ARG3, 9);
    __ ble(CCR0, l_6); // copy 2 at a time

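    // If the caller could not guarantee alignment, try to give from and to a
    // common 4-byte alignment before using the wider copies.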
    if (!aligned) {
      __ xorr(tmp1, R3_ARG1, R4_ARG2);
      __ andi_(tmp1, tmp1, 3);
      __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy

      // At this point it is guaranteed that both from and to have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi_(tmp1, R3_ARG1, 3);
      __ beq(CCR0, l_2);

      __ lhz(tmp2, 0, R3_ARG1);
      __ addi(R3_ARG1, R3_ARG1, 2);
      __ sth(tmp2, 0, R4_ARG2);
      __ addi(R4_ARG2, R4_ARG2, 2);

      // ...
      __ stwx(tmp2, R4_ARG2, tmp3);
      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }
    }

    __ bind(l_7);

    // Copy 4 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.

    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 15);
      __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain

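      // CTR gets the number of 16-element iterations; the low 4 bits of
      // R5_ARG3 are kept as the remaining tail count.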
      __ srdi(tmp1, R5_ARG3, 4);
      __ andi_(R5_ARG3, R5_ARG3, 15);
      __ mtctr(tmp1);

      __ bind(l_8);
      // Use unrolled version for mass copying (copy 16 elements at a time).
      // A load feeding a store gets zero latency on Power6, but not on Power5.
      // Therefore, the following sequence is made for the good of both.
      __ ld(tmp1, 0, R3_ARG1);
      __ ld(tmp2, 8, R3_ARG1);
      __ ld(tmp3, 16, R3_ARG1);
      __ ld(tmp4, 24, R3_ARG1);
      __ std(tmp1, 0, R4_ARG2);
      __ std(tmp2, 8, R4_ARG2);
      __ std(tmp3, 16, R4_ARG2);
      __ std(tmp4, 24, R4_ARG2);
      __ addi(R3_ARG1, R3_ARG1, 32);
      __ addi(R4_ARG2, R4_ARG2, 32);
      __ bdnz(l_8);
    }
    __ bind(l_6);

    // copy 2 elements at a time
    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 2);
      __ blt(CCR0, l_1);
      __ srdi(tmp1, R5_ARG3, 1);
      __ andi_(R5_ARG3, R5_ARG3, 1);

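      // Bias both pointers by -4 so the update forms lwzu/stwu can
      // pre-increment by 4 on each iteration.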
      __ addi(R3_ARG1, R3_ARG1, -4);
      __ addi(R4_ARG2, R4_ARG2, -4);
      __ mtctr(tmp1);

      __ bind(l_3);
      __ lwzu(tmp2, 4, R3_ARG1);
      __ stwu(tmp2, 4, R4_ARG2);
      __ bdnz(l_3);

      __ addi(R3_ARG1, R3_ARG1, 4);
      __ addi(R4_ARG2, R4_ARG2, 4);

      // ...


// The same stub in a later revision of the file, where a VSX-based copy loop
// was added for processors that support it:

/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012, 2018 SAP AG. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// ...
  // There are unaligned data accesses using integer load/store
  // instructions in this stub. POWER allows such accesses.
  //
  // According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
  // Chapter 2: Effect of Operand Placement on Performance) unaligned
  // integer load/stores have good performance. Only unaligned
  // floating point load/stores can have poor performance.
  //
  // TODO:
  //
  // 1. check if aligning the backbranch target of loops is beneficial
  //
  address generate_disjoint_short_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R9_ARG7;

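    // VSX scratch registers for the vector copy loop used on processors that
    // support VSX (see below).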
    VectorSRegister tmp_vsr1 = VSR1;
    VectorSRegister tmp_vsr2 = VSR2;

    address start = __ function_entry();

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
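    // l_9 is the backbranch target of the VSX copy loop introduced below.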

    // don't try anything fancy if arrays don't have many elements
    __ li(tmp3, 0);
    __ cmpwi(CCR0, R5_ARG3, 9);
    __ ble(CCR0, l_6); // copy 2 at a time

    if (!aligned) {
      __ xorr(tmp1, R3_ARG1, R4_ARG2);
      __ andi_(tmp1, tmp1, 3);
      __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy

      // At this point it is guaranteed that both from and to have the same alignment mod 4.

      // Copy 1 element if necessary to align to 4 bytes.
      __ andi_(tmp1, R3_ARG1, 3);
      __ beq(CCR0, l_2);

      __ lhz(tmp2, 0, R3_ARG1);
      __ addi(R3_ARG1, R3_ARG1, 2);
      __ sth(tmp2, 0, R4_ARG2);
      __ addi(R4_ARG2, R4_ARG2, 2);

      // ...

      __ stwx(tmp2, R4_ARG2, tmp3);
      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }
    }

    __ bind(l_7);

    // Copy 4 elements at a time; either the loads or the stores can
    // be unaligned if aligned == false.

    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 15);
      __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain

      __ srdi(tmp1, R5_ARG3, 4);
      __ andi_(R5_ARG3, R5_ARG3, 15);
      __ mtctr(tmp1);

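      // CTR now holds the number of 16-element (32-byte) iterations and the
      // tail count (count mod 16) stays in R5_ARG3; pick the scalar or the
      // VSX copy loop.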
      if (!VM_Version::has_vsx()) {

        __ bind(l_8);
        // Use unrolled version for mass copying (copy 16 elements at a time).
        // A load feeding a store gets zero latency on Power6, but not on Power5.
        // Therefore, the following sequence is made for the good of both.
        __ ld(tmp1, 0, R3_ARG1);
        __ ld(tmp2, 8, R3_ARG1);
        __ ld(tmp3, 16, R3_ARG1);
        __ ld(tmp4, 24, R3_ARG1);
        __ std(tmp1, 0, R4_ARG2);
        __ std(tmp2, 8, R4_ARG2);
        __ std(tmp3, 16, R4_ARG2);
        __ std(tmp4, 24, R4_ARG2);
        __ addi(R3_ARG1, R3_ARG1, 32);
        __ addi(R4_ARG2, R4_ARG2, 32);
        __ bdnz(l_8);

      } else { // Processor supports VSX, so use it to mass copy.

        // Prefetch src data into L2 cache.
        __ dcbt(R3_ARG1, 0);

        // If supported, set the DSCR prefetch depth to deepest.
        if (VM_Version::has_mfdscr()) {
          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
          __ mtdscr(tmp2);
        }
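        // tmp1 holds the fixed byte offset (16) used by the second
        // lxvd2x/stxvd2x pair in the loop below.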
        __ li(tmp1, 16);

        // Backbranch target aligned to 32 bytes. It is not merely 16-byte
        // aligned, as the loop contains fewer than 8 instructions that fit
        // inside a single i-cache sector.
        __ align(32);

        __ bind(l_9);
        // Use a loop with VSX load/store instructions to
        // copy 16 elements at a time.
        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src += 32.
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst += 32.
        __ bdnz(l_9);                        // Decrement CTR and loop if not zero.

        // Restore the DSCR prefetch value.
        if (VM_Version::has_mfdscr()) {
          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
          __ mtdscr(tmp2);
        }

      }
    } // FasterArrayCopy
    __ bind(l_6);

    // copy 2 elements at a time
    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 2);
      __ blt(CCR0, l_1);
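      // tmp1 = count / 2 word copies; bit 0 of R5_ARG3 flags a trailing odd
      // element.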
      __ srdi(tmp1, R5_ARG3, 1);
      __ andi_(R5_ARG3, R5_ARG3, 1);

      __ addi(R3_ARG1, R3_ARG1, -4);
      __ addi(R4_ARG2, R4_ARG2, -4);
      __ mtctr(tmp1);

      __ bind(l_3);
      __ lwzu(tmp2, 4, R3_ARG1);
      __ stwu(tmp2, 4, R4_ARG2);
      __ bdnz(l_3);

      __ addi(R3_ARG1, R3_ARG1, 4);
      __ addi(R4_ARG2, R4_ARG2, 4);