< prev index next >

src/cpu/ppc/vm/stubGenerator_ppc.cpp

Print this page

        

@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2018, SAP SE. All rights reserved.
+ * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.

@@ -1350,13 +1350,17 @@
     Register tmp1 = R6_ARG4;
     Register tmp2 = R7_ARG5;
     Register tmp3 = R8_ARG6;
     Register tmp4 = R9_ARG7;
 
+    VectorSRegister tmp_vsr1  = VSR1;
+    VectorSRegister tmp_vsr2  = VSR2;
+
     address start = __ function_entry();
 
-      Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
+    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
+
     // don't try anything fancy if arrays don't have many elements
     __ li(tmp3, 0);
     __ cmpwi(CCR0, R5_ARG3, 9);
     __ ble(CCR0, l_6); // copy 2 at a time
 

@@ -1410,10 +1414,12 @@
 
       __ srdi(tmp1, R5_ARG3, 4);
       __ andi_(R5_ARG3, R5_ARG3, 15);
       __ mtctr(tmp1);
 
+      if (!VM_Version::has_vsx()) {
+
       __ bind(l_8);
       // Use unrolled version for mass copying (copy 16 elements a time).
       // Load feeding store gets zero latency on Power6, however not on Power5.
       // Therefore, the following sequence is made for the good of both.
       __ ld(tmp1, 0, R3_ARG1);

@@ -1425,11 +1431,47 @@
       __ std(tmp3, 16, R4_ARG2);
       __ std(tmp4, 24, R4_ARG2);
       __ addi(R3_ARG1, R3_ARG1, 32);
       __ addi(R4_ARG2, R4_ARG2, 32);
       __ bdnz(l_8);
+
+      } else { // Processor supports VSX, so use it to mass copy.
+
+        // Prefetch src data into L2 cache.
+        __ dcbt(R3_ARG1, 0);
+
+        // If supported set DSCR pre-fetch to deepest.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+          __ mtdscr(tmp2);
+        }
+        __ li(tmp1, 16);
+
+        // Backbranch target aligned to 32-byte. It's not aligned 16-byte
+        // as loop contains < 8 instructions that fit inside a single
+        // i-cache sector.
+        __ align(32);
+
+        __ bind(l_9);
+        // Use loop with VSX load/store instructions to
+        // copy 16 elements a time.
+        __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
+        __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
+        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
+        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
+        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
+        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
+        __ bdnz(l_9);                        // Dec CTR and loop if not zero.
+
+        // Restore DSCR pre-fetch value.
+        if (VM_Version::has_mfdscr()) {
+          __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+          __ mtdscr(tmp2);
+        }
+
     }
+    } // FasterArrayCopy
     __ bind(l_6);
 
     // copy 2 elements at a time
     { // FasterArrayCopy
       __ cmpwi(CCR0, R5_ARG3, 2);
< prev index next >