
src/cpu/ppc/vm/stubGenerator_ppc.cpp

rev 8845 : Apply 8154156 for VSX support, cherry-picking from 8077838, 8080684, and 8149655

*** 1,8 ****
  /*
!  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
!  * Copyright 2012, 2014 SAP AG. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
--- 1,8 ----
  /*
!  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
!  * Copyright 2012, 2018 SAP AG. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
*** 1350,1362 ****
      Register tmp1 = R6_ARG4;
      Register tmp2 = R7_ARG5;
      Register tmp3 = R8_ARG6;
      Register tmp4 = R9_ARG7;
  
      address start = __ function_entry();
  
!     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
      // don't try anything fancy if arrays don't have many elements
      __ li(tmp3, 0);
      __ cmpwi(CCR0, R5_ARG3, 9);
      __ ble(CCR0, l_6); // copy 2 at a time
--- 1350,1366 ----
      Register tmp1 = R6_ARG4;
      Register tmp2 = R7_ARG5;
      Register tmp3 = R8_ARG6;
      Register tmp4 = R9_ARG7;
+     VectorSRegister tmp_vsr1 = VSR1;
+     VectorSRegister tmp_vsr2 = VSR2;
  
      address start = __ function_entry();
  
!     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
!
      // don't try anything fancy if arrays don't have many elements
      __ li(tmp3, 0);
      __ cmpwi(CCR0, R5_ARG3, 9);
      __ ble(CCR0, l_6); // copy 2 at a time
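Note on the hunk above: it reserves two VSX scratch registers (VSR1/VSR2) and an extra label l_9 for the vector copy loop added further down. For orientation, the following standalone C++ sketch (not HotSpot code; the function name is hypothetical, and the real stub additionally handles alignment and a 2-elements-at-a-time path) mirrors the overall copy strategy the generated assembly implements for jshort arrays:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Hypothetical sketch of the stub's strategy, not the stub itself.
    static void disjoint_short_copy_sketch(const int16_t* src, int16_t* dst, size_t n) {
      if (n <= 9) {                      // mirrors: cmpwi(CCR0, R5_ARG3, 9); ble(CCR0, l_6)
        for (size_t i = 0; i < n; ++i) dst[i] = src[i];
        return;
      }
      size_t chunks = n / 16;            // mirrors: srdi(tmp1, R5_ARG3, 4)
      size_t tail   = n % 16;            // mirrors: andi_(R5_ARG3, R5_ARG3, 15)
      for (size_t c = 0; c < chunks; ++c) {
        std::memcpy(dst, src, 32);       // 16 jshorts = 32 bytes per iteration
        src += 16;
        dst += 16;
      }
      for (size_t i = 0; i < tail; ++i) dst[i] = src[i];
    }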
*** 1410,1419 ****
--- 1414,1425 ----
      __ srdi(tmp1, R5_ARG3, 4);
      __ andi_(R5_ARG3, R5_ARG3, 15);
      __ mtctr(tmp1);
  
+     if (!VM_Version::has_vsx()) {
+
      __ bind(l_8);
      // Use unrolled version for mass copying (copy 16 elements at a time).
      // Load feeding store gets zero latency on Power6, but not on Power5.
      // Therefore, the following sequence is made for the good of both.
      __ ld(tmp1, 0, R3_ARG1);
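The unrolled scalar path issues all four 8-byte loads before the four stores, which, per the comment, serves both Power5 (where a load feeding a store is not zero-latency) and Power6. A minimal standalone C++ sketch of one 32-byte iteration follows; the ld/std pairs beyond the first appear as context in the next hunk, the rest are assumed, and the function name is hypothetical:

    #include <cstdint>
    #include <cstring>

    // One iteration of the unrolled scalar copy: 4 loads, then 4 stores.
    static inline void copy_32_bytes_scalar(const uint8_t*& src, uint8_t*& dst) {
      uint64_t t1, t2, t3, t4;
      std::memcpy(&t1, src +  0, 8);  // __ ld(tmp1,  0, R3_ARG1);
      std::memcpy(&t2, src +  8, 8);  // __ ld(tmp2,  8, R3_ARG1);  (assumed)
      std::memcpy(&t3, src + 16, 8);  // __ ld(tmp3, 16, R3_ARG1);  (assumed)
      std::memcpy(&t4, src + 24, 8);  // __ ld(tmp4, 24, R3_ARG1);  (assumed)
      std::memcpy(dst +  0, &t1, 8);  // __ std(tmp1,  0, R4_ARG2); (assumed)
      std::memcpy(dst +  8, &t2, 8);  // __ std(tmp2,  8, R4_ARG2); (assumed)
      std::memcpy(dst + 16, &t3, 8);  // __ std(tmp3, 16, R4_ARG2);
      std::memcpy(dst + 24, &t4, 8);  // __ std(tmp4, 24, R4_ARG2);
      src += 32;                      // __ addi(R3_ARG1, R3_ARG1, 32);
      dst += 32;                      // __ addi(R4_ARG2, R4_ARG2, 32);
    }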
*** 1425,1435 ****
--- 1431,1477 ----
      __ std(tmp3, 16, R4_ARG2);
      __ std(tmp4, 24, R4_ARG2);
      __ addi(R3_ARG1, R3_ARG1, 32);
      __ addi(R4_ARG2, R4_ARG2, 32);
      __ bdnz(l_8);
+
+     } else { // Processor supports VSX, so use it to mass copy.
+
+       // Prefetch src data into L2 cache.
+       __ dcbt(R3_ARG1, 0);
+
+       // If supported, set DSCR pre-fetch to deepest.
+       if (VM_Version::has_mfdscr()) {
+         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+         __ mtdscr(tmp2);
+       }
+       __ li(tmp1, 16);
+
+       // Backbranch target aligned to 32 bytes. It's not aligned to 16 bytes,
+       // as the loop contains < 8 instructions that fit inside a single
+       // i-cache sector.
+       __ align(32);
+
+       __ bind(l_9);
+       // Use a loop with VSX load/store instructions to
+       // copy 16 elements at a time.
+       __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
+       __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
+       __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
+       __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
+       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src += 32.
+       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst += 32.
+       __ bdnz(l_9);                        // Dec CTR and loop if not zero.
+
+       // Restore DSCR pre-fetch value.
+       if (VM_Version::has_mfdscr()) {
+         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+         __ mtdscr(tmp2);
+       }
+     }
+
    } // FasterArrayCopy
  
    __ bind(l_6);
  
    // copy 2 elements at a time
    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 2);
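At the intrinsics level, the VSX loop above corresponds roughly to the sketch below. It assumes a POWER compiler with <altivec.h> and VSX enabled (e.g. gcc -mvsx); vec_xl/vec_xst lower to VSX vector loads/stores of the lxvd2x/stxvd2x class on such targets, and the function name is hypothetical. The dcbt prefetch and the DSCR manipulation (data stream prefetch depth) have no portable C++ equivalent and are omitted:

    #include <altivec.h>
    #include <cstddef>

    // Copies chunks * 32 bytes; mirrors the l_9 loop driven by bdnz (CTR).
    static void copy_chunks_vsx(const unsigned char* src, unsigned char* dst, size_t chunks) {
      for (size_t c = 0; c < chunks; ++c) {
        vector unsigned char v0 = vec_xl( 0, src); // __ lxvd2x(tmp_vsr1, 0, R3_ARG1)
        vec_xst(v0,  0, dst);                      // __ stxvd2x(tmp_vsr1, 0, R4_ARG2)
        vector unsigned char v1 = vec_xl(16, src); // __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1), tmp1 == 16
        vec_xst(v1, 16, dst);                      // __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1)
        src += 32;                                 // __ addi(R3_ARG1, R3_ARG1, 32)
        dst += 32;                                 // __ addi(R4_ARG2, R4_ARG2, 32)
      }
    }

Two 16-byte load/store pairs per iteration keep the loop body under 8 instructions, which is why the backbranch target is aligned to 32 bytes rather than 16 in the generated code.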