
src/cpu/ppc/vm/stubGenerator_ppc.cpp

rev 8845 : Apply 8154156 for VSX support, cherry-picking from 8077838, 8080684, and 8149655

*** 1,8 ****
  /*
!  * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
!  * Copyright 2012, 2014 SAP AG. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
--- 1,8 ----
  /*
!  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
!  * Copyright 2012, 2018 SAP AG. All rights reserved.
   * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   *
   * This code is free software; you can redistribute it and/or modify it
   * under the terms of the GNU General Public License version 2 only, as
   * published by the Free Software Foundation.
*** 1350,1362 ****
      Register tmp1 = R6_ARG4;
      Register tmp2 = R7_ARG5;
      Register tmp3 = R8_ARG6;
      Register tmp4 = R9_ARG7;
  
      address start = __ function_entry();
  
!     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
      // don't try anything fancy if arrays don't have many elements
      __ li(tmp3, 0);
      __ cmpwi(CCR0, R5_ARG3, 9);
      __ ble(CCR0, l_6); // copy 2 at a time
--- 1350,1366 ----
      Register tmp1 = R6_ARG4;
      Register tmp2 = R7_ARG5;
      Register tmp3 = R8_ARG6;
      Register tmp4 = R9_ARG7;
+     VectorSRegister tmp_vsr1 = VSR1;
+     VectorSRegister tmp_vsr2 = VSR2;
  
      address start = __ function_entry();
  
!     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
!
      // don't try anything fancy if arrays don't have many elements
      __ li(tmp3, 0);
      __ cmpwi(CCR0, R5_ARG3, 9);
      __ ble(CCR0, l_6); // copy 2 at a time
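Note on the hunk above: it reserves two VSX scratch registers (VSR1/VSR2) and an extra label l_9 for the vector copy loop added further down. For orientation, the following standalone C++ sketch (not HotSpot code; the function name is hypothetical, and the real stub additionally handles alignment and a 2-elements-at-a-time path) mirrors the overall copy strategy the generated assembly implements for jshort arrays:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Hypothetical sketch of the stub's strategy, not the stub itself.
    static void disjoint_short_copy_sketch(const int16_t* src, int16_t* dst, size_t n) {
      if (n <= 9) {                      // mirrors: cmpwi(CCR0, R5_ARG3, 9); ble(CCR0, l_6)
        for (size_t i = 0; i < n; ++i) dst[i] = src[i];
        return;
      }
      size_t chunks = n / 16;            // mirrors: srdi(tmp1, R5_ARG3, 4)
      size_t tail   = n % 16;            // mirrors: andi_(R5_ARG3, R5_ARG3, 15)
      for (size_t c = 0; c < chunks; ++c) {
        std::memcpy(dst, src, 32);       // 16 jshorts = 32 bytes per iteration
        src += 16;
        dst += 16;
      }
      for (size_t i = 0; i < tail; ++i) dst[i] = src[i];
    }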
*** 1410,1419 ****
--- 1414,1425 ----
      __ srdi(tmp1, R5_ARG3, 4);
      __ andi_(R5_ARG3, R5_ARG3, 15);
      __ mtctr(tmp1);
  
+     if (!VM_Version::has_vsx()) {
+
      __ bind(l_8);
      // Use unrolled version for mass copying (copy 16 elements at a time).
      // Load feeding store gets zero latency on Power6, but not on Power5.
      // Therefore, the following sequence is made for the good of both.
      __ ld(tmp1, 0, R3_ARG1);
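The unrolled scalar path issues all four 8-byte loads before the four stores, which, per the comment, serves both Power5 (where a load feeding a store is not zero-latency) and Power6. A minimal standalone C++ sketch of one 32-byte iteration follows; the ld/std pairs beyond the first appear as context in the next hunk, the rest are assumed, and the function name is hypothetical:

    #include <cstdint>
    #include <cstring>

    // One iteration of the unrolled scalar copy: 4 loads, then 4 stores.
    static inline void copy_32_bytes_scalar(const uint8_t*& src, uint8_t*& dst) {
      uint64_t t1, t2, t3, t4;
      std::memcpy(&t1, src +  0, 8);  // __ ld(tmp1,  0, R3_ARG1);
      std::memcpy(&t2, src +  8, 8);  // __ ld(tmp2,  8, R3_ARG1);  (assumed)
      std::memcpy(&t3, src + 16, 8);  // __ ld(tmp3, 16, R3_ARG1);  (assumed)
      std::memcpy(&t4, src + 24, 8);  // __ ld(tmp4, 24, R3_ARG1);  (assumed)
      std::memcpy(dst +  0, &t1, 8);  // __ std(tmp1,  0, R4_ARG2); (assumed)
      std::memcpy(dst +  8, &t2, 8);  // __ std(tmp2,  8, R4_ARG2); (assumed)
      std::memcpy(dst + 16, &t3, 8);  // __ std(tmp3, 16, R4_ARG2);
      std::memcpy(dst + 24, &t4, 8);  // __ std(tmp4, 24, R4_ARG2);
      src += 32;                      // __ addi(R3_ARG1, R3_ARG1, 32);
      dst += 32;                      // __ addi(R4_ARG2, R4_ARG2, 32);
    }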
*** 1425,1435 ****
--- 1431,1477 ----
      __ std(tmp3, 16, R4_ARG2);
      __ std(tmp4, 24, R4_ARG2);
      __ addi(R3_ARG1, R3_ARG1, 32);
      __ addi(R4_ARG2, R4_ARG2, 32);
      __ bdnz(l_8);
+
+     } else { // Processor supports VSX, so use it to mass copy.
+
+       // Prefetch src data into L2 cache.
+       __ dcbt(R3_ARG1, 0);
+
+       // If supported, set DSCR pre-fetch to deepest.
+       if (VM_Version::has_mfdscr()) {
+         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+         __ mtdscr(tmp2);
+       }
+       __ li(tmp1, 16);
+
+       // Backbranch target aligned to 32 bytes. It's not aligned to 16 bytes,
+       // as the loop contains < 8 instructions that fit inside a single
+       // i-cache sector.
+       __ align(32);
+
+       __ bind(l_9);
+       // Use a loop with VSX load/store instructions to
+       // copy 16 elements at a time.
+       __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
+       __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
+       __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
+       __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
+       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src += 32.
+       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst += 32.
+       __ bdnz(l_9);                        // Dec CTR and loop if not zero.
+
+       // Restore DSCR pre-fetch value.
+       if (VM_Version::has_mfdscr()) {
+         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+         __ mtdscr(tmp2);
+       }
+     }
+
    } // FasterArrayCopy
  
    __ bind(l_6);
  
    // copy 2 elements at a time
    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 2);
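At the intrinsics level, the VSX loop above corresponds roughly to the sketch below. It assumes a POWER compiler with <altivec.h> and VSX enabled (e.g. gcc -mvsx); vec_xl/vec_xst lower to VSX vector loads/stores of the lxvd2x/stxvd2x class on such targets, and the function name is hypothetical. The dcbt prefetch and the DSCR manipulation (data stream prefetch depth) have no portable C++ equivalent and are omitted:

    #include <altivec.h>
    #include <cstddef>

    // Copies chunks * 32 bytes; mirrors the l_9 loop driven by bdnz (CTR).
    static void copy_chunks_vsx(const unsigned char* src, unsigned char* dst, size_t chunks) {
      for (size_t c = 0; c < chunks; ++c) {
        vector unsigned char v0 = vec_xl( 0, src); // __ lxvd2x(tmp_vsr1, 0, R3_ARG1)
        vec_xst(v0,  0, dst);                      // __ stxvd2x(tmp_vsr1, 0, R4_ARG2)
        vector unsigned char v1 = vec_xl(16, src); // __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1), tmp1 == 16
        vec_xst(v1, 16, dst);                      // __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1)
        src += 32;                                 // __ addi(R3_ARG1, R3_ARG1, 32)
        dst += 32;                                 // __ addi(R4_ARG2, R4_ARG2, 32)
      }
    }

Two 16-byte load/store pairs per iteration keep the loop body under 8 instructions, which is why the backbranch target is aligned to 32 bytes rather than 16 in the generated code.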