src/cpu/ppc/vm/stubGenerator_ppc.cpp

*** 1218,1229 ****
        __ align(32);

        __ bind(l_10);
        // Use loop with VSX load/store instructions to
        // copy 32 elements a time.
!       __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
!       __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
        __ bdnz(l_10);                       // Dec CTR and loop if not zero.
--- 1218,1229 ----
        __ align(32);

        __ bind(l_10);
        // Use loop with VSX load/store instructions to
        // copy 32 elements a time.
!       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
!       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
        __ bdnz(l_10);                       // Dec CTR and loop if not zero.
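Note on the two-operand calls above: in the PowerPC indexed (X-form) loads and stores the effective address is (RA|0) + RB, i.e. an RA field of zero means the literal value 0 rather than GPR0, so dropping the explicit 0 operand leaves the generated instruction unchanged and only simplifies the call site. The sketch below is illustrative only, not taken from the HotSpot assembler sources; the helper names and the parameterized extended opcode are assumptions.

    #include <cstdint>

    // Illustrative encoder for a VSX X-form (XX1) load/store such as lxvd2x/stxvd2x.
    // Bit layout (ISA MSB-first numbering mapped to shifts in a uint32_t):
    //   0:5 primary opcode 31, 6:10 T, 11:15 RA, 16:20 RB, 21:30 XO, 31 TX.
    // The extended opcode 'xo' is left as a parameter rather than hard-coding
    // the lxvd2x/stxvd2x values.
    static inline uint32_t vsx_xform(uint32_t xo, uint32_t vsr, uint32_t ra, uint32_t rb) {
      uint32_t t  = vsr & 31;        // low five bits of the VSX register number
      uint32_t tx = (vsr >> 5) & 1;  // sixth bit lands in the TX field
      return (31u << 26) | (t << 21) | ((ra & 31) << 16) | ((rb & 31) << 11) |
             ((xo & 0x3ff) << 1) | tx;
    }

    // Three-operand shape, lxvd2x(vsr, ra, rb): EA = (RA|0) + RB.
    static inline uint32_t vsx_indexed(uint32_t xo, uint32_t vsr, uint32_t ra, uint32_t rb) {
      return vsx_xform(xo, vsr, ra, rb);
    }

    // Two-operand shape, lxvd2x(vsr, rb): identical encoding with RA = 0, so EA = RB.
    static inline uint32_t vsx_base_only(uint32_t xo, uint32_t vsr, uint32_t rb) {
      return vsx_xform(xo, vsr, /*ra=*/0, rb);
    }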
*** 1484,1495 ****
        __ align(32);

        __ bind(l_9);
        // Use loop with VSX load/store instructions to
        // copy 16 elements a time.
!       __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load from src.
!       __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst.
        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
        __ bdnz(l_9);                        // Dec CTR and loop if not zero.
--- 1484,1495 ----
        __ align(32);

        __ bind(l_9);
        // Use loop with VSX load/store instructions to
        // copy 16 elements a time.
!       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
!       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
        __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
        __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
        __ bdnz(l_9);                        // Dec CTR and loop if not zero.
*** 1675,1686 ****
        __ align(32);

        __ bind(l_7);
        // Use loop with VSX load/store instructions to
        // copy 8 elements a time.
!       __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
!       __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
        __ bdnz(l_7);                        // Dec CTR and loop if not zero.
--- 1675,1686 ----
        __ align(32);

        __ bind(l_7);
        // Use loop with VSX load/store instructions to
        // copy 8 elements a time.
!       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
!       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
        __ bdnz(l_7);                        // Dec CTR and loop if not zero.
*** 1743,1775 ****
  //
  void generate_conjoint_int_copy_core(bool aligned) {
    // Do reverse copy. We assume the case of actual overlap is rare enough
    // that we don't have to optimize it.

!   Label l_1, l_2, l_3, l_4, l_5, l_6;

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R0;

    { // FasterArrayCopy
      __ cmpwi(CCR0, R5_ARG3, 0);
      __ beq(CCR0, l_6);

      __ sldi(R5_ARG3, R5_ARG3, 2);
      __ add(R3_ARG1, R3_ARG1, R5_ARG3);
      __ add(R4_ARG2, R4_ARG2, R5_ARG3);
      __ srdi(R5_ARG3, R5_ARG3, 2);

      __ cmpwi(CCR0, R5_ARG3, 7);
      __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain

      __ srdi(tmp1, R5_ARG3, 3);
      __ andi(R5_ARG3, R5_ARG3, 7);
      __ mtctr(tmp1);

      __ bind(l_4);
      // Use unrolled version for mass copying (copy 4 elements a time).
      // Load feeding store gets zero latency on Power6, however not on Power5.
      // Therefore, the following sequence is made for the good of both.
      __ addi(R3_ARG1, R3_ARG1, -32);
--- 1743,1799 ----
  //
  void generate_conjoint_int_copy_core(bool aligned) {
    // Do reverse copy. We assume the case of actual overlap is rare enough
    // that we don't have to optimize it.

!   Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;

    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R0;

+   VectorSRegister tmp_vsr1 = VSR1;
+   VectorSRegister tmp_vsr2 = VSR2;
+
    { // FasterArrayCopy
+     __ li(tmp3, 0);
      __ cmpwi(CCR0, R5_ARG3, 0);
      __ beq(CCR0, l_6);

      __ sldi(R5_ARG3, R5_ARG3, 2);
      __ add(R3_ARG1, R3_ARG1, R5_ARG3);
      __ add(R4_ARG2, R4_ARG2, R5_ARG3);
      __ srdi(R5_ARG3, R5_ARG3, 2);

+     if (!aligned) {
+       // check if arrays have same alignment mod 8.
+       __ xorr(tmp1, R3_ARG1, R4_ARG2);
+       __ andi_(R0, tmp1, 7);
+       // Not the same alignment, but ld and std just need to be 4 byte aligned.
+       __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
+
+       // copy 1 element to align to and from on an 8 byte boundary
+       __ andi_(R0, R3_ARG1, 7);
+       __ beq(CCR0, l_7);
+
+       __ addi(R3_ARG1, R3_ARG1, -4);
+       __ addi(R4_ARG2, R4_ARG2, -4);
+       __ addi(R5_ARG3, R5_ARG3, -1);
+       __ lwzx(tmp2, R3_ARG1, tmp3);
+       __ stwx(tmp2, R4_ARG2, tmp3);
+       __ bind(l_7);
+     }
+
      __ cmpwi(CCR0, R5_ARG3, 7);
      __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain

      __ srdi(tmp1, R5_ARG3, 3);
      __ andi(R5_ARG3, R5_ARG3, 7);
      __ mtctr(tmp1);

+     if (!VM_Version::has_vsx()) {
      __ bind(l_4);
      // Use unrolled version for mass copying (copy 4 elements a time).
      // Load feeding store gets zero latency on Power6, however not on Power5.
      // Therefore, the following sequence is made for the good of both.
      __ addi(R3_ARG1, R3_ARG1, -32);
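The new !aligned prologue in this hunk first tests whether source and destination end pointers agree modulo 8 (xorr plus andi_); if they do but are not yet 8-byte aligned, a single 4-byte element is copied, stepping the pointers back by 4 since this is a reverse copy, so that the following 8-bytes-at-a-time and VSX paths run on aligned addresses. A minimal C-level sketch of that logic follows, with illustrative names that do not appear in the stub.

    #include <cstddef>
    #include <cstdint>

    // Hedged sketch of the alignment prologue (reverse copy: src/dst point one
    // past the last int, and count is the remaining element count).
    static inline void align_reverse_int_copy(const int32_t*& src, int32_t*& dst, size_t& count) {
      if (count == 0) return;
      uintptr_t s = (uintptr_t)src, d = (uintptr_t)dst;
      if (((s ^ d) & 7) != 0) return;  // xorr/andi_: different alignment mod 8, nothing to gain
      if ((s & 7) == 0) return;        // andi_/beq: already on an 8 byte boundary
      --src; --dst; --count;           // addi -4 / addi -4 / addi -1
      *dst = *src;                     // lwzx / stwx of the single aligning element
    }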
*** 1781,1790 ****
--- 1805,1848 ----
        __ std(tmp4, 24, R4_ARG2);
        __ std(tmp3, 16, R4_ARG2);
        __ std(tmp2, 8, R4_ARG2);
        __ std(tmp1, 0, R4_ARG2);
        __ bdnz(l_4);
+     } else { // Processor supports VSX, so use it to mass copy.
+       // Prefetch the data into the L2 cache.
+       __ dcbt(R3_ARG1, 0);
+
+       // If supported set DSCR pre-fetch to deepest.
+       if (VM_Version::has_mfdscr()) {
+         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+         __ mtdscr(tmp2);
+       }
+
+       __ li(tmp1, 16);
+
+       // Backbranch target aligned to 32-byte. Not 16-byte align as
+       // loop contains < 8 instructions that fit inside a single
+       // i-cache sector.
+       __ align(32);
+
+       __ bind(l_4);
+       // Use loop with VSX load/store instructions to
+       // copy 8 elements a time.
+       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
+       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dsc-=32
+       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
+       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
+       __ bdnz(l_4);
+
+       // Restore DSCR pre-fetch value.
+       if (VM_Version::has_mfdscr()) {
+         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+         __ mtdscr(tmp2);
+       }
+     }

      __ cmpwi(CCR0, R5_ARG3, 0);
      __ beq(CCR0, l_6);

      __ bind(l_5);
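In the VSX branch added above, the pointers are decremented before the accesses and both 16-byte halves of a 32-byte block are loaded before either half is stored, mirroring the lxvd2x/lxvd2x/stxvd2x/stxvd2x order; on my reading, that ordering keeps the reverse copy correct even when the destination overlaps the source by less than one block. The code below is an illustrative sketch of that copy order only, not the stub.

    #include <cstddef>
    #include <cstring>

    // Hedged sketch of the backwards 32-byte block loop (bdnz over the CTR count).
    // src_end/dst_end point just past the data; blocks is the number of 32-byte chunks.
    static void reverse_copy_blocks(const unsigned char* src_end, unsigned char* dst_end, size_t blocks) {
      const unsigned char* s = src_end;
      unsigned char*       d = dst_end;
      while (blocks--) {
        s -= 32; d -= 32;          // addi(R3_ARG1, -32) / addi(R4_ARG2, -32)
        unsigned char hi[16], lo[16];
        memcpy(hi, s + 16, 16);    // lxvd2x(tmp_vsr2, tmp1, src)   -> load src+16
        memcpy(lo, s, 16);         // lxvd2x(tmp_vsr1, src)         -> load src
        memcpy(d + 16, hi, 16);    // stxvd2x(tmp_vsr2, tmp1, dst)  -> store dst+16
        memcpy(d, lo, 16);         // stxvd2x(tmp_vsr1, dst)        -> store dst
      }
    }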
*** 1890,1901 ****
        __ align(32);

        __ bind(l_5);
        // Use loop with VSX load/store instructions to
        // copy 4 elements a time.
!       __ lxvd2x(tmp_vsr1, 0, R3_ARG1);     // Load src
!       __ stxvd2x(tmp_vsr1, 0, R4_ARG2);    // Store to dst
        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
        __ bdnz(l_5);                        // Dec CTR and loop if not zero.
--- 1948,1959 ----
        __ align(32);

        __ bind(l_5);
        // Use loop with VSX load/store instructions to
        // copy 4 elements a time.
!       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
!       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
        __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
        __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
        __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
        __ bdnz(l_5);                        // Dec CTR and loop if not zero.
*** 1960,1969 ****
--- 2018,2030 ----
    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R0;

+   VectorSRegister tmp_vsr1 = VSR1;
+   VectorSRegister tmp_vsr2 = VSR2;
+
    Label l_1, l_2, l_3, l_4, l_5;

    __ cmpwi(CCR0, R5_ARG3, 0);
    __ beq(CCR0, l_1);
*** 1978,1987 ****
--- 2039,2049 ----
      __ srdi(tmp1, R5_ARG3, 2);
      __ andi(R5_ARG3, R5_ARG3, 3);
      __ mtctr(tmp1);

+     if (!VM_Version::has_vsx()) {
      __ bind(l_4);
      // Use unrolled version for mass copying (copy 4 elements a time).
      // Load feeding store gets zero latency on Power6, however not on Power5.
      // Therefore, the following sequence is made for the good of both.
      __ addi(R3_ARG1, R3_ARG1, -32);
*** 1993,2002 ****
--- 2055,2098 ----
        __ std(tmp4, 24, R4_ARG2);
        __ std(tmp3, 16, R4_ARG2);
        __ std(tmp2, 8, R4_ARG2);
        __ std(tmp1, 0, R4_ARG2);
        __ bdnz(l_4);
+     } else { // Processor supports VSX, so use it to mass copy.
+       // Prefetch the data into the L2 cache.
+       __ dcbt(R3_ARG1, 0);
+
+       // If supported set DSCR pre-fetch to deepest.
+       if (VM_Version::has_mfdscr()) {
+         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
+         __ mtdscr(tmp2);
+       }
+
+       __ li(tmp1, 16);
+
+       // Backbranch target aligned to 32-byte. Not 16-byte align as
+       // loop contains < 8 instructions that fit inside a single
+       // i-cache sector.
+       __ align(32);
+
+       __ bind(l_4);
+       // Use loop with VSX load/store instructions to
+       // copy 4 elements a time.
+       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
+       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dsc-=32
+       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
+       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
+       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
+       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
+       __ bdnz(l_4);
+
+       // Restore DSCR pre-fetch value.
+       if (VM_Version::has_mfdscr()) {
+         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
+         __ mtdscr(tmp2);
+       }
+     }

      __ cmpwi(CCR0, R5_ARG3, 0);
      __ beq(CCR0, l_1);

      __ bind(l_5);
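Both conjoint loops bracket the VSX path with the same DSCR handling: when mfdscr/mtdscr are available, the previously captured VM_Version::_dscr_val is or'ed with 7, which per the in-code comment selects the deepest default prefetch depth in the DSCR's low bits, and the saved value is written back after the loop. A small C++ sketch of that set-then-restore shape; the guard class and the write_dscr placeholder are assumptions for illustration, not HotSpot code.

    #include <cstdint>

    // Illustrative scoped guard mirroring the stub's mtdscr(_dscr_val | 7)
    // before the loop and mtdscr(_dscr_val) after it.
    class ScopedDeepPrefetch {
     public:
      explicit ScopedDeepPrefetch(uint64_t dscr_val) : saved_(dscr_val) {
        write_dscr(dscr_val | 7);   // request the deepest default prefetch depth
      }
      ~ScopedDeepPrefetch() {
        write_dscr(saved_);         // restore the value captured at VM startup
      }
     private:
      static void write_dscr(uint64_t v) {
        (void)v;                    // stands in for the mtdscr the stub emits
      }
      uint64_t saved_;
    };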