< prev index next >

src/cpu/ppc/vm/macroAssembler_ppc.cpp

Print this page
rev 11436 : 8159976: PPC64: Add missing intrinsics for sub-word atomics
Reviewed-by: simonis


1405   Label no_reserved_zone_enabling;
1406 
1407   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1408   cmpld(CCR0, R1_SP, R0);
1409   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1410 
1411   // Enable reserved zone again, throw stack overflow exception.
1412   push_frame_reg_args(0, R0);
1413   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1414   pop_frame();
1415   mtlr(return_pc);
1416   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1417   mtctr(R0);
1418   bctr();
1419 
1420   should_not_reach_here();
1421 
1422   bind(no_reserved_zone_enabling);
1423 }
1424 












































































































































































1425 // CmpxchgX sets condition register to cmpX(current, compare).
1426 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
1427                               Register compare_value, Register exchange_value,
1428                               Register addr_base, int semantics, bool cmpxchgx_hint,
1429                               Register int_flag_success, bool contention_hint, bool weak) {

1430   Label retry;
1431   Label failed;
1432   Label done;
1433 
1434   // Save one branch if result is returned via register and
1435   // result register is different from the other ones.
1436   bool use_result_reg    = (int_flag_success != noreg);
1437   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1438                             int_flag_success != exchange_value && int_flag_success != addr_base);

1439   assert(!weak || flag == CCR0, "weak only supported with CCR0");

1440 
1441   if (use_result_reg && preset_result_reg) {
1442     li(int_flag_success, 0); // preset (assume cas failed)
1443   }
1444 
1445   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1446   if (contention_hint) { // Don't try to reserve if cmp fails.
1447     lwz(dest_current_value, 0, addr_base);





1448     cmpw(flag, dest_current_value, compare_value);
1449     bne(flag, failed);
1450   }
1451 
1452   // release/fence semantics
1453   if (semantics & MemBarRel) {
1454     release();
1455   }
1456 
1457   // atomic emulation loop
1458   bind(retry);
1459 
1460   lwarx(dest_current_value, addr_base, cmpxchgx_hint);
1461   cmpw(flag, dest_current_value, compare_value);
1462   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1463     bne_predict_not_taken(flag, failed);
1464   } else {
1465     bne(                  flag, failed);
1466   }
1467   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1468   // fall through    => (flag == eq), (dest_current_value == compare_value)
1469 
1470   stwcx_(exchange_value, addr_base);
1471   if (!weak || use_result_reg) {
1472     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1473       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1474     } else {
1475       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1476     }
1477   }
1478   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1479 
1480   // Result in register (must do this at the end because int_flag_success can be the
1481   // same register as one above).
1482   if (use_result_reg) {
1483     li(int_flag_success, 1);
1484   }
1485 
1486   if (semantics & MemBarFenceAfter) {
1487     fence();
1488   } else if (semantics & MemBarAcq) {
1489     isync();
1490   }


3732   and_(tmp0, tmp0, tmp1);
3733   bne(CCR0, Ldone);               // Found negative byte.
3734   addi(src, src, 16);
3735 
3736   bdnz(Lfastloop);
3737 
3738   bind(Lslow);                    // Fallback to slow version
3739   rldicl_(tmp0, cnt, 0, 64-4);
3740   beq(CCR0, Lnoneg);
3741   mtctr(tmp0);
3742   bind(Lloop);
3743   lbz(tmp0, 0, src);
3744   addi(src, src, 1);
3745   andi_(tmp0, tmp0, 0x80);
3746   bne(CCR0, Ldone);               // Found negative byte.
3747   bdnz(Lloop);
3748   bind(Lnoneg);
3749   li(result, 0);
3750 
3751   bind(Ldone);
3752 }
3753 
3754 
3755 // Intrinsics for non-CompactStrings
3756 
3757 // Emits code that searches for a single jchar in a jchar[].
3758 //
3759 // Assumes that result differs from all other registers.
3760 //
3761 // 'haystack' is the address of a jchar array.
3762 // 'needle' is either the register holding the character to search for, or R0.
3763 // 'needleChar' is the character to search for if 'needle' == R0.
3764 // 'haycnt' is the length of the haystack in characters. We assume 'haycnt' >= 1.
3765 // NOTE(review): the '//N:' markers below look like running counts of emitted instructions - confirm.
3766 // Preserves haystack, haycnt, needle and kills all other registers (tmp1, tmp2, R0; also uses CCR0, CCR1, CTR).
3767 // 'result' receives the character index of the first match, or -1 if the character is not found.
3768 // If needle == R0, we search for the constant needleChar.
3769 void MacroAssembler::string_indexof_1(Register result, Register haystack, Register haycnt,
3770                                       Register needle, jchar needleChar,
3771                                       Register tmp1, Register tmp2) {
3772 
3773   assert_different_registers(result, haystack, haycnt, needle, tmp1, tmp2);
3774 
3775   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_Found3, L_NotFound, L_End;
3776   Register addr = tmp1,   // Current haystack position.
3777            ch1 = tmp2,    // First loaded character (tmp2 also holds the loop count before the loop).
3778            ch2 = R0;      // Second loaded character.
3779 
3780 //3:
3781    dcbtct(haystack, 0x00);                        // Indicate R/O access to haystack.
3782 
3783    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR): number of 2-character iterations.
3784    mr(addr, haystack);
3785    beq(CCR0, L_FinalCheck);  // No complete pair of characters: skip the unrolled loop.
3786    mtctr(tmp2);              // Move to count register.
3787 //8:
3788   bind(L_InnerLoop);             // Main work horse (2x unrolled search loop).
3789    lhz(ch1, 0, addr);        // Load characters from haystack.
3790    lhz(ch2, 2, addr);
3791    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, needleChar);  // Register needle or immediate needleChar.
3792    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, needleChar);
3793    beq(CCR0, L_Found1);   // Did we find the needle?
3794    beq(CCR1, L_Found2);
3795    addi(addr, addr, 4);   // Advance by 2 characters.
3796    bdnz(L_InnerLoop);
3797 //16:
3798   bind(L_FinalCheck);
3799    andi_(R0, haycnt, 1);     // Odd haycnt leaves one character after the unrolled loop.
3800    beq(CCR0, L_NotFound);
3801    lhz(ch1, 0, addr);        // One position left at which we have to compare.
3802    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, needleChar);
3803    beq(CCR1, L_Found3);
3804 //21:
3805   bind(L_NotFound);
3806    li(result, -1);           // Not found.
3807    b(L_End);
3808 
3809   bind(L_Found2);
3810    addi(addr, addr, 2);      // The match was the second character of the pair.
3811 //24:
3812   bind(L_Found1);
3813   bind(L_Found3);                  // Return index ...
3814    subf(addr, haystack, addr); // relative to haystack,
3815    srdi(result, addr, 1);      // in characters.
3816   bind(L_End);
3817 }
3818 
3819 
3820 // Emits the IndexOf search for a jchar needle within a jchar haystack.
3821 //
3822 // The haystack length is passed in a register; the needle length is passed in 'needlecnt' or, if needlecntval != 0, is the constant 'needlecntval'.
3823 // 'needle_values' is not referenced in this body.
3824 // Preserves registers haystack, needle.
3825 // Kills registers haycnt, needlecnt, tmp1-tmp4 and R0; uses CCR0, CCR1, CTR (and CCR6 for a variable needle length).
3826 // Assumes that result differs from all other registers.
3827 // Haystack, needle are the addresses of jchar arrays.
3828 // Haycnt, needlecnt are the lengths of them, respectively.
3829 // Result receives the character index of the match within haystack, or -1 if no match is found.
3830 // Needlecntval must be zero (variable needle length) or a 15-bit unsigned immediate > 1.
3831 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3832                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3833                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
3834 
3835   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3836   Label L_TooShort, L_Found, L_NotFound, L_End;
3837   Register last_addr = haycnt, // Kill haycnt at the beginning.
3838            addr      = tmp1,   // Current haystack position.
3839            n_start   = tmp2,   // First 2 characters of the needle (first character only in the short-needle path).
3840            ch1       = tmp3,
3841            ch2       = R0;
3842 
3843   // **************************************************************************************************
3844   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3845   // **************************************************************************************************
3846 
3847 //1 (variable) or 3 (const):
3848    dcbtct(needle, 0x00);    // Indicate R/O access to str1.
3849    dcbtct(haystack, 0x00);  // Indicate R/O access to str2.
3850 
3851   // Compute last haystack addr to use if no match gets found.
3852   if (needlecntval == 0) { // variable needlecnt
3853 //3:
3854    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3855    addi(addr, haystack, -2);          // Accesses use pre-increment.
3856    cmpwi(CCR6, needlecnt, 2);         // CCR6 is tested again after a 2-character hit (see L_Comp1 below).
3857    blt(CCR6, L_TooShort);          // Variable needlecnt: handle short needle separately.
3858    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3859    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3860    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3861    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3862   } else { // constant needlecnt
3863   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3864   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3865 //5:
3866    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3867    lwz(n_start, 0, needle);           // Load first 2 characters of needle.
3868    addi(addr, haystack, -2);          // Accesses use pre-increment.
3869    slwi(ch1, ch1, 1);                 // Scale to number of bytes.
3870    add(last_addr, haystack, ch1);     // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3871    li(needlecnt, needlecntval-2);     // Rest of needle.
3872   }
3873 
3874   // Main Loop (now we have at least 3 characters).
3875 //11:
3876   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2, L_Comp3;
3877   bind(L_OuterLoop); // Search for 1st 2 characters.
3878   Register addr_diff = tmp4;
3879    subf(addr_diff, addr, last_addr); // Difference between already checked address and last address to check.
3880    addi(addr, addr, 2);              // This is the new address we want to use for comparing.
3881    srdi_(ch2, addr_diff, 2);
3882    beq(CCR0, L_FinalCheck);       // 2 characters left?
3883    mtctr(ch2);                       // addr_diff/4
3884 //16:
3885   bind(L_InnerLoop);                // Main work horse (2x unrolled search loop)
3886    lwz(ch1, 0, addr);           // Load 2 characters of haystack (ignore alignment).
3887    lwz(ch2, 2, addr);           // Same, starting one character later.
3888    cmpw(CCR0, ch1, n_start); // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3889    cmpw(CCR1, ch2, n_start);
3890    beq(CCR0, L_Comp1);       // Did we find the needle start?
3891    beq(CCR1, L_Comp2);
3892    addi(addr, addr, 4);      // Advance by 2 characters.
3893    bdnz(L_InnerLoop);
3894 //24:
3895   bind(L_FinalCheck);
3896    rldicl_(addr_diff, addr_diff, 64-1, 63); // Remaining characters not covered by InnerLoop: (addr_diff>>1)&1.
3897    beq(CCR0, L_NotFound);
3898    lwz(ch1, 0, addr);                       // One position left at which we have to compare.
3899    cmpw(CCR1, ch1, n_start);
3900    beq(CCR1, L_Comp3);
3901 //29:
3902   bind(L_NotFound);
3903    li(result, -1); // not found
3904    b(L_End);
3905 
3906 
3907    // **************************************************************************************************
3908    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3909    // **************************************************************************************************
3910 //31:
3911  if ((needlecntval>>1) !=1 ) { // Section is omitted if const needlecnt is 2 or 3 (reduces code size).
3912   int nopcnt = 5;
3913   if (needlecntval !=0 ) ++nopcnt; // Balance alignment (other case: see below).
3914   if (needlecntval == 0) {         // We have to handle these cases separately.
3915   Label L_OneCharLoop;
3916   bind(L_TooShort);
3917    mtctr(haycnt);              // One iteration per haystack character.
3918    lhz(n_start, 0, needle);    // First character of needle
3919   bind(L_OneCharLoop);
3920    lhzu(ch1, 2, addr);         // Pre-increment load; addr was initialized to haystack-2.
3921    cmpw(CCR1, ch1, n_start);
3922    beq(CCR1, L_Found);      // Did we find the one character needle?
3923    bdnz(L_OneCharLoop);
3924    li(result, -1);             // Not found.
3925    b(L_End);
3926   } // 8 instructions, so no impact on alignment.
3927   for (int x = 0; x < nopcnt; ++x) nop(); // Padding so that the code below starts at the intended offset.
3928  }
3929 
3930   // **************************************************************************************************
3931   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3932   // **************************************************************************************************
3933 
3934   // Compare the rest
3935 //36 if needlecntval==0, else 37:
3936   bind(L_Comp2);
3937    addi(addr, addr, 2); // First comparison has failed, 2nd one hit.
3938   bind(L_Comp1);            // Addr points to possible needle start.
3939   bind(L_Comp3);            // Could have created a copy and use a different return address but saving code size here.
3940   if (needlecntval != 2) {  // With const needlecnt==2 nothing is left to compare: fall through to L_Found.
3941    if (needlecntval != 3) {
3942     if (needlecntval == 0) beq(CCR6, L_Found); // Variable needlecnt==2? (CCR6 was set by the cmpwi against 2 above.)
3943     Register ind_reg = tmp4;
3944     li(ind_reg, 2*2);   // First 2 characters are already compared, use index 2.
3945     mtctr(needlecnt);   // Decremented by 2, still > 0.
3946 //40:
3947    Label L_CompLoop;
3948    bind(L_CompLoop);
3949     lhzx(ch2, needle, ind_reg);
3950     lhzx(ch1, addr, ind_reg);
3951     cmpw(CCR1, ch1, ch2);
3952     bne(CCR1, L_OuterLoop);    // Mismatch: resume the search at the next haystack position.
3953     addi(ind_reg, ind_reg, 2);
3954     bdnz(L_CompLoop);
3955    } else { // No loop required if there's only one needle character left.
3956     lhz(ch2, 2*2, needle);
3957     lhz(ch1, 2*2, addr);
3958     cmpw(CCR1, ch1, ch2);
3959     bne(CCR1, L_OuterLoop);
3960    }
3961   }
3962   // Return index ...
3963 //46:
3964   bind(L_Found);
3965    subf(addr, haystack, addr); // relative to haystack, ...
3966    srdi(result, addr, 1);      // in characters.
3967 //48:
3968   bind(L_End);
3969 }
3970 
3971 // Emits the comparison of two jchar arrays.
3972 // Result: difference of the first mismatching characters (str1's minus str2's), or cnt1 - cnt2 (after the scaling below) if none mismatch within min(cnt1, cnt2).
3973 // Kills the registers str1, str2, cnt1, cnt2, tmp and R0.
3974 // Kills cr0, ctr.
3975 // Assumes that result differs from the input registers.
3976 void MacroAssembler::string_compare(Register str1_reg, Register str2_reg, Register cnt1_reg, Register cnt2_reg,
3977                                     Register result_reg, Register tmp_reg) {
3978    assert_different_registers(result_reg, str1_reg, str2_reg, cnt1_reg, cnt2_reg, tmp_reg);
3979 
3980    Label Ldone, Lslow_case, Lslow_loop, Lfast_loop;
3981    Register cnt_diff = R0,           // cnt1 - cnt2, returned if no character mismatches.
3982             limit_reg = cnt1_reg,    // Number of characters for the slow loop.
3983             chr1_reg = result_reg,
3984             chr2_reg = cnt2_reg,
3985             addr_diff = str2_reg;    // str2 - str1, so str2 data is addressed as str1 + addr_diff.
3986 
3987    // 'cnt_reg' contains the number of characters in the string's character array for the
3988    // pre-CompactStrings strings implementation and the number of bytes in the string's
3989    // byte array for the CompactStrings strings implementation.
3990    const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array
3991 
3992    // Offset 0 should be 32 byte aligned.
3993 //-6:
3994     srawi(cnt1_reg, cnt1_reg, HAS_COMPACT_STRING); // Byte count -> character count if needed.
3995     srawi(cnt2_reg, cnt2_reg, HAS_COMPACT_STRING);
3996 //-4:
3997     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
3998     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
3999 //-2:
4000    // Compute min(cnt1, cnt2) and check if 0 (bail out if we don't need to compare characters).
4001     subf(result_reg, cnt2_reg, cnt1_reg);  // difference between cnt1/2
4002     subf_(addr_diff, str1_reg, str2_reg);  // alias?
4003     beq(CCR0, Ldone);                   // return cnt difference if both ones are identical
4004     srawi(limit_reg, result_reg, 31);      // generate signmask (cnt1/2 must be non-negative so cnt_diff can't overflow)
4005     mr(cnt_diff, result_reg);
4006     andr(limit_reg, result_reg, limit_reg); // difference or zero (negative): cnt1<cnt2 ? cnt1-cnt2 : 0
4007     add_(limit_reg, cnt2_reg, limit_reg);  // min(cnt1, cnt2)==0?
4008     beq(CCR0, Ldone);                   // return cnt difference if one has 0 length
4009 
4010     lhz(chr1_reg, 0, str1_reg);            // optional: early out if first characters mismatch
4011     lhzx(chr2_reg, str1_reg, addr_diff);   // optional: early out if first characters mismatch
4012     addi(tmp_reg, limit_reg, -1);          // min(cnt1, cnt2)-1
4013     subf_(result_reg, chr2_reg, chr1_reg); // optional: early out if first characters mismatch
4014     bne(CCR0, Ldone);                   // optional: early out if first characters mismatch
4015 
4016    // Set loop counter by scaling down tmp_reg
4017     srawi_(chr2_reg, tmp_reg, exact_log2(4)); // (min(cnt1, cnt2)-1)/4
4018     ble(CCR0, Lslow_case);                 // need >4 characters for fast loop
4019     andi(limit_reg, tmp_reg, 4-1);            // remaining characters
4020 
4021    // Adapt str1_reg str2_reg for the first loop iteration
4022     mtctr(chr2_reg);                 // (min(cnt1, cnt2)-1)/4
4023     addi(limit_reg, limit_reg, 4+1); // compare last 5-8 characters in slow_case if mismatch found in fast_loop
4024 //16:
4025    // Compare the rest of the characters
4026    bind(Lfast_loop);                 // Compares 4 characters (8 bytes) per iteration.
4027     ld(chr1_reg, 0, str1_reg);
4028     ldx(chr2_reg, str1_reg, addr_diff);
4029     cmpd(CCR0, chr2_reg, chr1_reg);
4030     bne(CCR0, Lslow_case); // return chr1_reg
4031     addi(str1_reg, str1_reg, 4*2);
4032     bdnz(Lfast_loop);
4033     addi(limit_reg, limit_reg, -4); // no mismatch found in fast_loop, only 1-4 characters missing
4034 //23:
4035    bind(Lslow_case);
4036     mtctr(limit_reg);
4037 //24:
4038    bind(Lslow_loop);                 // Compares one character per iteration.
4039     lhz(chr1_reg, 0, str1_reg);
4040     lhzx(chr2_reg, str1_reg, addr_diff);
4041     subf_(result_reg, chr2_reg, chr1_reg);
4042     bne(CCR0, Ldone); // return chr1_reg
4043     addi(str1_reg, str1_reg, 1*2);
4044     bdnz(Lslow_loop);
4045 //30:
4046    // If strings are equal up to min length, return the length difference.
4047     mr(result_reg, cnt_diff);
4048     nop(); // alignment
4049 //32:
4050    // Otherwise, return the difference between the first mismatched chars.
4051    bind(Ldone);
4052 }
4053 
4054 
4055 // Emits an equality check of two char[] arrays; result_reg is set to 1 if they are equal and to 0 otherwise.
4056 // Uses CCR0, CCR1 and CTR. tmp3_reg is only referenced by the register asserts in this body.
4057 // str1_reg   USE only
4058 // str2_reg   USE only
4059 // cnt_reg    USE_DEF, due to tmp reg shortage
4060 // result_reg DEF only, might compromise USE only registers
4061 void MacroAssembler::char_arrays_equals(Register str1_reg, Register str2_reg, Register cnt_reg, Register result_reg,
4062                                         Register tmp1_reg, Register tmp2_reg, Register tmp3_reg, Register tmp4_reg,
4063                                         Register tmp5_reg) {
4064 
4065   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
4066   assert_different_registers(result_reg, str1_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
4067   assert_different_registers(result_reg, str2_reg, cnt_reg, tmp1_reg, tmp2_reg, tmp3_reg, tmp4_reg, tmp5_reg);
4068 
4069   // Offset 0 should be 32 byte aligned.
4070   Label Linit_cbc, Lcbc, Lloop, Ldone_true, Ldone_false;
4071   Register index_reg = tmp5_reg;   // Byte offset into both arrays.
4072   Register cbc_iter  = tmp4_reg;   // Iteration count of the character-by-character loop.
4073 
4074   // 'cnt_reg' contains the number of characters in the string's character array for the
4075   // pre-CompactStrings strings implementation and the number of bytes in the string's
4076   // byte array for the CompactStrings strings implementation.
4077   const int HAS_COMPACT_STRING = java_lang_String::has_coder_field() ? 1 : 0; // '1' = byte array, '0' = char array
4078 
4079 //-1:
4080   dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
4081   dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
4082 //1:
4083   // cbc_iter: remaining characters after the '4 java characters per iteration' loop.
4084   rlwinm(cbc_iter, cnt_reg, 32 - HAS_COMPACT_STRING, 30, 31); // (cnt_reg % (HAS_COMPACT_STRING ? 8 : 4)) >> HAS_COMPACT_STRING
4085   li(index_reg, 0); // init
4086   li(result_reg, 0); // assume false
4087   // tmp2_reg: units of 4 java characters (i.e. 8 bytes) per iteration (main loop).
4088   srwi_(tmp2_reg, cnt_reg, exact_log2(4 << HAS_COMPACT_STRING)); // cnt_reg / (HAS_COMPACT_STRING ? 8 : 4)
4089 
4090   cmpwi(CCR1, cbc_iter, 0);             // CCR1 = (cbc_iter==0)
4091   beq(CCR0, Linit_cbc);                 // too short
4092     mtctr(tmp2_reg);
4093 //8:
4094     bind(Lloop);                        // Main loop: 8 bytes per iteration.
4095       ldx(tmp1_reg, str1_reg, index_reg);
4096       ldx(tmp2_reg, str2_reg, index_reg);
4097       cmpd(CCR0, tmp1_reg, tmp2_reg);
4098       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
4099       addi(index_reg, index_reg, 4*sizeof(jchar));
4100       bdnz(Lloop);
4101 //14:
4102   bind(Linit_cbc);
4103   beq(CCR1, Ldone_true);                // No leftover characters (CCR1 was set by the cmpwi above).
4104     mtctr(cbc_iter);
4105 //16:
4106     bind(Lcbc);                         // Tail loop: one character per iteration.
4107       lhzx(tmp1_reg, str1_reg, index_reg);
4108       lhzx(tmp2_reg, str2_reg, index_reg);
4109       cmpw(CCR0, tmp1_reg, tmp2_reg);
4110       bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
4111       addi(index_reg, index_reg, 1*sizeof(jchar));
4112       bdnz(Lcbc);
4113     nop();                              // NOTE(review): presumably alignment padding - confirm.
4114   bind(Ldone_true);
4115   li(result_reg, 1);
4116 //24:
4117   bind(Ldone_false);                    // result_reg still holds the preset 0 when reached by a branch.
4118 }
4119 
4120 
// Emits an equality check of two arrays whose length 'cntval' is known when the code is generated.
//
// result_reg is set to 1 if all compared data is equal and to 0 otherwise.
// 'cntval' is halved first if java_lang_String::has_coder_field() (see the comment in the body).
// For fewer than 16 characters straight-line compares with 8-, 4- and 2-byte loads are emitted;
// otherwise a CTR loop over 8-byte chunks is emitted, followed by the 4-/2-byte tail compares.
// Kills tmp1_reg and tmp2_reg and uses CCR0; the loop variant additionally uses R0 and CTR.
4121 void MacroAssembler::char_arrays_equalsImm(Register str1_reg, Register str2_reg, int cntval, Register result_reg,
4122                                            Register tmp1_reg, Register tmp2_reg) {
4123   // Str1 may be the same register as str2 which can occur e.g. after scalar replacement.
4124   assert_different_registers(result_reg, str1_reg, tmp1_reg, tmp2_reg);
4125   assert_different_registers(result_reg, str2_reg, tmp1_reg, tmp2_reg);
4126   assert(sizeof(jchar) == 2, "must be");
4127   assert(cntval >= 0 && ((cntval & 0x7fff) == cntval), "wrong immediate");
4128 
4129   // 'cntval' contains the number of characters in the string's character array for the
4130   // pre-CompactStrings strings implementation and the number of bytes in the string's
4131   // byte array for the CompactStrings strings implementation.
4132   cntval >>= (java_lang_String::has_coder_field() ? 1 : 0); // '1' = byte array strings, '0' = char array strings
4133 
4134   Label Ldone_false;
4135 
4136   if (cntval < 16) { // short case
4137     if (cntval != 0) li(result_reg, 0); // assume false (not needed for cntval == 0: no compare can branch to Ldone_false)
4138 
4139     const int num_bytes = cntval*sizeof(jchar);
4140     int index = 0;                      // Byte offset of the next compare.
4141     for (int next_index; (next_index = index + 8) <= num_bytes; index = next_index) { // One 8-byte compare per full chunk.
4142       ld(tmp1_reg, index, str1_reg);
4143       ld(tmp2_reg, index, str2_reg);
4144       cmpd(CCR0, tmp1_reg, tmp2_reg);
4145       bne(CCR0, Ldone_false);
4146     }
4147     if (cntval & 2) {                   // Two characters left: one 4-byte compare.
4148       lwz(tmp1_reg, index, str1_reg);
4149       lwz(tmp2_reg, index, str2_reg);
4150       cmpw(CCR0, tmp1_reg, tmp2_reg);
4151       bne(CCR0, Ldone_false);
4152       index += 4;
4153     }
4154     if (cntval & 1) {                   // One character left: one 2-byte compare.
4155       lhz(tmp1_reg, index, str1_reg);
4156       lhz(tmp2_reg, index, str2_reg);
4157       cmpw(CCR0, tmp1_reg, tmp2_reg);
4158       bne(CCR0, Ldone_false);
4159     }
4160     // fallthrough: true
4161   } else {
4162     Label Lloop;
4163     Register index_reg = tmp1_reg;      // Byte offset into both arrays.
4164     const int loopcnt = cntval/4;       // Number of 8-byte (4 character) iterations.
4165     assert(loopcnt > 0, "must be");
4166     // Offset 0 should be 32 byte aligned.
4167     //2:
4168     dcbtct(str1_reg, 0x00);  // Indicate R/O access to str1.
4169     dcbtct(str2_reg, 0x00);  // Indicate R/O access to str2.
4170     li(tmp2_reg, loopcnt);
4171     li(index_reg, 0); // init
4172     li(result_reg, 0); // assume false
4173     mtctr(tmp2_reg);
4174     //8:
4175     bind(Lloop);
4176     ldx(R0, str1_reg, index_reg);
4177     ldx(tmp2_reg, str2_reg, index_reg);
4178     cmpd(CCR0, R0, tmp2_reg);
4179     bne(CCR0, Ldone_false);  // Unequal char pair found -> done.
4180     addi(index_reg, index_reg, 4*sizeof(jchar));
4181     bdnz(Lloop);
4182     //14:
4183     if (cntval & 2) {                   // Two characters left: one 4-byte compare.
4184       lwzx(R0, str1_reg, index_reg);
4185       lwzx(tmp2_reg, str2_reg, index_reg);
4186       cmpw(CCR0, R0, tmp2_reg);
4187       bne(CCR0, Ldone_false);
4188       if (cntval & 1) addi(index_reg, index_reg, 2*sizeof(jchar)); // Advance only if another compare follows.
4189     }
4190     if (cntval & 1) {                   // One character left: one 2-byte compare.
4191       lhzx(R0, str1_reg, index_reg);
4192       lhzx(tmp2_reg, str2_reg, index_reg);
4193       cmpw(CCR0, R0, tmp2_reg);
4194       bne(CCR0, Ldone_false);
4195     }
4196     // fallthru: true
4197   }
4198   li(result_reg, 1);
4199   bind(Ldone_false);
4200 }
4201 
4202 #endif // Compiler2
4203 
4204 // Helpers for Intrinsic Emitters
4205 //
4206 // Revert the byte order of a 32bit value in a register
4207 //   src: 0x44556677
4208 //   dst: 0x77665544
4209 // Three steps to obtain the result:
4210 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
4211 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
4212 //     This value initializes dst.
4213 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
4214 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
4215 //     This value is mask inserted into dst with a [0..23] mask of 1s.
4216 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
4217 //     This value is mask inserted into dst with a [8..15] mask of 1s.
4218 void MacroAssembler::load_reverse_32(Register dst, Register src) {
4219   assert_different_registers(dst, src);




1405   Label no_reserved_zone_enabling;
1406 
1407   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1408   cmpld(CCR0, R1_SP, R0);
1409   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1410 
1411   // Enable reserved zone again, throw stack overflow exception.
1412   push_frame_reg_args(0, R0);
1413   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1414   pop_frame();
1415   mtlr(return_pc);
1416   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1417   mtctr(R0);
1418   bctr();
1419 
1420   should_not_reach_here();
1421 
1422   bind(no_reserved_zone_enabling);
1423 }
1424 
// Emits a retry loop that loads the doubleword at 'addr_base' into 'dest_current_value' with ldarx
// and stores 'exchange_value' to the same address with stdcx_, repeating until CCR0 signals success.
// 'cmpxchgx_hint' is passed through to ldarx. No memory barriers are emitted here.
1425 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1426                                 bool cmpxchgx_hint) {
1427   Label again;                                          // Head of the ldarx/stdcx_ sequence.
1428   bind(again);
1429   ldarx(dest_current_value, addr_base, cmpxchgx_hint);  // Fetch the current value.
1430   stdcx_(exchange_value, addr_base);                    // Conditional store of the new value; StXcx_ sets CCR0.
1431   if (!UseStaticBranchPredictionInCompareAndSwapPPC64) {
1432     bne(CCR0, again);                                   // Store did not succeed: start over.
1433   } else {
1434     bne_predict_not_taken(CCR0, again);                 // Same branch, statically predicted as not taken.
1435   }
1436 }
1437 
// Emits a retry loop that loads the doubleword at 'addr_base' into 'dest_current_value' with ldarx,
// computes dest_current_value + inc_value into 'tmp' and stores 'tmp' back with stdcx_, repeating
// until CCR0 signals success. 'tmp' is overwritten; 'cmpxchgx_hint' is passed through to ldarx.
1438 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1439                                 Register tmp, bool cmpxchgx_hint) {
1440   Label again;                                          // Head of the ldarx/stdcx_ sequence.
1441   bind(again);
1442   ldarx(dest_current_value, addr_base, cmpxchgx_hint);  // Fetch the current value.
1443   add(tmp, dest_current_value, inc_value);              // New value = current value + increment.
1444   stdcx_(tmp, addr_base);                               // Conditional store of the sum; StXcx_ sets CCR0.
1445   if (!UseStaticBranchPredictionInCompareAndSwapPPC64) {
1446     bne(CCR0, again);                                   // Store did not succeed: start over.
1447   } else {
1448     bne_predict_not_taken(CCR0, again);                 // Same branch, statically predicted as not taken.
1449   }
1450 }
1451 
1452 // Word/sub-word atomic helper functions
1453 
// Emits an atomic get-and-set (is_add == false) or get-and-add (is_add == true) loop for a value of
// 'size' bytes (1, 2 or 4) at 'addr_base'.
//
// 'dest_current_value' receives the value loaded before the update, sign-extended for size 1 and 2.
// 'exchange_value' is the new value (get-and-set) or the increment (get-and-add).
// If VM_Version::has_lqarx() is false, a sub-word access is emulated with lwarx/stwcx_ on the
// 4-byte aligned word containing it, using tmp1-tmp3 for the new word, the shift amount and the loaded word.
// 'cmpxchgx_hint' is passed through to the load-reserve instruction.
1454 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1455 // Atomic add always kills tmp1.
1456 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1457                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1458                                                    bool cmpxchgx_hint, bool is_add, int size) {
1459   int instruction_type = VM_Version::has_lqarx() ? size : 4; // Sub-word instructions available since Power 8.
1460 
1461   Label retry;
1462   Register shift_amount = noreg,
1463            val32 = dest_current_value,                 // Register receiving the loaded value.
1464            modval = is_add ? tmp1 : exchange_value;    // Register holding the value to store.
1465 
1466   if (instruction_type != size) {                      // Sub-word access emulated on the containing word.
1467     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1468     modval = tmp1;
1469     shift_amount = tmp2;
1470     val32 = tmp3;
1471     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1472 #ifdef VM_LITTLE_ENDIAN
1473     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1474     clrrdi(addr_base, addr_base, 2);
1475 #else
1476     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1477     clrrdi(addr_base, addr_base, 2);
1478     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1479 #endif
1480   }
1481 
1482   // atomic emulation loop
1483   bind(retry);
1484 
1485   switch (instruction_type) {                          // Load-reserve of the matching width.
1486     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1487     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1488     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1489     default: ShouldNotReachHere();
1490   }
1491 
1492   if (instruction_type != size) {
1493     srw(dest_current_value, val32, shift_amount);      // Shift the addressed sub-word down to the low-order bits.
1494   }
1495 
1496   if (is_add) { add(modval, dest_current_value, exchange_value); }
1497 
1498   if (instruction_type != size) {
1499     // Transform exchange value such that the replacement can be done by one xor instruction
1500     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1501     clrldi(modval, modval, (size == 1) ? 56 : 48);     // Keep only the low 8 or 16 bits of the difference.
1502     slw(modval, modval, shift_amount);                 // Move them to the sub-word's position in the word.
1503     xorr(modval, val32, modval);                       // Loaded word with only the sub-word bits changed.
1504   }
1505 
1506   switch (instruction_type) {                          // Store-conditional of the matching width.
1507     case 4: stwcx_(modval, addr_base); break;
1508     case 2: sthcx_(modval, addr_base); break;
1509     case 1: stbcx_(modval, addr_base); break;
1510     default: ShouldNotReachHere();
1511   }
1512 
1513   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1514     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1515   } else {
1516     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1517   }
1518 
1519   if (size == 1) {                                     // Sign-extend the returned sub-word value.
1520     extsb(dest_current_value, dest_current_value);
1521   } else if (size == 2) {
1522     extsh(dest_current_value, dest_current_value);
1523   };
1524 }
1525 
1526 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
     //
     // Emits the body of one compare-and-exchange attempt on the size-byte value at addr_base:
     // bind(retry), load-and-reserve, compare against compare_value (branching to 'failed' on
     // mismatch) and the store-conditional. No branch back to 'retry' is emitted here; the
     // store-conditional leaves its result in CCR0 for the caller to test.
     //
     //   flag               - condition register set by cmpw(dest_current_value, compare_value).
     //   dest_current_value - receives the value found in memory, sign-extended for size 1 and 2.
     //   compare_value      - expected value.
     //   exchange_value     - new value; on the emulation path it is overwritten with the shifted
     //                        (compare_value ^ exchange_value) mask.
     //   addr_base          - address of the value; on the emulation path its low two bits are
     //                        cleared so that it addresses the containing 4-byte word.
     //   tmp1, tmp2         - only used on the emulation path (shift amount / loaded and modified word).
     //   retry              - bound here, directly in front of the load-and-reserve.
     //   failed             - branch target when the loaded value differs from compare_value.
     //   cmpxchgx_hint      - passed through to the load-and-reserve instruction.
     //   size               - operand size in bytes; 1, 2 and 4 are handled.
1527 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1528                                        Register compare_value, Register exchange_value,
1529                                        Register addr_base, Register tmp1, Register tmp2,
1530                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1531   int instruction_type = VM_Version::has_lqarx() ? size : 4; // Sub-word instructions available since Power 8.
       // instruction_type is the width of the reservation instructions actually emitted. It differs
       // from size only when a 1 or 2 byte access has to be emulated with word instructions
       // (for size == 4 both alternatives above yield 4).
1532 
       // Defaults for the native path: load directly into dest_current_value and store
       // exchange_value unchanged.
1533   Register shift_amount = noreg,
1534            val32 = dest_current_value,
1535            modval = exchange_value;
1536 
       // Emulation path: operate on the aligned 32-bit word that contains the value.
1537   if (instruction_type != size) {
1538     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1539     shift_amount = tmp1;
1540     val32 = tmp2;  // Whole word as loaded from memory.
1541     modval = tmp2; // Same register: the word is modified in place before it is stored back.
1542     // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1543 #ifdef VM_LITTLE_ENDIAN
1544     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1545     clrrdi(addr_base, addr_base, 2); // Align down to the containing 4-byte word.
1546 #else
1547     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1548     clrrdi(addr_base, addr_base, 2); // Align down to the containing 4-byte word.
1549     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1550 #endif
1551     // Transform exchange value such that the replacement can be done by one xor instruction.
         // Afterwards exchange_value holds ((compare_value ^ exchange_value) truncated to size bytes)
         // shifted to the position of the value inside the word.
1552     xorr(exchange_value, compare_value, exchange_value);
1553     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1554     slw(exchange_value, exchange_value, shift_amount);
1555   }
1556 
1557   // atomic emulation loop
1558   bind(retry);
1559 
       // Load and reserve (sub-word forms only when the processor provides them).
1560   switch (instruction_type) {
1561     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1562     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1563     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1564     default: ShouldNotReachHere();
1565   }
1566 
       // Emulation: move the addressed byte/halfword into the low bits of dest_current_value.
1567   if (instruction_type != size) {
1568     srw(dest_current_value, val32, shift_amount);
1569   }
       // Sign-extend sub-word values before the word compare below.
1570   if (size == 1) {
1571     extsb(dest_current_value, dest_current_value);
1572   } else if (size == 2) {
1573     extsh(dest_current_value, dest_current_value);
1574   };
1575 
1576   cmpw(flag, dest_current_value, compare_value);
1577   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1578     bne_predict_not_taken(flag, failed);
1579   } else {
1580     bne(                  flag, failed);
1581   }
1582   // branch to failed => (flag == ne), (dest_current_value != compare_value)
1583   // fall through     => (flag == eq), (dest_current_value == compare_value)
1584 
       // Emulation: the compare succeeded, so the addressed field of val32 equals the low bits of
       // compare_value. Xor-ing with the prepared mask therefore replaces exactly that field with
       // the new value and leaves the remaining bytes of the word unchanged.
1585   if (instruction_type != size) {
1586     xorr(modval, val32, exchange_value);
1587   }
1588 
       // Store conditional. It sets CCR0; the caller decides whether to retry.
1589   switch (instruction_type) {
1590     case 4: stwcx_(modval, addr_base); break;
1591     case 2: sthcx_(modval, addr_base); break;
1592     case 1: stbcx_(modval, addr_base); break;
1593     default: ShouldNotReachHere();
1594   }
1595 }
1596 
1597 // CmpxchgX sets condition register to cmpX(current, compare).
1598 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1599                                      Register compare_value, Register exchange_value,
1600                                      Register addr_base, Register tmp1, Register tmp2,
1601                                      int semantics, bool cmpxchgx_hint,
1602                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1603   Label retry;
1604   Label failed;
1605   Label done;
1606 
1607   // Save one branch if result is returned via register and
1608   // result register is different from the other ones.
1609   bool use_result_reg    = (int_flag_success != noreg);
1610   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1611                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1612                             int_flag_success != tmp1 && int_flag_success != tmp2);
1613   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1614   assert(size == 1 || size == 2 || size == 4, "unsupported");
1615 
1616   if (use_result_reg && preset_result_reg) {
1617     li(int_flag_success, 0); // preset (assume cas failed)
1618   }
1619 
1620   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1621   if (contention_hint) { // Don't try to reserve if cmp fails.
1622     switch (size) {
1623       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1624       case 2: lha(dest_current_value, 0, addr_base); break;
1625       case 4: lwz(dest_current_value, 0, addr_base); break;
1626       default: ShouldNotReachHere();
1627     }
1628     cmpw(flag, dest_current_value, compare_value);
1629     bne(flag, failed);
1630   }
1631 
1632   // release/fence semantics
1633   if (semantics & MemBarRel) {
1634     release();
1635   }
1636 
1637   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1638                     retry, failed, cmpxchgx_hint, size);












1639   if (!weak || use_result_reg) {
1640     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1641       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1642     } else {
1643       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1644     }
1645   }
1646   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1647 
1648   // Result in register (must do this at the end because int_flag_success can be the
1649   // same register as one above).
1650   if (use_result_reg) {
1651     li(int_flag_success, 1);
1652   }
1653 
1654   if (semantics & MemBarFenceAfter) {
1655     fence();
1656   } else if (semantics & MemBarAcq) {
1657     isync();
1658   }


3900   and_(tmp0, tmp0, tmp1);
3901   bne(CCR0, Ldone);               // Found negative byte.
3902   addi(src, src, 16);
3903 
3904   bdnz(Lfastloop);
3905 
3906   bind(Lslow);                    // Fallback to slow version
3907   rldicl_(tmp0, cnt, 0, 64-4);
3908   beq(CCR0, Lnoneg);
3909   mtctr(tmp0);
3910   bind(Lloop);
3911   lbz(tmp0, 0, src);
3912   addi(src, src, 1);
3913   andi_(tmp0, tmp0, 0x80);
3914   bne(CCR0, Ldone);               // Found negative byte.
3915   bdnz(Lloop);
3916   bind(Lnoneg);
3917   li(result, 0);
3918 
3919   bind(Ldone);
































































































































































































































































































































































































































































3920 }
3921 
3922 #endif // Compiler2
3923 
3924 // Helpers for Intrinsic Emitters
3925 //
3926 // Reverse the byte order of a 32-bit value in a register
3927 //   src: 0x44556677
3928 //   dst: 0x77665544
3929 // Three steps to obtain the result:
3930 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3931 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3932 //     This value initializes dst.
3933 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3934 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3935 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3936 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3937 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3938 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3939   assert_different_registers(dst, src);


< prev index next >