< prev index next >

src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp

Print this page




 477 
 478   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
 479   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 480   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 481   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 482   Address saved_mark_addr(lock_reg, 0);
 483 
 484   // Biased locking
 485   // See whether the lock is currently biased toward our thread and
 486   // whether the epoch is still valid
 487   // Note that the runtime guarantees sufficient alignment of JavaThread
 488   // pointers to allow age to be placed into low bits
 489   // First check to see whether biasing is even enabled for this object
 490   Label cas_label;
 491   int null_check_offset = -1;
 492   if (!swap_reg_contains_mark) {
 493     null_check_offset = offset();
 494     ldr(swap_reg, mark_addr);
 495   }
 496   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
 497   cmp(tmp_reg, markOopDesc::biased_lock_pattern);
 498   br(Assembler::NE, cas_label);
 499   // The bias pattern is present in the object's header. Need to check
 500   // whether the bias owner and the epoch are both still current.
 501   load_prototype_header(tmp_reg, obj_reg);
 502   orr(tmp_reg, tmp_reg, rthread);
 503   eor(tmp_reg, swap_reg, tmp_reg);
 504   andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
 505   if (counters != NULL) {
 506     Label around;
 507     cbnz(tmp_reg, around);
 508     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
 509     b(done);
 510     bind(around);
 511   } else {
 512     cbz(tmp_reg, done);
 513   }
 514 
 515   Label try_revoke_bias;
 516   Label try_rebias;
 517 


 616     }
 617     bind(nope);
 618   }
 619 
 620   bind(cas_label);
 621 
 622   return null_check_offset;
 623 }
 624 
 625 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 626   assert(UseBiasedLocking, "why call this otherwise?");
 627 
 628   // Check for biased locking unlock case, which is a no-op
 629   // Note: we do not have to check the thread ID for two reasons.
 630   // First, the interpreter checks for IllegalMonitorStateException at
 631   // a higher level. Second, if the bias was revoked while we held the
 632   // lock, the object could not be rebiased toward another thread, so
 633   // the bias bit would be clear.
 634   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 635   andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
 636   cmp(temp_reg, markOopDesc::biased_lock_pattern);
 637   br(Assembler::EQ, done);
 638 }
 639 
// Move 'arg' into the first C argument register (c_rarg0) unless it is
// already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}
 645 
// Move 'arg' into the second C argument register (c_rarg1) unless it is
// already there.
static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}
 651 
// Move 'arg' into the third C argument register (c_rarg2) unless it is
// already there.
static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}


1112     ldrw(temp_reg, super_check_offset_addr);
1113     super_check_offset = RegisterOrConstant(temp_reg);
1114   }
1115   Address super_check_addr(sub_klass, super_check_offset);
1116   ldr(rscratch1, super_check_addr);
1117   cmp(super_klass, rscratch1); // load displayed supertype
1118 
1119   // This check has worked decisively for primary supers.
1120   // Secondary supers are sought in the super_cache ('super_cache_addr').
1121   // (Secondary supers are interfaces and very deeply nested subtypes.)
1122   // This works in the same check above because of a tricky aliasing
1123   // between the super_cache and the primary super display elements.
1124   // (The 'super_check_addr' can address either, as the case requires.)
1125   // Note that the cache is updated below if it does not help us find
1126   // what we need immediately.
1127   // So if it was a primary super, we can just fail immediately.
1128   // Otherwise, it's the slow path for us (no success at this point).
1129 
1130   if (super_check_offset.is_register()) {
1131     br(Assembler::EQ, *L_success);
1132     cmp(super_check_offset.as_register(), sc_offset);
1133     if (L_failure == &L_fallthrough) {
1134       br(Assembler::EQ, *L_slow_path);
1135     } else {
1136       br(Assembler::NE, *L_failure);
1137       final_jmp(*L_slow_path);
1138     }
1139   } else if (super_check_offset.as_constant() == sc_offset) {
1140     // Need a slow path; fast failure is impossible.
1141     if (L_slow_path == &L_fallthrough) {
1142       br(Assembler::EQ, *L_success);
1143     } else {
1144       br(Assembler::NE, *L_slow_path);
1145       final_jmp(*L_success);
1146     }
1147   } else {
1148     // No slow path; it's a fast decision.
1149     if (L_failure == &L_fallthrough) {
1150       br(Assembler::EQ, *L_success);
1151     } else {
1152       br(Assembler::NE, *L_failure);


3287 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3288         Register table0, Register table1, Register table2, Register table3,
3289         Register tmp, Register tmp2, Register tmp3) {
3290   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3291   unsigned long offset;
3292 
3293   if (UseCRC32) {
3294       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3295       return;
3296   }
3297 
3298     mvnw(crc, crc);
3299 
3300     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3301     if (offset) add(table0, table0, offset);
3302     add(table1, table0, 1*256*sizeof(juint));
3303     add(table2, table0, 2*256*sizeof(juint));
3304     add(table3, table0, 3*256*sizeof(juint));
3305 
3306   if (UseNeon) {
3307       cmp(len, 64);
3308       br(Assembler::LT, L_by16);
3309       eor(v16, T16B, v16, v16);
3310 
3311     Label L_fold;
3312 
3313       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3314 
3315       ld1(v0, v1, T2D, post(buf, 32));
3316       ld1r(v4, T2D, post(tmp, 8));
3317       ld1r(v5, T2D, post(tmp, 8));
3318       ld1r(v6, T2D, post(tmp, 8));
3319       ld1r(v7, T2D, post(tmp, 8));
3320       mov(v16, T4S, 0, crc);
3321 
3322       eor(v0, T16B, v0, v16);
3323       sub(len, len, 64);
3324 
3325     BIND(L_fold);
3326       pmull(v22, T8H, v0, v5, T8B);
3327       pmull(v20, T8H, v0, v7, T8B);


4337   int str2_chr_size = str2_isL ? 1:2;
4338   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4339                                       (chr_insn)&MacroAssembler::ldrh;
4340   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4341                                       (chr_insn)&MacroAssembler::ldrh;
4342   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4343   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4344 
4345   // Note, inline_string_indexOf() generates checks:
4346   // if (substr.count > string.count) return -1;
4347   // if (substr.count == 0) return 0;
4348 
4349   // We have two strings, a source string in str2, cnt2 and a pattern string
 4350   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4351 
4352   // For larger pattern and source we use a simplified Boyer Moore algorithm.
4353   // With a small pattern and source we use linear scan.
4354 
4355   if (icnt1 == -1) {
4356     sub(result_tmp, cnt2, cnt1);
4357     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4358     br(LT, LINEARSEARCH);
4359     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4360     cmp(cnt1, 256);
4361     lsr(tmp1, cnt2, 2);
4362     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4363     br(GE, LINEARSTUB);
4364   }
4365 
 4366 // The Boyer Moore algorithm is based on the description here:-
4367 //
4368 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4369 //
 4370 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
4371 // and the 'Good Suffix' rule.
4372 //
4373 // These rules are essentially heuristics for how far we can shift the
4374 // pattern along the search string.
4375 //
4376 // The implementation here uses the 'Bad Character' rule only because of the
4377 // complexity of initialisation for the 'Good Suffix' rule.
4378 //
4379 // This is also known as the Boyer-Moore-Horspool algorithm:-
4380 //


4446     const int firstStep = isL ? 7 : 3;
4447 
4448     const int ASIZE = 256;
4449     const int STORED_BYTES = 32; // amount of bytes stored per instruction
4450     sub(sp, sp, ASIZE);
4451     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4452     mov(ch1, sp);
4453     BIND(BM_INIT_LOOP);
4454       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4455       subs(tmp5, tmp5, 1);
4456       br(GT, BM_INIT_LOOP);
4457 
4458       sub(cnt1tmp, cnt1, 1);
4459       mov(tmp5, str2);
4460       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4461       sub(ch2, cnt1, 1);
4462       mov(tmp3, str1);
4463     BIND(BCLOOP);
4464       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4465       if (!str1_isL) {
4466         cmp(ch1, ASIZE);
4467         br(HS, BCSKIP);
4468       }
4469       strb(ch2, Address(sp, ch1));
4470     BIND(BCSKIP);
4471       subs(ch2, ch2, 1);
4472       br(GT, BCLOOP);
4473 
4474       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4475       if (str1_isL == str2_isL) {
4476         // load last 8 bytes (8LL/4UU symbols)
4477         ldr(tmp6, Address(tmp6, -wordSize));
4478       } else {
4479         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4480         // convert Latin1 to UTF. We'll have to wait until load completed, but
4481         // it's still faster than per-character loads+checks
4482         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4483         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4484         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4485         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4486         orr(ch2, ch1, ch2, LSL, 16);


4510         b(BMLOOPSTR1_CMP);
4511       }
4512     BIND(BMLOOPSTR1);
4513       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4514       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4515     BIND(BMLOOPSTR1_AFTER_LOAD);
4516       subs(cnt1tmp, cnt1tmp, 1);
4517       br(LT, BMLOOPSTR1_LASTCMP);
4518     BIND(BMLOOPSTR1_CMP);
4519       cmp(ch1, ch2);
4520       br(EQ, BMLOOPSTR1);
4521     BIND(BMSKIP);
4522       if (!isL) {
4523         // if we've met UTF symbol while searching Latin1 pattern, then we can
4524         // skip cnt1 symbols
4525         if (str1_isL != str2_isL) {
4526           mov(result_tmp, cnt1);
4527         } else {
4528           mov(result_tmp, 1);
4529         }
4530         cmp(skipch, ASIZE);
4531         br(HS, BMADV);
4532       }
4533       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4534     BIND(BMADV);
4535       sub(cnt1tmp, cnt1, 1);
4536       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4537       cmp(str2, str2end);
4538       br(LE, BMLOOPSTR2);
4539       add(sp, sp, ASIZE);
4540       b(NOMATCH);
4541     BIND(BMLOOPSTR1_LASTCMP);
4542       cmp(ch1, ch2);
4543       br(NE, BMSKIP);
4544     BIND(BMMATCH);
4545       sub(result, str2, tmp5);
4546       if (!str2_isL) lsr(result, result, 1);
4547       add(sp, sp, ASIZE);
4548       b(DONE);
4549 
4550     BIND(LINEARSTUB);
4551     cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
4552     br(LT, LINEAR_MEDIUM);
4553     mov(result, zr);
4554     RuntimeAddress stub = NULL;
4555     if (isL) {
4556       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4557       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4558     } else if (str1_isL) {
4559       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4560        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4561     } else {
4562       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4563       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4564     }
4565     trampoline_call(stub);
4566     b(DONE);
4567   }
4568 
4569   BIND(LINEARSEARCH);
4570   {
4571     Label DO1, DO2, DO3;
4572 
4573     Register str2tmp = tmp2;
4574     Register first = tmp3;
4575 
4576     if (icnt1 == -1)
4577     {
4578         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4579 
4580         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4581         br(LT, DOSHORT);
4582       BIND(LINEAR_MEDIUM);
4583         (this->*str1_load_1chr)(first, Address(str1));
4584         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4585         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4586         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4587         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4588 
4589       BIND(FIRST_LOOP);
4590         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4591         cmp(first, ch2);
4592         br(EQ, STR1_LOOP);
4593       BIND(STR2_NEXT);
4594         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4595         br(LE, FIRST_LOOP);
4596         b(NOMATCH);
4597 
4598       BIND(STR1_LOOP);
4599         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4600         add(cnt2tmp, cnt2_neg, str2_chr_size);
4601         br(GE, MATCH);
4602 
4603       BIND(STR1_NEXT);
4604         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4605         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4606         cmp(ch1, ch2);
4607         br(NE, STR2_NEXT);
4608         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4609         add(cnt2tmp, cnt2tmp, str2_chr_size);
4610         br(LT, STR1_NEXT);
4611         b(MATCH);
4612 
4613       BIND(DOSHORT);
4614       if (str1_isL == str2_isL) {
4615         cmp(cnt1, 2);
4616         br(LT, DO1);
4617         br(GT, DO3);
4618       }
4619     }
4620 
4621     if (icnt1 == 4) {
4622       Label CH1_LOOP;
4623 
4624         (this->*load_4chr)(ch1, str1);
4625         sub(result_tmp, cnt2, 4);
4626         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4627         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4628 
4629       BIND(CH1_LOOP);
4630         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4631         cmp(ch1, ch2);
4632         br(EQ, MATCH);
4633         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4634         br(LE, CH1_LOOP);
4635         b(NOMATCH);


4670         cmpw(first, ch2);
4671         br(EQ, STR1_LOOP);
4672       BIND(STR2_NEXT);
4673         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4674         br(LE, FIRST_LOOP);
4675         b(NOMATCH);
4676 
4677       BIND(STR1_LOOP);
4678         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4679         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4680         cmp(ch1, ch2);
4681         br(NE, STR2_NEXT);
4682         b(MATCH);
4683     }
4684 
4685     if (icnt1 == -1 || icnt1 == 1) {
4686       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4687 
4688       BIND(DO1);
4689         (this->*str1_load_1chr)(ch1, str1);
4690         cmp(cnt2, 8);
4691         br(LT, DO1_SHORT);
4692 
4693         sub(result_tmp, cnt2, 8/str2_chr_size);
4694         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4695         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4696         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4697 
4698         if (str2_isL) {
4699           orr(ch1, ch1, ch1, LSL, 8);
4700         }
4701         orr(ch1, ch1, ch1, LSL, 16);
4702         orr(ch1, ch1, ch1, LSL, 32);
4703       BIND(CH1_LOOP);
4704         ldr(ch2, Address(str2, cnt2_neg));
4705         eor(ch2, ch1, ch2);
4706         sub(tmp1, ch2, tmp3);
4707         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4708         bics(tmp1, tmp1, tmp2);
4709         br(NE, HAS_ZERO);
4710         adds(cnt2_neg, cnt2_neg, 8);
4711         br(LT, CH1_LOOP);
4712 
4713         cmp(cnt2_neg, 8);
4714         mov(cnt2_neg, 0);
4715         br(LT, CH1_LOOP);
4716         b(NOMATCH);
4717 
4718       BIND(HAS_ZERO);
4719         rev(tmp1, tmp1);
4720         clz(tmp1, tmp1);
4721         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4722         b(MATCH);
4723 
4724       BIND(DO1_SHORT);
4725         mov(result_tmp, cnt2);
4726         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4727         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4728       BIND(DO1_LOOP);
4729         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4730         cmpw(ch1, ch2);
4731         br(EQ, MATCH);
4732         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4733         br(LT, DO1_LOOP);


4736   BIND(NOMATCH);
4737     mov(result, -1);
4738     b(DONE);
4739   BIND(MATCH);
4740     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4741   BIND(DONE);
4742 }
4743 
// Member-function-pointer types used by the string intrinsics to select
// the load (ldrb vs. ldrh) and zero-extend (uxtbw vs. uxthw) instruction
// appropriate for Latin1 vs. UTF-16 encoded strings.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4746 
// Find the first occurrence of the 16-bit character 'ch' in the UTF-16
// string str1 of length cnt1 (in chars).  On exit, result holds the
// index of the first match, or -1 if 'ch' does not occur.
// Clobbers str1, cnt1, ch, tmp1, tmp2, tmp3, rscratch1, rscratch2.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;      // negative byte offset, counts up to 0
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  // Strings shorter than 4 chars are searched one char at a time.
  cmp(cnt1, 4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four 16-bit lanes of a 64-bit register so a
  // single 8-byte load can be tested against four characters at once.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  // Point str1 at (end - 4 chars) and run cnt1_neg from a negative byte
  // offset up towards 0, so loop termination is just the counter's sign.
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    // SWAR zero-lane test: XOR zeroes the lane that equals ch, then
    // (x - 0x0001) & ~(x | 0x7fff) leaves the top bit set only in
    // lanes that were zero.
    ldr(ch1, Address(str1, cnt1_neg));
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Tail: re-test the final 8-byte window (at offset 0, ending at the
    // string end) once; it may overlap chars already tested.
    cmp(cnt1_neg, 8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Locate the first matching lane: bit-reverse, count leading zeros,
    // then convert the bit position to a byte offset (>> 3).
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Scalar path for short strings: one halfword compare per iteration.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // index = base count + (signed byte offset) / 2 chars-per-byte.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}
4807 
4808 // Compare strings.
4809 void MacroAssembler::string_compare(Register str1, Register str2,
4810     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4811     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4812   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4813       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4814       SHORT_LOOP_START, TAIL_CHECK;
4815 
4816   const int STUB_THRESHOLD = 64 + 8;
4817   bool isLL = ae == StrIntrinsicNode::LL;
4818   bool isLU = ae == StrIntrinsicNode::LU;
4819   bool isUL = ae == StrIntrinsicNode::UL;
4820 
4821   bool str1_isL = isLL || isLU;
4822   bool str2_isL = isLL || isUL;
4823 
4824   int str1_chr_shift = str1_isL ? 0 : 1;
4825   int str2_chr_shift = str2_isL ? 0 : 1;
4826   int str1_chr_size = str1_isL ? 1 : 2;
4827   int str2_chr_size = str2_isL ? 1 : 2;
4828   int minCharsInWord = isLL ? wordSize : wordSize/2;
4829 
4830   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4831   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4832                                       (chr_insn)&MacroAssembler::ldrh;
4833   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4834                                       (chr_insn)&MacroAssembler::ldrh;
4835   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4836                             (uxt_insn)&MacroAssembler::uxthw;


5191     bind(TAIL01);
5192     if (elem_size == 1) { // Only needed when comparing byte arrays.
5193       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5194       {
5195         ldrb(tmp1, a1);
5196         ldrb(tmp2, a2);
5197         eorw(tmp5, tmp1, tmp2);
5198         cbnzw(tmp5, DONE);
5199       }
5200     }
5201   } else {
5202     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5203         CSET_EQ, LAST_CHECK;
5204     mov(result, false);
5205     cbz(a1, DONE);
5206     ldrw(cnt1, Address(a1, length_offset));
5207     cbz(a2, DONE);
5208     ldrw(cnt2, Address(a2, length_offset));
5209     // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
5210     // faster to perform another branch before comparing a1 and a2
5211     cmp(cnt1, elem_per_word);
5212     br(LE, SHORT); // short or same
5213     ldr(tmp3, Address(pre(a1, base_offset)));
5214     cmp(cnt1, stubBytesThreshold);
5215     br(GE, STUB);
5216     ldr(tmp4, Address(pre(a2, base_offset)));
5217     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5218     cmp(cnt2, cnt1);
5219     br(NE, DONE);
5220 
5221     // Main 16 byte comparison loop with 2 exits
5222     bind(NEXT_DWORD); {
5223       ldr(tmp1, Address(pre(a1, wordSize)));
5224       ldr(tmp2, Address(pre(a2, wordSize)));
5225       subs(cnt1, cnt1, 2 * elem_per_word);
5226       br(LE, TAIL);
5227       eor(tmp4, tmp3, tmp4);
5228       cbnz(tmp4, DONE);
5229       ldr(tmp3, Address(pre(a1, wordSize)));
5230       ldr(tmp4, Address(pre(a2, wordSize)));
5231       cmp(cnt1, elem_per_word);
5232       br(LE, TAIL2);
5233       cmp(tmp1, tmp2);
5234     } br(EQ, NEXT_DWORD);
5235     b(DONE);
5236 
5237     bind(TAIL);
5238     eor(tmp4, tmp3, tmp4);
5239     eor(tmp2, tmp1, tmp2);
5240     lslv(tmp2, tmp2, tmp5);
5241     orr(tmp5, tmp4, tmp2);
5242     cmp(tmp5, zr);
5243     b(CSET_EQ);
5244 
5245     bind(TAIL2);
5246     eor(tmp2, tmp1, tmp2);
5247     cbnz(tmp2, DONE);
5248     b(LAST_CHECK);
5249 
5250     bind(STUB);
5251     ldr(tmp4, Address(pre(a2, base_offset)));


5384 
// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
// Must be a power of two (asserted in zero_words()).
const int MacroAssembler::zero_words_block_size = 8;
5388 
5389 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5390 // possible, handling small word counts locally and delegating
5391 // anything larger to the zero_blocks stub.  It is expanded many times
5392 // in compiled code, so it is important to keep it short.
5393 
5394 // ptr:   Address of a buffer to be zeroed.
5395 // cnt:   Count in HeapWords.
5396 //
5397 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5398 void MacroAssembler::zero_words(Register ptr, Register cnt)
5399 {
5400   assert(is_power_of_2(zero_words_block_size), "adjust this");
5401   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5402 
5403   BLOCK_COMMENT("zero_words {");
5404   cmp(cnt, zero_words_block_size);
5405   Label around, done, done16;
5406   br(LO, around);
5407   {
5408     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5409     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5410     if (StubRoutines::aarch64::complete()) {
5411       trampoline_call(zero_blocks);
5412     } else {
5413       bl(zero_blocks);
5414     }
5415   }
5416   bind(around);
5417   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5418     Label l;
5419     tbz(cnt, exact_log2(i), l);
5420     for (int j = 0; j < i; j += 2) {
5421       stp(zr, zr, post(ptr, 16));
5422     }
5423     bind(l);
5424   }


5565 
5566   tbz(cnt, 0, fini);
5567   str(value, Address(post(base, 8)));
5568   bind(fini);
5569 }
5570 
// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
//
// Narrows up to 'len' 16-bit chars from src into single bytes at dst,
// stopping at the first char with a non-zero high byte (>= 0x100).
// On exit, result holds the index at which encoding stopped, which
// equals the initial len if every char fit in one byte.
// Uses NEON 32-chars-at-a-time and 8-chars-at-a-time paths before
// falling back to a scalar loop (except under BUILTIN_SIM).
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      cmp(len, 8); // handle shortest strings first
      br(LT, LOOP_1);
      cmp(len, 32);
      br(LT, NEXT_8);
      // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
      // to convert chars to bytes
      if (SoftwarePrefetchHintDistance >= 0) {
        // Prefetching variant of the 32-char loop; only used while
        // enough data remains for the prefetch distance to pay off.
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        cmp(len, SoftwarePrefetchHintDistance/2 + 16);
        br(LE, NEXT_32_START);
        b(NEXT_32_PRFM_START);
        BIND(NEXT_32_PRFM);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_PRFM_START);
          prfm(Address(src, SoftwarePrefetchHintDistance));
          orr(v4, T16B, Vtmp1, Vtmp2);
          orr(v5, T16B, Vtmp3, Vtmp4);
          uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);   // gather the low bytes ...
          uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
          stpq(Vtmp1, Vtmp3, dst);           // ... and store them speculatively
          uzp2(v5, T16B, v4, v5); // high bytes
          umov(tmp2, v5, D, 1);
          fmovd(tmp1, v5);
          orr(tmp1, tmp1, tmp2);
          // A non-zero high byte somewhere in this 32-char chunk: drop to
          // the smaller-granularity loops to find where encoding stops.
          cbnz(tmp1, LOOP_8);
          sub(len, len, 32);
          add(dst, dst, 32);
          add(src, src, 64);
          cmp(len, SoftwarePrefetchHintDistance/2 + 16);
          br(GE, NEXT_32_PRFM);
          cmp(len, 32);
          br(LT, LOOP_8);
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
        BIND(NEXT_32_START);
      } else {
        BIND(NEXT_32);
          ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      }
      // Non-prefetching 32-char loop (also the tail of the prefetching one).
      prfm(Address(src, SoftwarePrefetchHintDistance));
      uzp1(v4, T16B, Vtmp1, Vtmp2);
      uzp1(v5, T16B, Vtmp3, Vtmp4);
      stpq(v4, v5, dst);
      orr(Vtmp1, T16B, Vtmp1, Vtmp2);
      orr(Vtmp3, T16B, Vtmp3, Vtmp4);
      uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
      umov(tmp2, Vtmp1, D, 1);
      fmovd(tmp1, Vtmp1);
      orr(tmp1, tmp1, tmp2);
      cbnz(tmp1, LOOP_8);
      sub(len, len, 32);
      add(dst, dst, 32);
      add(src, src, 64);
      cmp(len, 32);
      br(GE, NEXT_32);
      cbz(len, DONE);

    BIND(LOOP_8);
      // 8-chars-at-a-time path for the remainder (or after a 32-char
      // chunk contained a non-Latin-1 char).
      cmp(len, 8);
      br(LT, LOOP_1);
    BIND(NEXT_8);
      ld1(Vtmp1, T8H, src);
      uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
      uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
      strd(Vtmp2, dst);
      fmovd(tmp1, Vtmp3);
      cbnz(tmp1, NEXT_1);              // high byte seen: locate it scalar-wise

      sub(len, len, 8);
      add(dst, dst, 8);
      add(src, src, 16);
      cmp(len, 8);
      br(GE, NEXT_8);

    BIND(LOOP_1);
#endif
    // Scalar loop: copy one char per iteration until a char with a
    // non-zero high byte is found or len chars are done.
    cbz(len, DONE);
    BIND(NEXT_1);
      ldrh(tmp1, Address(post(src, 2)));
      strb(tmp1, Address(post(dst, 1)));
      tst(tmp1, 0xff00);
      br(NE, SET_RESULT);
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(SET_RESULT);
      sub(result, result, len); // Return index where we stopped
                                // Return len == 0 if we processed all
                                // characters
    BIND(DONE);
}
5676 


5713     b(done);
5714   }
5715 
5716   if (SoftwarePrefetchHintDistance >= 0) {
5717     bind(to_stub);
5718       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5719       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5720       trampoline_call(stub);
5721       b(after_init);
5722   }
5723 
5724   // Unpack the bytes 8 at a time.
5725   bind(big);
5726   {
5727     Label loop, around, loop_last, loop_start;
5728 
5729     if (SoftwarePrefetchHintDistance >= 0) {
5730       const int large_loop_threshold = (64 + 16)/8;
5731       ldrd(vtmp2, post(src, 8));
5732       andw(len, len, 7);
5733       cmp(tmp4, large_loop_threshold);
5734       br(GE, to_stub);
5735       b(loop_start);
5736 
5737       bind(loop);
5738       ldrd(vtmp2, post(src, 8));
5739       bind(loop_start);
5740       subs(tmp4, tmp4, 1);
5741       br(EQ, loop_last);
5742       zip1(vtmp2, T16B, vtmp2, vtmp1);
5743       ldrd(vtmp3, post(src, 8));
5744       st1(vtmp2, T8H, post(dst, 16));
5745       subs(tmp4, tmp4, 1);
5746       zip1(vtmp3, T16B, vtmp3, vtmp1);
5747       st1(vtmp3, T8H, post(dst, 16));
5748       br(NE, loop);
5749       b(around);
5750       bind(loop_last);
5751       zip1(vtmp2, T16B, vtmp2, vtmp1);
5752       st1(vtmp2, T8H, post(dst, 16));
5753       bind(around);




 477 
 478   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
 479   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 480   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 481   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 482   Address saved_mark_addr(lock_reg, 0);
 483 
 484   // Biased locking
 485   // See whether the lock is currently biased toward our thread and
 486   // whether the epoch is still valid
 487   // Note that the runtime guarantees sufficient alignment of JavaThread
 488   // pointers to allow age to be placed into low bits
 489   // First check to see whether biasing is even enabled for this object
 490   Label cas_label;
 491   int null_check_offset = -1;
 492   if (!swap_reg_contains_mark) {
 493     null_check_offset = offset();
 494     ldr(swap_reg, mark_addr);
 495   }
 496   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
 497   cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
 498   br(Assembler::NE, cas_label);
 499   // The bias pattern is present in the object's header. Need to check
 500   // whether the bias owner and the epoch are both still current.
 501   load_prototype_header(tmp_reg, obj_reg);
 502   orr(tmp_reg, tmp_reg, rthread);
 503   eor(tmp_reg, swap_reg, tmp_reg);
 504   andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
 505   if (counters != NULL) {
 506     Label around;
 507     cbnz(tmp_reg, around);
 508     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
 509     b(done);
 510     bind(around);
 511   } else {
 512     cbz(tmp_reg, done);
 513   }
 514 
 515   Label try_revoke_bias;
 516   Label try_rebias;
 517 


 616     }
 617     bind(nope);
 618   }
 619 
 620   bind(cas_label);
 621 
 622   return null_check_offset;
 623 }
 624 
// Emit the biased-locking unlock fast path: if the object's mark word
// still carries the biased-lock bit pattern, exiting the monitor is a
// no-op and control branches to 'done'.
//   obj_reg  - object whose monitor is being exited (only read here)
//   temp_reg - scratch; clobbered with the masked mark word
//   done     - branch target taken when the lock is still biased
 625 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 626   assert(UseBiasedLocking, "why call this otherwise?");
 627 
 628   // Check for biased locking unlock case, which is a no-op
 629   // Note: we do not have to check the thread ID for two reasons.
 630   // First, the interpreter checks for IllegalMonitorStateException at
 631   // a higher level. Second, if the bias was revoked while we held the
 632   // lock, the object could not be rebiased toward another thread, so
 633   // the bias bit would be clear.
 634   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));  // load the mark word
 635   andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);  // keep only the lock-state bits
 636   cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
 637   br(Assembler::EQ, done);  // still biased -> nothing to undo
 638 }
 639 
 640 static void pass_arg0(MacroAssembler* masm, Register arg) {
 641   if (c_rarg0 != arg ) {
 642     masm->mov(c_rarg0, arg);
 643   }
 644 }
 645 
 646 static void pass_arg1(MacroAssembler* masm, Register arg) {
 647   if (c_rarg1 != arg ) {
 648     masm->mov(c_rarg1, arg);
 649   }
 650 }
 651 
 652 static void pass_arg2(MacroAssembler* masm, Register arg) {
 653   if (c_rarg2 != arg ) {
 654     masm->mov(c_rarg2, arg);
 655   }
 656 }


1112     ldrw(temp_reg, super_check_offset_addr);
1113     super_check_offset = RegisterOrConstant(temp_reg);
1114   }
1115   Address super_check_addr(sub_klass, super_check_offset);
1116   ldr(rscratch1, super_check_addr);
1117   cmp(super_klass, rscratch1); // load displayed supertype
1118 
1119   // This check has worked decisively for primary supers.
1120   // Secondary supers are sought in the super_cache ('super_cache_addr').
1121   // (Secondary supers are interfaces and very deeply nested subtypes.)
1122   // This works in the same check above because of a tricky aliasing
1123   // between the super_cache and the primary super display elements.
1124   // (The 'super_check_addr' can address either, as the case requires.)
1125   // Note that the cache is updated below if it does not help us find
1126   // what we need immediately.
1127   // So if it was a primary super, we can just fail immediately.
1128   // Otherwise, it's the slow path for us (no success at this point).
1129 
1130   if (super_check_offset.is_register()) {
1131     br(Assembler::EQ, *L_success);
1132     subs(zr, super_check_offset.as_register(), sc_offset);
1133     if (L_failure == &L_fallthrough) {
1134       br(Assembler::EQ, *L_slow_path);
1135     } else {
1136       br(Assembler::NE, *L_failure);
1137       final_jmp(*L_slow_path);
1138     }
1139   } else if (super_check_offset.as_constant() == sc_offset) {
1140     // Need a slow path; fast failure is impossible.
1141     if (L_slow_path == &L_fallthrough) {
1142       br(Assembler::EQ, *L_success);
1143     } else {
1144       br(Assembler::NE, *L_slow_path);
1145       final_jmp(*L_success);
1146     }
1147   } else {
1148     // No slow path; it's a fast decision.
1149     if (L_failure == &L_fallthrough) {
1150       br(Assembler::EQ, *L_success);
1151     } else {
1152       br(Assembler::NE, *L_failure);


3287 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3288         Register table0, Register table1, Register table2, Register table3,
3289         Register tmp, Register tmp2, Register tmp3) {
3290   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3291   unsigned long offset;
3292 
3293   if (UseCRC32) {
3294       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3295       return;
3296   }
3297 
3298     mvnw(crc, crc);
3299 
3300     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3301     if (offset) add(table0, table0, offset);
3302     add(table1, table0, 1*256*sizeof(juint));
3303     add(table2, table0, 2*256*sizeof(juint));
3304     add(table3, table0, 3*256*sizeof(juint));
3305 
3306   if (UseNeon) {
3307       cmp(len, (u1)64);
3308       br(Assembler::LT, L_by16);
3309       eor(v16, T16B, v16, v16);
3310 
3311     Label L_fold;
3312 
3313       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3314 
3315       ld1(v0, v1, T2D, post(buf, 32));
3316       ld1r(v4, T2D, post(tmp, 8));
3317       ld1r(v5, T2D, post(tmp, 8));
3318       ld1r(v6, T2D, post(tmp, 8));
3319       ld1r(v7, T2D, post(tmp, 8));
3320       mov(v16, T4S, 0, crc);
3321 
3322       eor(v0, T16B, v0, v16);
3323       sub(len, len, 64);
3324 
3325     BIND(L_fold);
3326       pmull(v22, T8H, v0, v5, T8B);
3327       pmull(v20, T8H, v0, v7, T8B);


4337   int str2_chr_size = str2_isL ? 1:2;
4338   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4339                                       (chr_insn)&MacroAssembler::ldrh;
4340   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4341                                       (chr_insn)&MacroAssembler::ldrh;
4342   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4343   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4344 
4345   // Note, inline_string_indexOf() generates checks:
4346   // if (substr.count > string.count) return -1;
4347   // if (substr.count == 0) return 0;
4348 
4349   // We have two strings, a source string in str2, cnt2 and a pattern string
4350   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4351 
4352   // For larger pattern and source we use a simplified Boyer Moore algorithm.
4353   // With a small pattern and source we use linear scan.
4354 
4355   if (icnt1 == -1) {
4356     sub(result_tmp, cnt2, cnt1);
4357     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4358     br(LT, LINEARSEARCH);
4359     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4360     subs(zr, cnt1, 256);
4361     lsr(tmp1, cnt2, 2);
4362     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4363     br(GE, LINEARSTUB);
4364   }
4365 
4366 // The Boyer Moore algorithm is based on the description here:-
4367 //
4368 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4369 //
4370 // This describes an algorithm with 2 shift rules. The 'Bad Character' rule
4371 // and the 'Good Suffix' rule.
4372 //
4373 // These rules are essentially heuristics for how far we can shift the
4374 // pattern along the search string.
4375 //
4376 // The implementation here uses the 'Bad Character' rule only because of the
4377 // complexity of initialisation for the 'Good Suffix' rule.
4378 //
4379 // This is also known as the Boyer-Moore-Horspool algorithm:-
4380 //


4446     const int firstStep = isL ? 7 : 3;
4447 
4448     const int ASIZE = 256;
4449     const int STORED_BYTES = 32; // amount of bytes stored per instruction
4450     sub(sp, sp, ASIZE);
4451     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4452     mov(ch1, sp);
4453     BIND(BM_INIT_LOOP);
4454       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4455       subs(tmp5, tmp5, 1);
4456       br(GT, BM_INIT_LOOP);
4457 
4458       sub(cnt1tmp, cnt1, 1);
4459       mov(tmp5, str2);
4460       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4461       sub(ch2, cnt1, 1);
4462       mov(tmp3, str1);
4463     BIND(BCLOOP);
4464       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4465       if (!str1_isL) {
4466         subs(zr, ch1, ASIZE);
4467         br(HS, BCSKIP);
4468       }
4469       strb(ch2, Address(sp, ch1));
4470     BIND(BCSKIP);
4471       subs(ch2, ch2, 1);
4472       br(GT, BCLOOP);
4473 
4474       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4475       if (str1_isL == str2_isL) {
4476         // load last 8 bytes (8LL/4UU symbols)
4477         ldr(tmp6, Address(tmp6, -wordSize));
4478       } else {
4479         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4480         // convert Latin1 to UTF. We'll have to wait until load completed, but
4481         // it's still faster than per-character loads+checks
4482         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4483         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4484         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4485         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4486         orr(ch2, ch1, ch2, LSL, 16);


4510         b(BMLOOPSTR1_CMP);
4511       }
4512     BIND(BMLOOPSTR1);
4513       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4514       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4515     BIND(BMLOOPSTR1_AFTER_LOAD);
4516       subs(cnt1tmp, cnt1tmp, 1);
4517       br(LT, BMLOOPSTR1_LASTCMP);
4518     BIND(BMLOOPSTR1_CMP);
4519       cmp(ch1, ch2);
4520       br(EQ, BMLOOPSTR1);
4521     BIND(BMSKIP);
4522       if (!isL) {
4523         // if we've met UTF symbol while searching Latin1 pattern, then we can
4524         // skip cnt1 symbols
4525         if (str1_isL != str2_isL) {
4526           mov(result_tmp, cnt1);
4527         } else {
4528           mov(result_tmp, 1);
4529         }
4530         subs(zr, skipch, ASIZE);
4531         br(HS, BMADV);
4532       }
4533       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4534     BIND(BMADV);
4535       sub(cnt1tmp, cnt1, 1);
4536       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4537       cmp(str2, str2end);
4538       br(LE, BMLOOPSTR2);
4539       add(sp, sp, ASIZE);
4540       b(NOMATCH);
4541     BIND(BMLOOPSTR1_LASTCMP);
4542       cmp(ch1, ch2);
4543       br(NE, BMSKIP);
4544     BIND(BMMATCH);
4545       sub(result, str2, tmp5);
4546       if (!str2_isL) lsr(result, result, 1);
4547       add(sp, sp, ASIZE);
4548       b(DONE);
4549 
4550     BIND(LINEARSTUB);
4551     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
4552     br(LT, LINEAR_MEDIUM);
4553     mov(result, zr);
4554     RuntimeAddress stub = NULL;
4555     if (isL) {
4556       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4557       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4558     } else if (str1_isL) {
4559       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4560        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4561     } else {
4562       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4563       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4564     }
4565     trampoline_call(stub);
4566     b(DONE);
4567   }
4568 
4569   BIND(LINEARSEARCH);
4570   {
4571     Label DO1, DO2, DO3;
4572 
4573     Register str2tmp = tmp2;
4574     Register first = tmp3;
4575 
4576     if (icnt1 == -1)
4577     {
4578         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4579 
4580         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4581         br(LT, DOSHORT);
4582       BIND(LINEAR_MEDIUM);
4583         (this->*str1_load_1chr)(first, Address(str1));
4584         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4585         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4586         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4587         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4588 
4589       BIND(FIRST_LOOP);
4590         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4591         cmp(first, ch2);
4592         br(EQ, STR1_LOOP);
4593       BIND(STR2_NEXT);
4594         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4595         br(LE, FIRST_LOOP);
4596         b(NOMATCH);
4597 
4598       BIND(STR1_LOOP);
4599         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4600         add(cnt2tmp, cnt2_neg, str2_chr_size);
4601         br(GE, MATCH);
4602 
4603       BIND(STR1_NEXT);
4604         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4605         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4606         cmp(ch1, ch2);
4607         br(NE, STR2_NEXT);
4608         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4609         add(cnt2tmp, cnt2tmp, str2_chr_size);
4610         br(LT, STR1_NEXT);
4611         b(MATCH);
4612 
4613       BIND(DOSHORT);
4614       if (str1_isL == str2_isL) {
4615         cmp(cnt1, (u1)2);
4616         br(LT, DO1);
4617         br(GT, DO3);
4618       }
4619     }
4620 
4621     if (icnt1 == 4) {
4622       Label CH1_LOOP;
4623 
4624         (this->*load_4chr)(ch1, str1);
4625         sub(result_tmp, cnt2, 4);
4626         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4627         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4628 
4629       BIND(CH1_LOOP);
4630         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4631         cmp(ch1, ch2);
4632         br(EQ, MATCH);
4633         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4634         br(LE, CH1_LOOP);
4635         b(NOMATCH);


4670         cmpw(first, ch2);
4671         br(EQ, STR1_LOOP);
4672       BIND(STR2_NEXT);
4673         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4674         br(LE, FIRST_LOOP);
4675         b(NOMATCH);
4676 
4677       BIND(STR1_LOOP);
4678         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4679         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4680         cmp(ch1, ch2);
4681         br(NE, STR2_NEXT);
4682         b(MATCH);
4683     }
4684 
4685     if (icnt1 == -1 || icnt1 == 1) {
4686       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4687 
4688       BIND(DO1);
4689         (this->*str1_load_1chr)(ch1, str1);
4690         cmp(cnt2, (u1)8);
4691         br(LT, DO1_SHORT);
4692 
4693         sub(result_tmp, cnt2, 8/str2_chr_size);
4694         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4695         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4696         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4697 
4698         if (str2_isL) {
4699           orr(ch1, ch1, ch1, LSL, 8);
4700         }
4701         orr(ch1, ch1, ch1, LSL, 16);
4702         orr(ch1, ch1, ch1, LSL, 32);
4703       BIND(CH1_LOOP);
4704         ldr(ch2, Address(str2, cnt2_neg));
4705         eor(ch2, ch1, ch2);
4706         sub(tmp1, ch2, tmp3);
4707         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4708         bics(tmp1, tmp1, tmp2);
4709         br(NE, HAS_ZERO);
4710         adds(cnt2_neg, cnt2_neg, 8);
4711         br(LT, CH1_LOOP);
4712 
4713         cmp(cnt2_neg, (u1)8);
4714         mov(cnt2_neg, 0);
4715         br(LT, CH1_LOOP);
4716         b(NOMATCH);
4717 
4718       BIND(HAS_ZERO);
4719         rev(tmp1, tmp1);
4720         clz(tmp1, tmp1);
4721         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4722         b(MATCH);
4723 
4724       BIND(DO1_SHORT);
4725         mov(result_tmp, cnt2);
4726         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4727         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4728       BIND(DO1_LOOP);
4729         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4730         cmpw(ch1, ch2);
4731         br(EQ, MATCH);
4732         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4733         br(LT, DO1_LOOP);


4736   BIND(NOMATCH);
4737     mov(result, -1);
4738     b(DONE);
4739   BIND(MATCH);
4740     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4741   BIND(DONE);
4742 }
4743 
4744 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4745 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4746 
// Emit code that finds the first occurrence of the 16-bit char 'ch' in the
// UTF-16 string at 'str1' of length 'cnt1' chars (ldrh loads below imply
// 2-byte chars; Address::uxtw(1) scales indices by 2).
//   result - receives the index of the match, or -1 if not found
//   tmp1/tmp2/tmp3 - scratch registers; cnt1 is clobbered (reused as the
//   negative byte offset cnt1_neg), and rscratch1/rscratch2 are used too.
// Strings of >= 4 chars are scanned 8 bytes (4 chars) at a time with a
// SWAR zero-halfword test; shorter strings use a simple per-char loop.
4747 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4748                                          Register ch, Register result,
4749                                          Register tmp1, Register tmp2, Register tmp3)
4750 {
4751   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4752   Register cnt1_neg = cnt1;
4753   Register ch1 = rscratch1;
4754   Register result_tmp = rscratch2;
4755 
4756   cmp(cnt1, (u1)4);
4757   br(LT, DO1_SHORT);  // fewer than 4 chars: scalar loop
4758 
     // Replicate the 16-bit char into all four halfwords of 'ch'.
4759   orr(ch, ch, ch, LSL, 16);
4760   orr(ch, ch, ch, LSL, 32);
4761 
     // Point str1 at (end - 4 chars) and scan with a negative byte offset
     // that counts up toward zero.
4762   sub(cnt1, cnt1, 4);
4763   mov(result_tmp, cnt1);
4764   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4765   sub(cnt1_neg, zr, cnt1, LSL, 1);
4766 
4767   mov(tmp3, 0x0001000100010001);
4768 
4769   BIND(CH1_LOOP);
4770     ldr(ch1, Address(str1, cnt1_neg));
4771     eor(ch1, ch, ch1);  // matching halfword becomes zero
     // SWAR zero-halfword detect: (x - 0x0001..) & ~(x | 0x7fff..) is
     // nonzero iff some halfword of x is zero.
4772     sub(tmp1, ch1, tmp3);
4773     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4774     bics(tmp1, tmp1, tmp2);
4775     br(NE, HAS_ZERO);
4776     adds(cnt1_neg, cnt1_neg, 8);
4777     br(LT, CH1_LOOP);
4778 
     // Tail: rerun the loop once at offset 0 (the last 4 chars, which may
     // overlap the previous word). On the second exit cnt1_neg == 8, so we
     // fall through to NOMATCH instead of looping again.
4779     cmp(cnt1_neg, (u1)8);
4780     mov(cnt1_neg, 0);
4781     br(LT, CH1_LOOP);
4782     b(NOMATCH);
4783 
4784   BIND(HAS_ZERO);
     // Locate the lowest-addressed zero halfword: byte-reverse, count
     // leading zero bits, convert bits to bytes (LSR 3).
4785     rev(tmp1, tmp1);
4786     clz(tmp1, tmp1);
4787     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4788     b(MATCH);
4789 
4790   BIND(DO1_SHORT);
4791     mov(result_tmp, cnt1);
4792     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4793     sub(cnt1_neg, zr, cnt1, LSL, 1);
4794   BIND(DO1_LOOP);
4795     ldrh(ch1, Address(str1, cnt1_neg));
4796     cmpw(ch, ch1);
4797     br(EQ, MATCH);
4798     adds(cnt1_neg, cnt1_neg, 2);
4799     br(LT, DO1_LOOP);
4800   BIND(NOMATCH);
4801     mov(result, -1);
4802     b(DONE);
4803   BIND(MATCH);
     // Convert the final byte offset back to a char index.
4804     add(result, result_tmp, cnt1_neg, ASR, 1);
4805   BIND(DONE);
4806 }
4807 
4808 // Compare strings.
4809 void MacroAssembler::string_compare(Register str1, Register str2,
4810     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4811     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4812   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4813       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4814       SHORT_LOOP_START, TAIL_CHECK;
4815 
4816   const u1 STUB_THRESHOLD = 64 + 8;
4817   bool isLL = ae == StrIntrinsicNode::LL;
4818   bool isLU = ae == StrIntrinsicNode::LU;
4819   bool isUL = ae == StrIntrinsicNode::UL;
4820 
4821   bool str1_isL = isLL || isLU;
4822   bool str2_isL = isLL || isUL;
4823 
4824   int str1_chr_shift = str1_isL ? 0 : 1;
4825   int str2_chr_shift = str2_isL ? 0 : 1;
4826   int str1_chr_size = str1_isL ? 1 : 2;
4827   int str2_chr_size = str2_isL ? 1 : 2;
4828   int minCharsInWord = isLL ? wordSize : wordSize/2;
4829 
4830   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4831   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4832                                       (chr_insn)&MacroAssembler::ldrh;
4833   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4834                                       (chr_insn)&MacroAssembler::ldrh;
4835   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4836                             (uxt_insn)&MacroAssembler::uxthw;


5191     bind(TAIL01);
5192     if (elem_size == 1) { // Only needed when comparing byte arrays.
5193       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5194       {
5195         ldrb(tmp1, a1);
5196         ldrb(tmp2, a2);
5197         eorw(tmp5, tmp1, tmp2);
5198         cbnzw(tmp5, DONE);
5199       }
5200     }
5201   } else {
5202     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5203         CSET_EQ, LAST_CHECK;
5204     mov(result, false);
5205     cbz(a1, DONE);
5206     ldrw(cnt1, Address(a1, length_offset));
5207     cbz(a2, DONE);
5208     ldrw(cnt2, Address(a2, length_offset));
5209     // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
5210     // faster to perform another branch before comparing a1 and a2
5211     cmp(cnt1, (u1)elem_per_word);
5212     br(LE, SHORT); // short or same
5213     ldr(tmp3, Address(pre(a1, base_offset)));
5214     subs(zr, cnt1, stubBytesThreshold);
5215     br(GE, STUB);
5216     ldr(tmp4, Address(pre(a2, base_offset)));
5217     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5218     cmp(cnt2, cnt1);
5219     br(NE, DONE);
5220 
5221     // Main 16 byte comparison loop with 2 exits
5222     bind(NEXT_DWORD); {
5223       ldr(tmp1, Address(pre(a1, wordSize)));
5224       ldr(tmp2, Address(pre(a2, wordSize)));
5225       subs(cnt1, cnt1, 2 * elem_per_word);
5226       br(LE, TAIL);
5227       eor(tmp4, tmp3, tmp4);
5228       cbnz(tmp4, DONE);
5229       ldr(tmp3, Address(pre(a1, wordSize)));
5230       ldr(tmp4, Address(pre(a2, wordSize)));
5231       cmp(cnt1, (u1)elem_per_word);
5232       br(LE, TAIL2);
5233       cmp(tmp1, tmp2);
5234     } br(EQ, NEXT_DWORD);
5235     b(DONE);
5236 
5237     bind(TAIL);
5238     eor(tmp4, tmp3, tmp4);
5239     eor(tmp2, tmp1, tmp2);
5240     lslv(tmp2, tmp2, tmp5);
5241     orr(tmp5, tmp4, tmp2);
5242     cmp(tmp5, zr);
5243     b(CSET_EQ);
5244 
5245     bind(TAIL2);
5246     eor(tmp2, tmp1, tmp2);
5247     cbnz(tmp2, DONE);
5248     b(LAST_CHECK);
5249 
5250     bind(STUB);
5251     ldr(tmp4, Address(pre(a2, base_offset)));


5384 
5385 // The size of the blocks erased by the zero_blocks stub.  We must
5386 // handle anything smaller than this ourselves in zero_words().
5387 const int MacroAssembler::zero_words_block_size = 8;
5388 
5389 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5390 // possible, handling small word counts locally and delegating
5391 // anything larger to the zero_blocks stub.  It is expanded many times
5392 // in compiled code, so it is important to keep it short.
5393 
5394 // ptr:   Address of a buffer to be zeroed.
5395 // cnt:   Count in HeapWords.
5396 //
5397 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5398 void MacroAssembler::zero_words(Register ptr, Register cnt)
5399 {
5400   assert(is_power_of_2(zero_words_block_size), "adjust this");
5401   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5402 
5403   BLOCK_COMMENT("zero_words {");
5404   cmp(cnt, (u1)zero_words_block_size);
5405   Label around, done, done16;
5406   br(LO, around);
5407   {
5408     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5409     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5410     if (StubRoutines::aarch64::complete()) {
5411       trampoline_call(zero_blocks);
5412     } else {
5413       bl(zero_blocks);
5414     }
5415   }
5416   bind(around);
5417   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5418     Label l;
5419     tbz(cnt, exact_log2(i), l);
5420     for (int j = 0; j < i; j += 2) {
5421       stp(zr, zr, post(ptr, 16));
5422     }
5423     bind(l);
5424   }


5565 
5566   tbz(cnt, 0, fini);
5567   str(value, Address(post(base, 8)));
5568   bind(fini);
5569 }
5570 
5571 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5572 // java/lang/StringUTF16.compress.
// Emit code that narrows 'len' UTF-16 chars at 'src' to Latin-1 bytes at
// 'dst', stopping at the first char with a nonzero high byte (i.e. one
// that cannot be encoded in ISO-8859-1).
//   result - receives the number of chars successfully encoded
//   Vtmp1..Vtmp4 - SIMD scratch; v4/v5 and rscratch1/rscratch2 are also
//   clobbered. len, src and dst are advanced as encoding proceeds.
// Strategy: 32 chars per iteration with uzp1 (low bytes) / uzp2 (high
// bytes, used only to detect non-encodable chars), then 8 at a time,
// then a scalar loop for the remainder.
5573 void MacroAssembler::encode_iso_array(Register src, Register dst,
5574                       Register len, Register result,
5575                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5576                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5577 {
5578     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5579         NEXT_32_START, NEXT_32_PRFM_START;
5580     Register tmp1 = rscratch1, tmp2 = rscratch2;
5581 
5582       mov(result, len); // Save initial len
5583 
5584 #ifndef BUILTIN_SIM
5585       cmp(len, (u1)8); // handle shortest strings first
5586       br(LT, LOOP_1);
5587       cmp(len, (u1)32);
5588       br(LT, NEXT_8);
5589       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5590       // to convert chars to bytes
5591       if (SoftwarePrefetchHintDistance >= 0) {
         // Prefetching variant: a separate NEXT_32_PRFM loop issues a
         // prfm each iteration while enough data remains.
5592         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5593         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5594         br(LE, NEXT_32_START);
5595         b(NEXT_32_PRFM_START);
5596         BIND(NEXT_32_PRFM);
5597           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5598         BIND(NEXT_32_PRFM_START);
5599           prfm(Address(src, SoftwarePrefetchHintDistance));
5600           orr(v4, T16B, Vtmp1, Vtmp2);
5601           orr(v5, T16B, Vtmp3, Vtmp4);
5602           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);  // pack low bytes of 16 chars
5603           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);  // pack low bytes of next 16
5604           stpq(Vtmp1, Vtmp3, dst);          // store 32 result bytes
5605           uzp2(v5, T16B, v4, v5); // high bytes
5606           umov(tmp2, v5, D, 1);
5607           fmovd(tmp1, v5);
5608           orr(tmp1, tmp1, tmp2);
5609           cbnz(tmp1, LOOP_8);  // some high byte nonzero: redo this chunk in smaller steps
5610           sub(len, len, 32);
5611           add(dst, dst, 32);
5612           add(src, src, 64);
5613           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5614           br(GE, NEXT_32_PRFM);
5615           cmp(len, (u1)32);
5616           br(LT, LOOP_8);
5617         BIND(NEXT_32);
5618           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5619         BIND(NEXT_32_START);
5620       } else {
5621         BIND(NEXT_32);
5622           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5623       }
     // Non-prefetching (or final-stretch) 32-char iteration; same packing
     // and high-byte check as above.
5624       prfm(Address(src, SoftwarePrefetchHintDistance));
5625       uzp1(v4, T16B, Vtmp1, Vtmp2);
5626       uzp1(v5, T16B, Vtmp3, Vtmp4);
5627       stpq(v4, v5, dst);
5628       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5629       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5630       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5631       umov(tmp2, Vtmp1, D, 1);
5632       fmovd(tmp1, Vtmp1);
5633       orr(tmp1, tmp1, tmp2);
5634       cbnz(tmp1, LOOP_8);  // non-encodable char in this chunk
5635       sub(len, len, 32);
5636       add(dst, dst, 32);
5637       add(src, src, 64);
5638       cmp(len, (u1)32);
5639       br(GE, NEXT_32);
5640       cbz(len, DONE);
5641 
5642     BIND(LOOP_8);
5643       cmp(len, (u1)8);
5644       br(LT, LOOP_1);
5645     BIND(NEXT_8);
5646       ld1(Vtmp1, T8H, src);
5647       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5648       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5649       strd(Vtmp2, dst);
5650       fmovd(tmp1, Vtmp3);
5651       cbnz(tmp1, NEXT_1);  // non-encodable char: fall back to scalar loop
5652 
5653       sub(len, len, 8);
5654       add(dst, dst, 8);
5655       add(src, src, 16);
5656       cmp(len, (u1)8);
5657       br(GE, NEXT_8);
5658 
5659     BIND(LOOP_1);
5660 #endif
     // Scalar loop: copy one char per iteration until a char with a
     // nonzero high byte is seen (its low byte has already been stored,
     // but it is not counted in 'result' below).
5661     cbz(len, DONE);
5662     BIND(NEXT_1);
5663       ldrh(tmp1, Address(post(src, 2)));
5664       strb(tmp1, Address(post(dst, 1)));
5665       tst(tmp1, 0xff00);  // char > 0xFF cannot be Latin-1 encoded
5666       br(NE, SET_RESULT);
5667       subs(len, len, 1);
5668       br(GT, NEXT_1);
5669 
5670     BIND(SET_RESULT);
5671       sub(result, result, len); // Return index where we stopped
5672                                 // Return len == 0 if we processed all
5673                                 // characters
5674     BIND(DONE);
5675 }
5676 


5713     b(done);
5714   }
5715 
5716   if (SoftwarePrefetchHintDistance >= 0) {
5717     bind(to_stub);
5718       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5719       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5720       trampoline_call(stub);
5721       b(after_init);
5722   }
5723 
5724   // Unpack the bytes 8 at a time.
5725   bind(big);
5726   {
5727     Label loop, around, loop_last, loop_start;
5728 
5729     if (SoftwarePrefetchHintDistance >= 0) {
5730       const int large_loop_threshold = (64 + 16)/8;
5731       ldrd(vtmp2, post(src, 8));
5732       andw(len, len, 7);
5733       cmp(tmp4, (u1)large_loop_threshold);
5734       br(GE, to_stub);
5735       b(loop_start);
5736 
5737       bind(loop);
5738       ldrd(vtmp2, post(src, 8));
5739       bind(loop_start);
5740       subs(tmp4, tmp4, 1);
5741       br(EQ, loop_last);
5742       zip1(vtmp2, T16B, vtmp2, vtmp1);
5743       ldrd(vtmp3, post(src, 8));
5744       st1(vtmp2, T8H, post(dst, 16));
5745       subs(tmp4, tmp4, 1);
5746       zip1(vtmp3, T16B, vtmp3, vtmp1);
5747       st1(vtmp3, T8H, post(dst, 16));
5748       br(NE, loop);
5749       b(around);
5750       bind(loop_last);
5751       zip1(vtmp2, T16B, vtmp2, vtmp1);
5752       st1(vtmp2, T8H, post(dst, 16));
5753       bind(around);


< prev index next >