4649 // Shift ary1 and ary2 to the end of the arrays, negate limit 4650 add(ary1, limit, ary1); 4651 add(ary2, limit, ary2); 4652 neg(limit, limit); 4653 4654 lduw(ary1, limit, result); 4655 bind(Lloop); 4656 lduw(ary2, limit, tmp); 4657 cmp(result, tmp); 4658 br(Assembler::notEqual, true, Assembler::pt, Ldone); 4659 delayed()->mov(G0, result); // not equal 4660 inccc(limit, 2*sizeof(jchar)); 4661 // annul LDUW if branch is not taken to prevent access past end of array 4662 br(Assembler::notZero, true, Assembler::pt, Lloop); 4663 delayed()->lduw(ary1, limit, result); // hoisted 4664 4665 add(G0, 1, result); // equals 4666 bind(Ldone); 4667 } 4668 4669 #endif 4670 4671 // Use BIS for zeroing (count is in bytes). 4672 void MacroAssembler::bis_zeroing(Register to, Register count, Register temp, Label& Ldone) { 4673 assert(UseBlockZeroing && VM_Version::has_block_zeroing(), "only works with BIS zeroing"); 4674 Register end = count; 4675 int cache_line_size = VM_Version::prefetch_data_size(); 4676 // Minimum count when BIS zeroing can be used since 4677 // it needs membar which is expensive. 4678 int block_zero_size = MAX2(cache_line_size*3, (int)BlockZeroingLowLimit); 4679 4680 Label small_loop; 4681 // Check if count is negative (dead code) or zero. 4682 // Note, count uses 64bit in 64 bit VM. 4683 cmp_and_brx_short(count, 0, Assembler::lessEqual, Assembler::pn, Ldone); 4684 4685 // Use BIS zeroing only for big arrays since it requires membar. 
// Test whether any byte in the input string/byte-array has its high bit
// (0x80) set, i.e. whether any byte is "negative" when viewed as a jbyte.
// Emits SPARC code that scans the array in aligned 8-byte chunks against a
// replicated high-bit mask, handling an unaligned head and a partial tail
// by shifting the out-of-range bytes off the register ends.
//
// inp    - address of first byte of the string/array (unchanged)
// size   - number of bytes to examine (sign-extended to 32 bits here)
// result - set to 1 if a high bit was found, 0 otherwise
// t2..t5 - temporaries (t2 doubles as the mask register, result as index i)
//
// NOTE(review): several branches below are annulled delayed branches — the
// instruction in the delay slot executes only when the branch is taken, so
// the delayed() writes to `result` do not clobber the index `i` (which
// aliases `result`) on the fall-through path.
void MacroAssembler::has_negatives(Register inp, Register size, Register result, Register t2, Register t3, Register t4, Register t5) {

  // test for negative bytes in input string of a given size
  // result 1 if found, 0 otherwise.

  Label Lcore, Ltail, Lreturn, Lcore_rpt;

  assert_different_registers(inp, size, t2, t3, t4, t5, result);

  Register i     = result;  // result used as integer index i until very end
  Register lmask = t2;      // t2 is aliased to lmask

  // INITIALIZATION
  // ===========================================================
  // initialize highbits mask -> lmask = 0x8080808080808080  (8B/64b)
  // compute unaligned offset -> i
  // compute core end index   -> t5
  Assembler::sethi(0x80808000, t2);   //! sethi macro fails to emit optimal
  add(t2, 0x80, t2);                  // low 32 bits of mask -> t2
  sllx(t2, 32, t3);                   // replicate into the high 32 bits
  or3(t3, t2, lmask);                 // 0x8080808080808080 -> lmask
  sra(size,0,size);                   // sign-extend size to a clean 32-bit value
  andcc(inp, 0x7, i);                 // unaligned offset -> i
  br(Assembler::zero, true, Assembler::pn, Lcore); // starts 8B aligned?
  delayed()->add(size, -8, t5);       // (annuled) core end index -> t5

  // ===========================================================

  // UNALIGNED HEAD
  // ===========================================================
  // * unaligned head handling: grab aligned 8B containing unaligned inp(ut)
  // * obliterate (ignore) bytes outside string by shifting off reg ends
  // * compare with bitmask, short circuit return true if one or more high
  //   bits set.
  cmp(size, 0);
  br(Assembler::zero, true, Assembler::pn, Lreturn); // short-circuit?
  delayed()->mov(0,result);      // annuled so i not clobbered for following
  neg(i, t4);                    // -(unaligned offset) -> t4
  add(i, size, t5);              // bytes remaining past the aligned base -> t5
  sra(t4, 0, t4);                // sign-extend the negative offset
  ldx(inp, t4, t3);  // raw aligned 8B containing unaligned head -> t3
  mov(8, t4);
  sub(t4, t5, t4);               // 8 - (i + size): >0 iff string ends inside this 8B
  sra(t4, 31, t5);               // all-ones if negative, else 0 (branch-free max)
  andn(t4, t5, t5);              // max(8 - (i + size), 0) -> t5
  add(i, t5, t4);
  sll(t5, 3, t5);
  sll(t4, 3, t4);                // # bits to shift right, left -> t5,t4
  srlx(t3, t5, t3);
  sllx(t3, t4, t3);              // bytes outside string in 8B header obliterated -> t3
  andcc(lmask, t3, G0);          // any high bits set in the surviving bytes?
  brx(Assembler::notZero, true, Assembler::pn, Lreturn); // short circuit?
  delayed()->mov(1,result);      // annuled so i not clobbered for following
  add(size, -8, t5);             // core end index -> t5
  mov(8, t4);
  sub(t4, i, i);                 // # bytes examined in unalgn head (<8) -> i
  // ===========================================================

  // ALIGNED CORE
  // ===========================================================
  // * iterate index i over aligned 8B sections of core, comparing with
  //   bitmask, short circuit return true if one or more high bits set
  // t5 contains core end index/loop limit which is the index
  //   of the MSB of last (unaligned) 8B fully contained in the string.
  // inp contains address of first byte in string/array
  // lmask contains 8B high bit mask for comparison
  // i contains next index to be processed (adr. inp+i is on 8B boundary)
  bind(Lcore);
  cmp_and_br_short(i, t5, Assembler::greater, Assembler::pn, Ltail); // core empty? go handle tail
  bind(Lcore_rpt);
  ldx(inp, i, t3);               // next aligned 8B of the core -> t3
  andcc(t3, lmask, G0);          // any high bits in this chunk?
  brx(Assembler::notZero, true, Assembler::pn, Lreturn);
  delayed()->mov(1, result);     // annuled so i not clobbered for following
  add(i, 8, i);                  // advance to next 8B chunk
  cmp_and_br_short(i, t5, Assembler::lessEqual, Assembler::pn, Lcore_rpt);
  // ===========================================================

  // ALIGNED TAIL (<8B)
  // ===========================================================
  // handle aligned tail of 7B or less as complete 8B, obliterating end of
  // string bytes by shifting them off end, compare what's left with bitmask
  // inp contains address of first byte in string/array
  // lmask contains 8B high bit mask for comparison
  // i contains next index to be processed (adr. inp+i is on 8B boundary)
  bind(Ltail);
  subcc(size, i, t4);            // # of remaining bytes in string -> t4
  // return 0 if no more remaining bytes
  br(Assembler::lessEqual, true, Assembler::pn, Lreturn);
  delayed()->mov(0, result);     // annuled so i not clobbered for following
  ldx(inp, i, t3);       // load final 8B (aligned) containing tail -> t3
  mov(8, t5);
  sub(t5, t4, t4);               // # bytes in the 8B that are past end of string
  mov(0, result);        // ** i clobbered at this point
  sll(t4, 3, t4);        // bits beyond end of string -> t4
  srlx(t3, t4, t3);      // bytes beyond end now obliterated -> t3
  andcc(lmask, t3, G0);          // any high bits among the real tail bytes?
  movcc(Assembler::notZero, false, xcc, 1, result); // conditional move: 1 if found
  bind(Lreturn);
}
4774 void MacroAssembler::bis_zeroing(Register to, Register count, Register temp, Label& Ldone) { 4775 assert(UseBlockZeroing && VM_Version::has_block_zeroing(), "only works with BIS zeroing"); 4776 Register end = count; 4777 int cache_line_size = VM_Version::prefetch_data_size(); 4778 // Minimum count when BIS zeroing can be used since 4779 // it needs membar which is expensive. 4780 int block_zero_size = MAX2(cache_line_size*3, (int)BlockZeroingLowLimit); 4781 4782 Label small_loop; 4783 // Check if count is negative (dead code) or zero. 4784 // Note, count uses 64bit in 64 bit VM. 4785 cmp_and_brx_short(count, 0, Assembler::lessEqual, Assembler::pn, Ldone); 4786 4787 // Use BIS zeroing only for big arrays since it requires membar. 4788 if (Assembler::is_simm13(block_zero_size)) { // < 4096 4789 cmp(count, block_zero_size); 4790 } else { 4791 set(block_zero_size, temp); |