81 };
82
83
84 // Implementation of MacroAssembler
85
86 // First, all the routines that have distinct versions depending on 32/64 bit,
87 // unless the difference is trivial (1 line or so).
88
89 #ifndef _LP64
90
91 // 32bit versions
92
93 Address MacroAssembler::as_Address(AddressLiteral adr) {
94 return Address(adr.target(), adr.rspec());
95 }
96
97 Address MacroAssembler::as_Address(ArrayAddress adr) {
98 return Address::make_array(adr);
99 }
100
101 int MacroAssembler::biased_locking_enter(Register lock_reg,
102 Register obj_reg,
103 Register swap_reg,
104 Register tmp_reg,
105 bool swap_reg_contains_mark,
106 Label& done,
107 Label* slow_case,
108 BiasedLockingCounters* counters) {
109 assert(UseBiasedLocking, "why call this otherwise?");
110   assert(swap_reg == rax, "swap_reg must be rax for cmpxchg");
111 assert_different_registers(lock_reg, obj_reg, swap_reg);
112
113 if (PrintBiasedLockingStatistics && counters == NULL)
114 counters = BiasedLocking::counters();
115
116 bool need_tmp_reg = false;
117 if (tmp_reg == noreg) {
118 need_tmp_reg = true;
119 tmp_reg = lock_reg;
120 } else {
121 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
122 }
123 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
124 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
125 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes());
126 Address saved_mark_addr(lock_reg, 0);
127
128 // Biased locking
129 // See whether the lock is currently biased toward our thread and
130 // whether the epoch is still valid
131 // Note that the runtime guarantees sufficient alignment of JavaThread
132 // pointers to allow age to be placed into low bits
133 // First check to see whether biasing is even enabled for this object
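  // For illustration, the 32-bit mark word layout assumed here (see markOop.hpp):
  //   normal object:  [ hash:25                    | age:4 | biased_lock:1 | lock:2 ]
  //   biased object:  [ JavaThread*:23 | epoch:2   | age:4 | biased_lock:1 | lock:2 ]
  // biased_lock_pattern is the low three bits 101: the unlocked lock value (01)
  // with the biased_lock bit set, which is what the mask-and-compare below tests.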
134 Label cas_label;
135 int null_check_offset = -1;
136 if (!swap_reg_contains_mark) {
137 null_check_offset = offset();
138 movl(swap_reg, mark_addr);
139 }
140 if (need_tmp_reg) {
141 push(tmp_reg);
142 }
143 movl(tmp_reg, swap_reg);
144 andl(tmp_reg, markOopDesc::biased_lock_mask_in_place);
145 cmpl(tmp_reg, markOopDesc::biased_lock_pattern);
146 if (need_tmp_reg) {
147 pop(tmp_reg);
148 }
149 jcc(Assembler::notEqual, cas_label);
150 // The bias pattern is present in the object's header. Need to check
151 // whether the bias owner and the epoch are both still current.
152 // Note that because there is no current thread register on x86 we
153 // need to store off the mark word we read out of the object to
154 // avoid reloading it and needing to recheck invariants below. This
155 // store is unfortunate but it makes the overall code shorter and
156 // simpler.
157 movl(saved_mark_addr, swap_reg);
158 if (need_tmp_reg) {
159 push(tmp_reg);
160 }
161 get_thread(tmp_reg);
162 xorl(swap_reg, tmp_reg);
163 if (swap_reg_contains_mark) {
164 null_check_offset = offset();
165 }
166 movl(tmp_reg, klass_addr);
167 xorl(swap_reg, Address(tmp_reg, Klass::prototype_header_offset()));
168 andl(swap_reg, ~((int) markOopDesc::age_mask_in_place));
169 if (need_tmp_reg) {
170 pop(tmp_reg);
171 }
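  // swap_reg now holds mark ^ thread ^ prototype_header (equivalent to
  // mark ^ (thread | prototype_header), since the bit fields are disjoint)
  // with the age bits cleared: a zero result means the object is biased
  // toward this thread in the current epoch.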
172 if (counters != NULL) {
173 cond_inc32(Assembler::zero,
174 ExternalAddress((address)counters->biased_lock_entry_count_addr()));
175 }
176 jcc(Assembler::equal, done);
177
178 Label try_revoke_bias;
179 Label try_rebias;
180
181 // At this point we know that the header has the bias pattern and
182 // that we are not the bias owner in the current epoch. We need to
183 // figure out more details about the state of the header in order to
184 // know what operations can be legally performed on the object's
185 // header.
186
187 // If the low three bits in the xor result aren't clear, that means
188 // the prototype header is no longer biased and we have to revoke
189 // the bias on this object.
190 testl(swap_reg, markOopDesc::biased_lock_mask_in_place);
191 jcc(Assembler::notZero, try_revoke_bias);
192
193 // Biasing is still enabled for this data type. See whether the
194 // epoch of the current bias is still valid, meaning that the epoch
195 // bits of the mark word are equal to the epoch bits of the
196 // prototype header. (Note that the prototype header's epoch bits
197 // only change at a safepoint.) If not, attempt to rebias the object
198 // toward the current thread. Note that we must be absolutely sure
199 // that the current epoch is invalid in order to do this because
200 // otherwise the manipulations it performs on the mark word are
201 // illegal.
202 testl(swap_reg, markOopDesc::epoch_mask_in_place);
203 jcc(Assembler::notZero, try_rebias);
204
205 // The epoch of the current bias is still valid but we know nothing
206 // about the owner; it might be set or it might be clear. Try to
207 // acquire the bias of the object using an atomic operation. If this
208 // fails we will go in to the runtime to revoke the object's bias.
209 // Note that we first construct the presumed unbiased header so we
210 // don't accidentally blow away another thread's valid bias.
211 movl(swap_reg, saved_mark_addr);
212 andl(swap_reg,
213 markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
214 if (need_tmp_reg) {
215 push(tmp_reg);
216 }
217 get_thread(tmp_reg);
218 orl(tmp_reg, swap_reg);
219 if (os::is_MP()) {
220 lock();
221 }
222 cmpxchgptr(tmp_reg, Address(obj_reg, 0));
223 if (need_tmp_reg) {
224 pop(tmp_reg);
225 }
226 // If the biasing toward our thread failed, this means that
227 // another thread succeeded in biasing it toward itself and we
228 // need to revoke that bias. The revocation will occur in the
229 // interpreter runtime in the slow case.
230 if (counters != NULL) {
231 cond_inc32(Assembler::zero,
232 ExternalAddress((address)counters->anonymously_biased_lock_entry_count_addr()));
233 }
234 if (slow_case != NULL) {
235 jcc(Assembler::notZero, *slow_case);
236 }
237 jmp(done);
238
239 bind(try_rebias);
240 // At this point we know the epoch has expired, meaning that the
241 // current "bias owner", if any, is actually invalid. Under these
242 // circumstances _only_, we are allowed to use the current header's
243 // value as the comparison value when doing the cas to acquire the
244 // bias in the current epoch. In other words, we allow transfer of
245 // the bias from one thread to another directly in this situation.
246 //
247 // FIXME: due to a lack of registers we currently blow away the age
248 // bits in this situation. Should attempt to preserve them.
249 if (need_tmp_reg) {
250 push(tmp_reg);
251 }
252 get_thread(tmp_reg);
253 movl(swap_reg, klass_addr);
254 orl(tmp_reg, Address(swap_reg, Klass::prototype_header_offset()));
255 movl(swap_reg, saved_mark_addr);
256 if (os::is_MP()) {
257 lock();
258 }
259 cmpxchgptr(tmp_reg, Address(obj_reg, 0));
260 if (need_tmp_reg) {
261 pop(tmp_reg);
262 }
263 // If the biasing toward our thread failed, then another thread
264 // succeeded in biasing it toward itself and we need to revoke that
265 // bias. The revocation will occur in the runtime in the slow case.
266 if (counters != NULL) {
267 cond_inc32(Assembler::zero,
268 ExternalAddress((address)counters->rebiased_lock_entry_count_addr()));
269 }
270 if (slow_case != NULL) {
271 jcc(Assembler::notZero, *slow_case);
272 }
273 jmp(done);
274
275 bind(try_revoke_bias);
276 // The prototype mark in the klass doesn't have the bias bit set any
277 // more, indicating that objects of this data type are not supposed
278 // to be biased any more. We are going to try to reset the mark of
279 // this object to the prototype value and fall through to the
280 // CAS-based locking scheme. Note that if our CAS fails, it means
281 // that another thread raced us for the privilege of revoking the
282 // bias of this particular object, so it's okay to continue in the
283 // normal locking code.
284 //
285 // FIXME: due to a lack of registers we currently blow away the age
286 // bits in this situation. Should attempt to preserve them.
287 movl(swap_reg, saved_mark_addr);
288 if (need_tmp_reg) {
289 push(tmp_reg);
290 }
291 movl(tmp_reg, klass_addr);
292 movl(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
293 if (os::is_MP()) {
294 lock();
295 }
296 cmpxchgptr(tmp_reg, Address(obj_reg, 0));
297 if (need_tmp_reg) {
298 pop(tmp_reg);
299 }
300 // Fall through to the normal CAS-based lock, because no matter what
301 // the result of the above CAS, some thread must have succeeded in
302 // removing the bias bit from the object's header.
303 if (counters != NULL) {
304 cond_inc32(Assembler::zero,
305 ExternalAddress((address)counters->revoked_lock_entry_count_addr()));
306 }
307
308 bind(cas_label);
309
310 return null_check_offset;
311 }
312 void MacroAssembler::call_VM_leaf_base(address entry_point,
313 int number_of_arguments) {
314 call(RuntimeAddress(entry_point));
315 increment(rsp, number_of_arguments * wordSize);
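  // x86_32 VM leaf calls are caller-pops: the increment above reclaims the
  // argument words that were pushed before calling in.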
316 }
317
318 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
319 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
320 }
321
322 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
323 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
324 }
325
326 void MacroAssembler::cmpoop(Address src1, jobject obj) {
327 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
328 }
329
330 void MacroAssembler::cmpoop(Register src1, jobject obj) {
331 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
709
710 Address MacroAssembler::as_Address(AddressLiteral adr) {
711 // amd64 always does this as a pc-rel
712 // we can be absolute or disp based on the instruction type
713 // jmp/call are displacements, others are absolute
714 assert(!adr.is_lval(), "must be rval");
715 assert(reachable(adr), "must be");
716 return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
717
718 }
719
720 Address MacroAssembler::as_Address(ArrayAddress adr) {
721 AddressLiteral base = adr.base();
722 lea(rscratch1, base);
723 Address index = adr.index();
724 assert(index._disp == 0, "must not have disp"); // maybe it can?
725 Address array(rscratch1, index._index, index._scale, index._disp);
726 return array;
727 }
728
729 int MacroAssembler::biased_locking_enter(Register lock_reg,
730 Register obj_reg,
731 Register swap_reg,
732 Register tmp_reg,
733 bool swap_reg_contains_mark,
734 Label& done,
735 Label* slow_case,
736 BiasedLockingCounters* counters) {
737 assert(UseBiasedLocking, "why call this otherwise?");
738 assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
739 assert(tmp_reg != noreg, "tmp_reg must be supplied");
740 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
741 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
742 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
743 Address saved_mark_addr(lock_reg, 0);
744
745 if (PrintBiasedLockingStatistics && counters == NULL)
746 counters = BiasedLocking::counters();
747
748 // Biased locking
749 // See whether the lock is currently biased toward our thread and
750 // whether the epoch is still valid
751 // Note that the runtime guarantees sufficient alignment of JavaThread
752 // pointers to allow age to be placed into low bits
753 // First check to see whether biasing is even enabled for this object
754 Label cas_label;
755 int null_check_offset = -1;
756 if (!swap_reg_contains_mark) {
757 null_check_offset = offset();
758 movq(swap_reg, mark_addr);
759 }
760 movq(tmp_reg, swap_reg);
761 andq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
762 cmpq(tmp_reg, markOopDesc::biased_lock_pattern);
763 jcc(Assembler::notEqual, cas_label);
764 // The bias pattern is present in the object's header. Need to check
765 // whether the bias owner and the epoch are both still current.
766 load_prototype_header(tmp_reg, obj_reg);
767 orq(tmp_reg, r15_thread);
768 xorq(tmp_reg, swap_reg);
769 andq(tmp_reg, ~((int) markOopDesc::age_mask_in_place));
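  // tmp_reg is now mark ^ (prototype_header | r15_thread) with the age bits
  // cleared; zero means the object is already biased toward this thread in
  // the current epoch.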
770 if (counters != NULL) {
771 cond_inc32(Assembler::zero,
772                ExternalAddress((address) counters->biased_lock_entry_count_addr()));
773 }
774 jcc(Assembler::equal, done);
775
776 Label try_revoke_bias;
777 Label try_rebias;
778
779 // At this point we know that the header has the bias pattern and
780 // that we are not the bias owner in the current epoch. We need to
781 // figure out more details about the state of the header in order to
782 // know what operations can be legally performed on the object's
783 // header.
784
785 // If the low three bits in the xor result aren't clear, that means
786 // the prototype header is no longer biased and we have to revoke
787 // the bias on this object.
788 testq(tmp_reg, markOopDesc::biased_lock_mask_in_place);
789 jcc(Assembler::notZero, try_revoke_bias);
790
791 // Biasing is still enabled for this data type. See whether the
792 // epoch of the current bias is still valid, meaning that the epoch
793 // bits of the mark word are equal to the epoch bits of the
794 // prototype header. (Note that the prototype header's epoch bits
795 // only change at a safepoint.) If not, attempt to rebias the object
796 // toward the current thread. Note that we must be absolutely sure
797 // that the current epoch is invalid in order to do this because
798 // otherwise the manipulations it performs on the mark word are
799 // illegal.
800 testq(tmp_reg, markOopDesc::epoch_mask_in_place);
801 jcc(Assembler::notZero, try_rebias);
802
803 // The epoch of the current bias is still valid but we know nothing
804 // about the owner; it might be set or it might be clear. Try to
805 // acquire the bias of the object using an atomic operation. If this
806 // fails we will go in to the runtime to revoke the object's bias.
807 // Note that we first construct the presumed unbiased header so we
808 // don't accidentally blow away another thread's valid bias.
809 andq(swap_reg,
810 markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
811 movq(tmp_reg, swap_reg);
812 orq(tmp_reg, r15_thread);
813 if (os::is_MP()) {
814 lock();
815 }
816 cmpxchgq(tmp_reg, Address(obj_reg, 0));
817 // If the biasing toward our thread failed, this means that
818 // another thread succeeded in biasing it toward itself and we
819 // need to revoke that bias. The revocation will occur in the
820 // interpreter runtime in the slow case.
821 if (counters != NULL) {
822 cond_inc32(Assembler::zero,
823 ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
824 }
825 if (slow_case != NULL) {
826 jcc(Assembler::notZero, *slow_case);
827 }
828 jmp(done);
829
830 bind(try_rebias);
831 // At this point we know the epoch has expired, meaning that the
832 // current "bias owner", if any, is actually invalid. Under these
833 // circumstances _only_, we are allowed to use the current header's
834 // value as the comparison value when doing the cas to acquire the
835 // bias in the current epoch. In other words, we allow transfer of
836 // the bias from one thread to another directly in this situation.
837 //
838 // FIXME: due to a lack of registers we currently blow away the age
839 // bits in this situation. Should attempt to preserve them.
840 load_prototype_header(tmp_reg, obj_reg);
841 orq(tmp_reg, r15_thread);
842 if (os::is_MP()) {
843 lock();
844 }
845 cmpxchgq(tmp_reg, Address(obj_reg, 0));
846 // If the biasing toward our thread failed, then another thread
847 // succeeded in biasing it toward itself and we need to revoke that
848 // bias. The revocation will occur in the runtime in the slow case.
849 if (counters != NULL) {
850 cond_inc32(Assembler::zero,
851 ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
852 }
853 if (slow_case != NULL) {
854 jcc(Assembler::notZero, *slow_case);
855 }
856 jmp(done);
857
858 bind(try_revoke_bias);
859 // The prototype mark in the klass doesn't have the bias bit set any
860 // more, indicating that objects of this data type are not supposed
861 // to be biased any more. We are going to try to reset the mark of
862 // this object to the prototype value and fall through to the
863 // CAS-based locking scheme. Note that if our CAS fails, it means
864 // that another thread raced us for the privilege of revoking the
865 // bias of this particular object, so it's okay to continue in the
866 // normal locking code.
867 //
868 // FIXME: due to a lack of registers we currently blow away the age
869 // bits in this situation. Should attempt to preserve them.
870 load_prototype_header(tmp_reg, obj_reg);
871 if (os::is_MP()) {
872 lock();
873 }
874 cmpxchgq(tmp_reg, Address(obj_reg, 0));
875 // Fall through to the normal CAS-based lock, because no matter what
876 // the result of the above CAS, some thread must have succeeded in
877 // removing the bias bit from the object's header.
878 if (counters != NULL) {
879 cond_inc32(Assembler::zero,
880 ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
881 }
882
883 bind(cas_label);
884
885 return null_check_offset;
886 }
887
888 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
889 Label L, E;
890
891 #ifdef _WIN64
892   // Windows always allocates space for its register args
893 assert(num_args <= 4, "only register arguments supported");
894 subq(rsp, frame::arg_reg_save_area_bytes);
895 #endif
896
897 // Align stack if necessary
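  // The x86-64 ABI requires rsp to be 16-byte aligned at the call instruction.
  // rsp is always 8-byte aligned here, so testing the low four bits tells us
  // whether we are aligned or off by exactly one word.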
898 testl(rsp, 15);
899 jcc(Assembler::zero, L);
900
901 subq(rsp, 8);
902 {
903 call(RuntimeAddress(entry_point));
904 }
905 addq(rsp, 8);
906 jmp(E);
1343 }
1344 }
1345
1346 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
1347 // Used in sign-masking with aligned address.
1348 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
1349 if (reachable(src)) {
1350 Assembler::andps(dst, as_Address(src));
1351 } else {
1352 lea(rscratch1, src);
1353 Assembler::andps(dst, Address(rscratch1, 0));
1354 }
1355 }
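// Legacy (non-VEX) SSE instructions fault on unaligned 128-bit memory operands;
// the VEX encodings used when UseAVX > 0 lift that restriction, hence the
// alignment assert above is conditional on UseAVX.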
1356
1357 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1358 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1359 }
1360
1361 void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
1362 pushf();
1363 if (os::is_MP())
1364 lock();
1365 incrementl(counter_addr);
1366 popf();
1367 }
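// Note: pushf/popf above preserve the caller's condition codes, since the
// locked increment sets the arithmetic flags as a side effect.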
1368
1369 // Writes to stack successive pages until offset reached to check for
1370 // stack overflow + shadow pages. This clobbers tmp.
1371 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1372 movptr(tmp, rsp);
1373 // Bang stack for total size given plus shadow page size.
1374 // Bang one page at a time because large size can bang beyond yellow and
1375 // red zones.
1376 Label loop;
1377 bind(loop);
1378 movl(Address(tmp, (-os::vm_page_size())), size );
1379 subptr(tmp, os::vm_page_size());
1380 subl(size, os::vm_page_size());
1381 jcc(Assembler::greater, loop);
1382
1383 // Bang down shadow pages too.
1384 // At this point, (tmp-0) is the last address touched, so don't
1385 // touch it again. (It was touched as (tmp-pagesize) but then tmp
1386 // was post-decremented.) Skip this address by starting at i=1, and
1387 // touch a few more pages below. N.B. It is important to touch all
1388 // the way down to and including i=StackShadowPages.
1389 for (int i = 1; i <= StackShadowPages; i++) {
1390     // this could be any sized move but this can be a debugging crumb
1391 // so the bigger the better.
1392 movptr(Address(tmp, (-i*os::vm_page_size())), size );
1393 }
1394 }
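// For illustration, assuming 4K pages and size = 12K: the first loop touches
// the three pages below rsp, and the second touches the next StackShadowPages
// pages, so any overflow faults here rather than in the callee's frame.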
1395
1396 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1397 assert(UseBiasedLocking, "why call this otherwise?");
1398
1399 // Check for biased locking unlock case, which is a no-op
1400 // Note: we do not have to check the thread ID for two reasons.
1401 // First, the interpreter checks for IllegalMonitorStateException at
1402 // a higher level. Second, if the bias was revoked while we held the
1403 // lock, the object could not be rebiased toward another thread, so
1404 // the bias bit would be clear.
1405 movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1406 andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1407 cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1408 jcc(Assembler::equal, done);
1409 }
1410
1411 void MacroAssembler::c2bool(Register x) {
1412 // implements x == 0 ? 0 : 1
1413 // note: must only look at least-significant byte of x
1414 // since C-style booleans are stored in one byte
1415 // only! (was bug)
1416 andl(x, 0xFF);
1417 setb(Assembler::notZero, x);
1418 }
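// Example: rax = 0x1234ff00 -> low byte 0x00 -> c2bool stores 0 (false);
//          rax = 0x00000001 -> low byte 0x01 -> c2bool stores 1 (true).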
1419
1420 // Wouldn't need if AddressLiteral version had new name
1421 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1422 Assembler::call(L, rtype);
1423 }
1424
1425 void MacroAssembler::call(Register entry) {
1426 Assembler::call(entry);
1427 }
1428
1429 void MacroAssembler::call(AddressLiteral entry) {
1430 if (reachable(entry)) {
81 };
82
83
84 // Implementation of MacroAssembler
85
86 // First, all the routines that have distinct versions depending on 32/64 bit,
87 // unless the difference is trivial (1 line or so).
88
89 #ifndef _LP64
90
91 // 32bit versions
92
93 Address MacroAssembler::as_Address(AddressLiteral adr) {
94 return Address(adr.target(), adr.rspec());
95 }
96
97 Address MacroAssembler::as_Address(ArrayAddress adr) {
98 return Address::make_array(adr);
99 }
100
101 void MacroAssembler::call_VM_leaf_base(address entry_point,
102 int number_of_arguments) {
103 call(RuntimeAddress(entry_point));
104 increment(rsp, number_of_arguments * wordSize);
105 }
106
107 void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
108 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
109 }
110
111 void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
112 cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
113 }
114
115 void MacroAssembler::cmpoop(Address src1, jobject obj) {
116 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
117 }
118
119 void MacroAssembler::cmpoop(Register src1, jobject obj) {
120 cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
498
499 Address MacroAssembler::as_Address(AddressLiteral adr) {
500 // amd64 always does this as a pc-rel
501 // we can be absolute or disp based on the instruction type
502 // jmp/call are displacements, others are absolute
503 assert(!adr.is_lval(), "must be rval");
504 assert(reachable(adr), "must be");
505 return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
506
507 }
508
509 Address MacroAssembler::as_Address(ArrayAddress adr) {
510 AddressLiteral base = adr.base();
511 lea(rscratch1, base);
512 Address index = adr.index();
513 assert(index._disp == 0, "must not have disp"); // maybe it can?
514 Address array(rscratch1, index._index, index._scale, index._disp);
515 return array;
516 }
517
518
519
520
521
522
523
524
525
526 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
527 Label L, E;
528
529 #ifdef _WIN64
530   // Windows always allocates space for its register args
531 assert(num_args <= 4, "only register arguments supported");
532 subq(rsp, frame::arg_reg_save_area_bytes);
533 #endif
534
535 // Align stack if necessary
536 testl(rsp, 15);
537 jcc(Assembler::zero, L);
538
539 subq(rsp, 8);
540 {
541 call(RuntimeAddress(entry_point));
542 }
543 addq(rsp, 8);
544 jmp(E);
981 }
982 }
983
984 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
985 // Used in sign-masking with aligned address.
986 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
987 if (reachable(src)) {
988 Assembler::andps(dst, as_Address(src));
989 } else {
990 lea(rscratch1, src);
991 Assembler::andps(dst, Address(rscratch1, 0));
992 }
993 }
994
995 void MacroAssembler::andptr(Register dst, int32_t imm32) {
996 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
997 }
998
999 void MacroAssembler::atomic_incl(AddressLiteral counter_addr) {
1000 pushf();
1001 if (reachable(counter_addr)) {
1002 if (os::is_MP())
1003 lock();
1004 incrementl(as_Address(counter_addr));
1005 } else {
1006 lea(rscratch1, counter_addr);
1007 if (os::is_MP())
1008 lock();
1009 incrementl(Address(rscratch1, 0));
1010 }
1011 popf();
1012 }
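// reachable() asks whether the literal fits in a rip-relative disp32 from the
// code being emitted; if not, the address is materialized in rscratch1 first
// and the locked increment goes through that register.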
1013
1014 // Writes to stack successive pages until offset reached to check for
1015 // stack overflow + shadow pages. This clobbers tmp.
1016 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
1017 movptr(tmp, rsp);
1018 // Bang stack for total size given plus shadow page size.
1019 // Bang one page at a time because large size can bang beyond yellow and
1020 // red zones.
1021 Label loop;
1022 bind(loop);
1023 movl(Address(tmp, (-os::vm_page_size())), size );
1024 subptr(tmp, os::vm_page_size());
1025 subl(size, os::vm_page_size());
1026 jcc(Assembler::greater, loop);
1027
1028 // Bang down shadow pages too.
1029 // At this point, (tmp-0) is the last address touched, so don't
1030 // touch it again. (It was touched as (tmp-pagesize) but then tmp
1031 // was post-decremented.) Skip this address by starting at i=1, and
1032 // touch a few more pages below. N.B. It is important to touch all
1033 // the way down to and including i=StackShadowPages.
1034 for (int i = 1; i <= StackShadowPages; i++) {
1035     // this could be any sized move but this can be a debugging crumb
1036 // so the bigger the better.
1037 movptr(Address(tmp, (-i*os::vm_page_size())), size );
1038 }
1039 }
1040
1041 int MacroAssembler::biased_locking_enter(Register lock_reg,
1042 Register obj_reg,
1043 Register swap_reg,
1044 Register tmp_reg,
1045 bool swap_reg_contains_mark,
1046 Label& done,
1047 Label* slow_case,
1048 BiasedLockingCounters* counters) {
1049 assert(UseBiasedLocking, "why call this otherwise?");
1050 assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
1051 LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
1052 bool need_tmp_reg = false;
1053 if (tmp_reg == noreg) {
1054 need_tmp_reg = true;
1055 tmp_reg = lock_reg;
1056 assert_different_registers(lock_reg, obj_reg, swap_reg);
1057 } else {
1058 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
1059 }
1060 assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
1061 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
1062 Address saved_mark_addr(lock_reg, 0);
1063
1064 if (PrintBiasedLockingStatistics && counters == NULL) {
1065 counters = BiasedLocking::counters();
1066 }
1067 // Biased locking
1068 // See whether the lock is currently biased toward our thread and
1069 // whether the epoch is still valid
1070 // Note that the runtime guarantees sufficient alignment of JavaThread
1071 // pointers to allow age to be placed into low bits
1072 // First check to see whether biasing is even enabled for this object
1073 Label cas_label;
1074 int null_check_offset = -1;
1075 if (!swap_reg_contains_mark) {
1076 null_check_offset = offset();
1077 movptr(swap_reg, mark_addr);
1078 }
1079 if (need_tmp_reg) {
1080 push(tmp_reg);
1081 }
1082 movptr(tmp_reg, swap_reg);
1083 andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
1084 cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
1085 if (need_tmp_reg) {
1086 pop(tmp_reg);
1087 }
1088 jcc(Assembler::notEqual, cas_label);
1089 // The bias pattern is present in the object's header. Need to check
1090 // whether the bias owner and the epoch are both still current.
1091 #ifndef _LP64
1092 // Note that because there is no current thread register on x86_32 we
1093 // need to store off the mark word we read out of the object to
1094 // avoid reloading it and needing to recheck invariants below. This
1095 // store is unfortunate but it makes the overall code shorter and
1096 // simpler.
1097 movptr(saved_mark_addr, swap_reg);
1098 #endif
1099 if (need_tmp_reg) {
1100 push(tmp_reg);
1101 }
1102 if (swap_reg_contains_mark) {
1103 null_check_offset = offset();
1104 }
1105 load_prototype_header(tmp_reg, obj_reg);
1106 #ifdef _LP64
1107 orptr(tmp_reg, r15_thread);
1108 xorptr(tmp_reg, swap_reg);
1109 Register header_reg = tmp_reg;
1110 #else
1111 xorptr(tmp_reg, swap_reg);
1112 get_thread(swap_reg);
1113 xorptr(swap_reg, tmp_reg);
1114 Register header_reg = swap_reg;
1115 #endif
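  // The #ifdef asymmetry exists because x86_64 keeps the current JavaThread in
  // r15 while x86_32 must call get_thread(); either way header_reg ends up as
  // mark ^ (prototype_header | thread), whose age bits are masked off next.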
1116 andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
1117 if (need_tmp_reg) {
1118 pop(tmp_reg);
1119 }
1120 if (counters != NULL) {
1121 cond_inc32(Assembler::zero,
1122 ExternalAddress((address) counters->biased_lock_entry_count_addr()));
1123 }
1124 jcc(Assembler::equal, done);
1125
1126 Label try_revoke_bias;
1127 Label try_rebias;
1128
1129 // At this point we know that the header has the bias pattern and
1130 // that we are not the bias owner in the current epoch. We need to
1131 // figure out more details about the state of the header in order to
1132 // know what operations can be legally performed on the object's
1133 // header.
1134
1135 // If the low three bits in the xor result aren't clear, that means
1136 // the prototype header is no longer biased and we have to revoke
1137 // the bias on this object.
1138 testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
1139 jccb(Assembler::notZero, try_revoke_bias);
1140
1141 // Biasing is still enabled for this data type. See whether the
1142 // epoch of the current bias is still valid, meaning that the epoch
1143 // bits of the mark word are equal to the epoch bits of the
1144 // prototype header. (Note that the prototype header's epoch bits
1145 // only change at a safepoint.) If not, attempt to rebias the object
1146 // toward the current thread. Note that we must be absolutely sure
1147 // that the current epoch is invalid in order to do this because
1148 // otherwise the manipulations it performs on the mark word are
1149 // illegal.
1150 testptr(header_reg, markOopDesc::epoch_mask_in_place);
1151 jccb(Assembler::notZero, try_rebias);
1152
1153 // The epoch of the current bias is still valid but we know nothing
1154 // about the owner; it might be set or it might be clear. Try to
1155 // acquire the bias of the object using an atomic operation. If this
1156 // fails we will go in to the runtime to revoke the object's bias.
1157 // Note that we first construct the presumed unbiased header so we
1158 // don't accidentally blow away another thread's valid bias.
1159 NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1160 andptr(swap_reg,
1161 markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
1162 if (need_tmp_reg) {
1163 push(tmp_reg);
1164 }
1165 #ifdef _LP64
1166 movptr(tmp_reg, swap_reg);
1167 orptr(tmp_reg, r15_thread);
1168 #else
1169 get_thread(tmp_reg);
1170 orptr(tmp_reg, swap_reg);
1171 #endif
1172 if (os::is_MP()) {
1173 lock();
1174 }
1175 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1176 if (need_tmp_reg) {
1177 pop(tmp_reg);
1178 }
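  // cmpxchg compared rax (swap_reg, the presumed unbiased header) with the
  // mark word and installed tmp_reg only on a match, setting ZF on success;
  // ZF now selects between the counter bump, the slow case and done.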
1179 // If the biasing toward our thread failed, this means that
1180 // another thread succeeded in biasing it toward itself and we
1181 // need to revoke that bias. The revocation will occur in the
1182 // interpreter runtime in the slow case.
1183 if (counters != NULL) {
1184 cond_inc32(Assembler::zero,
1185 ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
1186 }
1187 if (slow_case != NULL) {
1188 jcc(Assembler::notZero, *slow_case);
1189 }
1190 jmp(done);
1191
1192 bind(try_rebias);
1193 // At this point we know the epoch has expired, meaning that the
1194 // current "bias owner", if any, is actually invalid. Under these
1195 // circumstances _only_, we are allowed to use the current header's
1196 // value as the comparison value when doing the cas to acquire the
1197 // bias in the current epoch. In other words, we allow transfer of
1198 // the bias from one thread to another directly in this situation.
1199 //
1200 // FIXME: due to a lack of registers we currently blow away the age
1201 // bits in this situation. Should attempt to preserve them.
1202 if (need_tmp_reg) {
1203 push(tmp_reg);
1204 }
1205 load_prototype_header(tmp_reg, obj_reg);
1206 #ifdef _LP64
1207 orptr(tmp_reg, r15_thread);
1208 #else
1209 get_thread(swap_reg);
1210 orptr(tmp_reg, swap_reg);
1211 movptr(swap_reg, saved_mark_addr);
1212 #endif
1213 if (os::is_MP()) {
1214 lock();
1215 }
1216 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1217 if (need_tmp_reg) {
1218 pop(tmp_reg);
1219 }
1220 // If the biasing toward our thread failed, then another thread
1221 // succeeded in biasing it toward itself and we need to revoke that
1222 // bias. The revocation will occur in the runtime in the slow case.
1223 if (counters != NULL) {
1224 cond_inc32(Assembler::zero,
1225 ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
1226 }
1227 if (slow_case != NULL) {
1228 jcc(Assembler::notZero, *slow_case);
1229 }
1230 jmp(done);
1231
1232 bind(try_revoke_bias);
1233 // The prototype mark in the klass doesn't have the bias bit set any
1234 // more, indicating that objects of this data type are not supposed
1235 // to be biased any more. We are going to try to reset the mark of
1236 // this object to the prototype value and fall through to the
1237 // CAS-based locking scheme. Note that if our CAS fails, it means
1238 // that another thread raced us for the privilege of revoking the
1239 // bias of this particular object, so it's okay to continue in the
1240 // normal locking code.
1241 //
1242 // FIXME: due to a lack of registers we currently blow away the age
1243 // bits in this situation. Should attempt to preserve them.
1244 NOT_LP64( movptr(swap_reg, saved_mark_addr); )
1245 if (need_tmp_reg) {
1246 push(tmp_reg);
1247 }
1248 load_prototype_header(tmp_reg, obj_reg);
1249 if (os::is_MP()) {
1250 lock();
1251 }
1252 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
1253 if (need_tmp_reg) {
1254 pop(tmp_reg);
1255 }
1256 // Fall through to the normal CAS-based lock, because no matter what
1257 // the result of the above CAS, some thread must have succeeded in
1258 // removing the bias bit from the object's header.
1259 if (counters != NULL) {
1260 cond_inc32(Assembler::zero,
1261 ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
1262 }
1263
1264 bind(cas_label);
1265
1266 return null_check_offset;
1267 }
1268
1269 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
1270 assert(UseBiasedLocking, "why call this otherwise?");
1271
1272 // Check for biased locking unlock case, which is a no-op
1273 // Note: we do not have to check the thread ID for two reasons.
1274 // First, the interpreter checks for IllegalMonitorStateException at
1275 // a higher level. Second, if the bias was revoked while we held the
1276 // lock, the object could not be rebiased toward another thread, so
1277 // the bias bit would be clear.
1278 movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
1279 andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
1280 cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
1281 jcc(Assembler::equal, done);
1282 }
1283
1284 #ifdef COMPILER2
1285 // Fast_Lock and Fast_Unlock used by C2
1286
1287 // Because the transitions from emitted code to the runtime
1288 // monitorenter/exit helper stubs are so slow it's critical that
1289 // we inline both the stack-locking fast-path and the inflated fast path.
1290 //
1291 // See also: cmpFastLock and cmpFastUnlock.
1292 //
1293 // What follows is a specialized inline transliteration of the code
1294 // in slow_enter() and slow_exit(). If we're concerned about I$ bloat
1295 // another option would be to emit TrySlowEnter and TrySlowExit methods
1296 // at startup-time. These methods would accept arguments as
1297 // (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1298 // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
1299 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1300 // In practice, however, the # of lock sites is bounded and is usually small.
1301 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1302 // if the processor uses simple bimodal branch predictors keyed by EIP
1303 // since the helper routines would be called from multiple synchronization
1304 // sites.
1305 //
1306 // An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
1307 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1308 // to those specialized methods. That'd give us a mostly platform-independent
1309 // implementation that the JITs could optimize and inline at their pleasure.
1310 // Done correctly, the only time we'd need to cross to native code would be
1311 // to park() or unpark() threads. We'd also need a few more unsafe operators
1312 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1313 // (b) explicit barriers or fence operations.
1314 //
1315 // TODO:
1316 //
1317 // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1318 // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1319 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
1320 // the lock operators would typically be faster than reifying Self.
1321 //
1322 // * Ideally I'd define the primitives as:
1323 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1324 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1325 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
1326 //   Instead, we're stuck with rather awkward and brittle register assignments below.
1327 // Furthermore the register assignments are overconstrained, possibly resulting in
1328 // sub-optimal code near the synchronization site.
1329 //
1330 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
1331 // Alternately, use a better sp-proximity test.
1332 //
1333 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1334 // Either one is sufficient to uniquely identify a thread.
1335 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1336 //
1337 // * Intrinsify notify() and notifyAll() for the common cases where the
1338 //   object is locked by the calling thread but the waitlist is empty,
1339 //   avoiding the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
1340 //
1341 // * use jccb and jmpb instead of jcc and jmp to improve code density.
1342 // But beware of excessive branch density on AMD Opterons.
1343 //
1344 // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1345 // or failure of the fast-path. If the fast-path fails then we pass
1346 // control to the slow-path, typically in C. In Fast_Lock and
1347 // Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1348 // will emit a conditional branch immediately after the node.
1349 // So we have branches to branches and lots of ICC.ZF games.
1350 // Instead, it might be better to have C2 pass a "FailureLabel"
1351 // into Fast_Lock and Fast_Unlock. In the case of success, control
1352 // will drop through the node. ICC.ZF is undefined at exit.
1353 // In the case of failure, the node will branch directly to the
1354 // FailureLabel
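// For reference, the mark-word low-bit encodings triaged below (see markOop.hpp):
//   [ptr         | 00]  stack-locked; ptr points to the displaced header on the stack
//   [header  | 0 | 01]  unlocked (neutral)
//   [ptr         | 10]  inflated; ptr (sans tag) is the ObjectMonitor
//   [ptr         | 11]  marked, used by the GC
//   low bits 101        biased toward the JavaThread stored in the upper bits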
1355
1356
1357 // obj: object to lock
1358 // box: on-stack box address (displaced header location) - KILLED
1359 // rax: tmp -- KILLED
1360 // scr: tmp -- KILLED
1361 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, Register scrReg, BiasedLockingCounters* counters) {
1362   // Ensure the register assignments are disjoint
1363 guarantee (objReg != boxReg, "");
1364 guarantee (objReg != tmpReg, "");
1365 guarantee (objReg != scrReg, "");
1366 guarantee (boxReg != tmpReg, "");
1367 guarantee (boxReg != scrReg, "");
1368 guarantee (tmpReg == rax, "");
1369
1370 if (counters != NULL) {
1371 atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()));
1372 }
1373 if (EmitSync & 1) {
1374 // set box->dhw = unused_mark (3)
1375 // Force all sync thru slow-path: slow_enter() and slow_exit()
1376 movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1377 cmpptr (rsp, (int32_t)NULL_WORD);
1378 } else
1379 if (EmitSync & 2) {
1380 Label DONE_LABEL ;
1381 if (UseBiasedLocking) {
1382 // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
1383 biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1384 }
1385
1386 movptr(tmpReg, Address(objReg, 0)); // fetch markword
1387 orptr (tmpReg, 0x1);
1388 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
1389 if (os::is_MP()) {
1390 lock();
1391 }
1392 cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
1393 jccb(Assembler::equal, DONE_LABEL);
1394 // Recursive locking
1395 subptr(tmpReg, rsp);
1396 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1397 movptr(Address(boxReg, 0), tmpReg);
1398 bind(DONE_LABEL);
1399 } else {
1400 // Possible cases that we'll encounter in fast_lock
1401 // ------------------------------------------------
1402 // * Inflated
1403 // -- unlocked
1404 // -- Locked
1405 // = by self
1406 // = by other
1407 // * biased
1408 // -- by Self
1409 // -- by other
1410 // * neutral
1411 // * stack-locked
1412 // -- by self
1413 // = sp-proximity test hits
1414 // = sp-proximity test generates false-negative
1415 // -- by other
1416 //
1417
1418 Label IsInflated, DONE_LABEL;
1419
1420 // it's stack-locked, biased or neutral
1421 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1422 // order to reduce the number of conditional branches in the most common cases.
1423 // Beware -- there's a subtle invariant that fetch of the markword
1424 // at [FETCH], below, will never observe a biased encoding (*101b).
1425 // If this invariant is not held we risk exclusion (safety) failure.
1426 if (UseBiasedLocking && !UseOptoBiasInlining) {
1427 biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
1428 }
1429
1430 movptr(tmpReg, Address(objReg, 0)); // [FETCH]
1431 testl (tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1432 jccb (Assembler::notZero, IsInflated);
1433
1434 // Attempt stack-locking ...
1435 orptr (tmpReg, 0x1);
1436 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
1437 if (os::is_MP()) {
1438 lock();
1439 }
1440 cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
1441 if (counters != NULL) {
1442 cond_inc32(Assembler::equal,
1443 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1444 }
1445 jccb(Assembler::equal, DONE_LABEL);
1446
1447 // Recursive locking
1448 subptr(tmpReg, rsp);
1449 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1450 movptr(Address(boxReg, 0), tmpReg);
1451 if (counters != NULL) {
1452 cond_inc32(Assembler::equal,
1453 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1454 }
1455 jmpb(DONE_LABEL);
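      // The recursion test above works because a stack-lock stores the mark in
      // our own frame: if this thread already owns the lock, tmpReg (the mark
      // observed by the failed CAS) is an address near rsp, so the subtract-
      // and-mask yields zero and a zero displaced header in the box marks the
      // recursive enter.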
1456
1457 bind(IsInflated);
1458 #ifndef _LP64
1459 // The object is inflated.
1460 //
1461 // TODO-FIXME: eliminate the ugly use of manifest constants:
1462 // Use markOopDesc::monitor_value instead of "2".
1463 // use markOop::unused_mark() instead of "3".
1464 // The tmpReg value is an objectMonitor reference ORed with
1465 // markOopDesc::monitor_value (2). We can either convert tmpReg to an
1466 // objectmonitor pointer by masking off the "2" bit or we can just
1467 // use tmpReg as an objectmonitor pointer but bias the objectmonitor
1468 // field offsets with "-2" to compensate for and annul the low-order tag bit.
1469 //
1470 // I use the latter as it avoids AGI stalls.
1471 // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
1472 // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
1473 //
1474 #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
1475
1476 // boxReg refers to the on-stack BasicLock in the current frame.
1477 // We'd like to write:
1478 // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
1479     // This is convenient but results in an ST-before-CAS penalty.  The following CAS suffers
1480 // additional latency as we have another ST in the store buffer that must drain.
1481
1482 if (EmitSync & 8192) {
1483 movptr(Address(boxReg, 0), 3); // results in ST-before-CAS penalty
1484 get_thread (scrReg);
1485 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
1486 movptr(tmpReg, NULL_WORD); // consider: xor vs mov
1487 if (os::is_MP()) {
1488 lock();
1489 }
1490 cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1491 } else
1492 if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
1493 movptr(scrReg, boxReg);
1494 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
1495
1496 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1497 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1498 // prefetchw [eax + Offset(_owner)-2]
1499 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1500 }
1501
1502 if ((EmitSync & 64) == 0) {
1503 // Optimistic form: consider XORL tmpReg,tmpReg
1504 movptr(tmpReg, NULL_WORD);
1505 } else {
1506 // Can suffer RTS->RTO upgrades on shared or cold $ lines
1507 // Test-And-CAS instead of CAS
1508          movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax = m->_owner
1509 testptr(tmpReg, tmpReg); // Locked ?
1510 jccb (Assembler::notZero, DONE_LABEL);
1511 }
1512
1513 // Appears unlocked - try to swing _owner from null to non-null.
1514 // Ideally, I'd manifest "Self" with get_thread and then attempt
1515 // to CAS the register containing Self into m->Owner.
1516 // But we don't have enough registers, so instead we can either try to CAS
1517 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
1518 // we later store "Self" into m->Owner. Transiently storing a stack address
1519 // (rsp or the address of the box) into m->owner is harmless.
1520 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
1521 if (os::is_MP()) {
1522 lock();
1523 }
1524 cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1525 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
1526 jccb (Assembler::notZero, DONE_LABEL);
1527 get_thread (scrReg); // beware: clobbers ICCs
1528 movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg);
1529 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success
1530
1531 // If the CAS fails we can either retry or pass control to the slow-path.
1532 // We use the latter tactic.
1533 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1534 // If the CAS was successful ...
1535 // Self has acquired the lock
1536 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1537 // Intentional fall-through into DONE_LABEL ...
1538 } else {
1539 movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark())); // results in ST-before-CAS penalty
1540 movptr(boxReg, tmpReg);
1541
1542 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1543 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1544 // prefetchw [eax + Offset(_owner)-2]
1545 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1546 }
1547
1548 if ((EmitSync & 64) == 0) {
1549 // Optimistic form
1550 xorptr (tmpReg, tmpReg);
1551 } else {
1552 // Can suffer RTS->RTO upgrades on shared or cold $ lines
1553          movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));   // rax = m->_owner
1554 testptr(tmpReg, tmpReg); // Locked ?
1555 jccb (Assembler::notZero, DONE_LABEL);
1556 }
1557
1558 // Appears unlocked - try to swing _owner from null to non-null.
1559 // Use either "Self" (in scr) or rsp as thread identity in _owner.
1560 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
1561 get_thread (scrReg);
1562 if (os::is_MP()) {
1563 lock();
1564 }
1565 cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1566
1567 // If the CAS fails we can either retry or pass control to the slow-path.
1568 // We use the latter tactic.
1569 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1570 // If the CAS was successful ...
1571 // Self has acquired the lock
1572 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1573 // Intentional fall-through into DONE_LABEL ...
1574 }
1575 #else // _LP64
1576 // It's inflated
1577
1578 // TODO: someday avoid the ST-before-CAS penalty by
1579 // relocating (deferring) the following ST.
1580 // We should also think about trying a CAS without having
1581 // fetched _owner. If the CAS is successful we may
1582 // avoid an RTO->RTS upgrade on the $line.
1583
1584 // Without cast to int32_t a movptr will destroy r10 which is typically obj
1585 movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1586
1587 mov (boxReg, tmpReg);
1588 movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1589 testptr(tmpReg, tmpReg);
1590 jccb (Assembler::notZero, DONE_LABEL);
1591
1592 // It's inflated and appears unlocked
1593 if (os::is_MP()) {
1594 lock();
1595 }
1596 cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1597 // Intentional fall-through into DONE_LABEL ...
1598
1599 #endif
1600
1601 // DONE_LABEL is a hot target - we'd really like to place it at the
1602 // start of cache line by padding with NOPs.
1603 // See the AMD and Intel software optimization manuals for the
1604 // most efficient "long" NOP encodings.
1605 // Unfortunately none of our alignment mechanisms suffice.
1606 bind(DONE_LABEL);
1607
1608 // At DONE_LABEL the icc ZFlag is set as follows ...
1609 // Fast_Unlock uses the same protocol.
1610 // ZFlag == 1 -> Success
1611 // ZFlag == 0 -> Failure - force control through the slow-path
1612 }
1613 }
1614
1615 // obj: object to unlock
1616 // box: box address (displaced header location), killed. Must be EAX.
1617 // tmp: killed, cannot be obj nor box.
1618 //
1619 // Some commentary on balanced locking:
1620 //
1621 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1622 // Methods that don't have provably balanced locking are forced to run in the
1623 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1624 // The interpreter provides two properties:
1625 // I1: At return-time the interpreter automatically and quietly unlocks any
1626 //      objects acquired in the current activation (frame).  Recall that the
1627 // interpreter maintains an on-stack list of locks currently held by
1628 // a frame.
1629 //  I2:  If a method attempts to unlock an object that is not held by
1630 //      the frame, the interpreter throws IMSX.
1631 //
1632 // Let's say A(), which has provably balanced locking, acquires O and then calls B().
1633 // B() doesn't have provably balanced locking so it runs in the interpreter.
1634 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
1635 // is still locked by A().
1636 //
1637 // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
1638 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
1639 // should not be unlocked by "normal" java-level locking and vice-versa.  However, the
1640 // specification doesn't say what will occur if a program engages in such mixed-mode locking.
1641
1642 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg) {
1643 guarantee (objReg != boxReg, "");
1644 guarantee (objReg != tmpReg, "");
1645 guarantee (boxReg != tmpReg, "");
1646 guarantee (boxReg == rax, "");
1647
1648 if (EmitSync & 4) {
1649 // Disable - inhibit all inlining. Force control through the slow-path
1650 cmpptr (rsp, 0);
1651 } else
1652 if (EmitSync & 8) {
1653 Label DONE_LABEL;
1654 if (UseBiasedLocking) {
1655 biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1656 }
1657 // Classic stack-locking code ...
1658 // Check whether the displaced header is 0
1659 //(=> recursive unlock)
1660 movptr(tmpReg, Address(boxReg, 0));
1661 testptr(tmpReg, tmpReg);
1662 jccb(Assembler::zero, DONE_LABEL);
1663 // If not recursive lock, reset the header to displaced header
1664 if (os::is_MP()) {
1665 lock();
1666 }
1667 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
1668 bind(DONE_LABEL);
1669 } else {
1670 Label DONE_LABEL, Stacked, CheckSucc;
1671
1672 // Critically, the biased locking test must have precedence over
1673 // and appear before the (box->dhw == 0) recursive stack-lock test.
1674 if (UseBiasedLocking && !UseOptoBiasInlining) {
1675 biased_locking_exit(objReg, tmpReg, DONE_LABEL);
1676 }
1677
1678 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
1679 movptr(tmpReg, Address(objReg, 0)); // Examine the object's markword
1680 jccb (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock
1681
1682 testptr(tmpReg, 0x02); // Inflated?
1683 jccb (Assembler::zero, Stacked);
1684
1685 // It's inflated.
1686 // Despite our balanced locking property we still check that m->_owner == Self
1687 // as java routines or native JNI code called by this thread might
1688 // have released the lock.
1689 // Refer to the comments in synchronizer.cpp for how we might encode extra
1690 // state in _succ so we can avoid fetching EntryList|cxq.
1691 //
1692 // I'd like to add more cases in fast_lock() and fast_unlock() --
1693 // such as recursive enter and exit -- but we have to be wary of
1694 // I$ bloat, T$ effects and BP$ effects.
1695 //
1696 // If there's no contention try a 1-0 exit. That is, exit without
1697 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
1698 // we detect and recover from the race that the 1-0 exit admits.
1699 //
1700 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
1701 // before it STs null into _owner, releasing the lock. Updates
1702 // to data protected by the critical section must be visible before
1703 // we drop the lock (and thus before any other thread could acquire
1704 // the lock and observe the fields protected by the lock).
1705     // IA32's memory-model is TSO, so STs are ordered with respect to
1706 // each other and there's no need for an explicit barrier (fence).
1707 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
1708 #ifndef _LP64
1709 get_thread (boxReg);
1710 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1711 // prefetchw [ebx + Offset(_owner)-2]
1712 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1713 }
1714
1715 // Note that we could employ various encoding schemes to reduce
1716 // the number of loads below (currently 4) to just 2 or 3.
1717 // Refer to the comments in synchronizer.cpp.
1718 // In practice the chain of fetches doesn't seem to impact performance, however.
1719 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
1720 // Attempt to reduce branch density - AMD's branch predictor.
1721 xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1722 orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
1723 orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
1724 orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
1725 jccb (Assembler::notZero, DONE_LABEL);
1726 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
1727 jmpb (DONE_LABEL);
1728 } else {
1729 xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1730 orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
1731 jccb (Assembler::notZero, DONE_LABEL);
1732 movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
1733 orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
1734 jccb (Assembler::notZero, CheckSucc);
1735 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
1736 jmpb (DONE_LABEL);
1737 }
1738
1739     // The following code fragment (EmitSync & 65536) improves the performance of
1740 // contended applications and contended synchronization microbenchmarks.
1741 // Unfortunately the emission of the code - even though not executed - causes regressions
1742 // in scimark and jetstream, evidently because of $ effects. Replacing the code
1743 // with an equal number of never-executed NOPs results in the same regression.
1744 // We leave it off by default.
1745
1746 if ((EmitSync & 65536) != 0) {
1747 Label LSuccess, LGoSlowPath ;
1748
1749 bind (CheckSucc);
1750
1751 // Optional pre-test ... it's safe to elide this
1752 if ((EmitSync & 16) == 0) {
1753 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
1754 jccb (Assembler::zero, LGoSlowPath);
1755 }
1756
1757 // We have a classic Dekker-style idiom:
1758 // ST m->_owner = 0 ; MEMBAR; LD m->_succ
1759 // There are a number of ways to implement the barrier:
1760 // (1) lock:andl &m->_owner, 0
1761 //     is fast, but masm doesn't currently support the "ANDL M,IMM32" form.
1762 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0
1763 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
1764 // (2) If supported, an explicit MFENCE is appealing.
1765 // In older IA32 processors MFENCE is slower than lock:add or xchg
1766 //     particularly if the write-buffer is full, as might be the case
1767 //     if stores closely precede the fence or fence-equivalent instruction.
1768 // In more modern implementations MFENCE appears faster, however.
1769 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
1770 // The $lines underlying the top-of-stack should be in M-state.
1771 // The locked add instruction is serializing, of course.
1772 // (4) Use xchg, which is serializing
1773 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
1774 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
1775 // The integer condition codes will tell us if succ was 0.
1776 // Since _succ and _owner should reside in the same $line and
1777 // we just stored into _owner, it's likely that the $line
1778 // remains in M-state for the lock:orl.
1779 //
1780 // We currently use (3), although switching to (2) is likely the
1781 // right choice in the future.
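// As a sketch (not emitted here), option (5) would be roughly:
//   mov  [tmpReg + Offset(_owner)-2], 0
//   lock: orl [tmpReg + Offset(_succ)-2], 0   ; serves as the fence and
//                                             ; sets ZF=1 iff _succ == 0
// leaving the condition codes to steer the LSuccess/LGoSlowPath split.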
1782
1783 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
1784 if (os::is_MP()) {
1785 if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
1786 mfence();
1787 } else {
1788 lock (); addptr(Address(rsp, 0), 0);
1789 }
1790 }
1791 // Ratify _succ remains non-null
1792 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0);
1793 jccb (Assembler::notZero, LSuccess);
1794
1795 xorptr(boxReg, boxReg); // box is really EAX
1796 if (os::is_MP()) { lock(); }
1797 cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1798 jccb (Assembler::notEqual, LSuccess);
1799 // Since we're low on registers, we installed rsp as a placeholder in _owner.
1800 // Now install Self over rsp. This is safe as we're transitioning from
1801 // non-null to non-null.
1802 get_thread (boxReg);
1803 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
1804 // Intentional fall-through into LGoSlowPath ...
1805
1806 bind (LGoSlowPath);
1807 orptr(boxReg, 1); // set ICC.ZF=0 to indicate failure
1808 jmpb (DONE_LABEL);
1809
1810 bind (LSuccess);
1811 xorptr(boxReg, boxReg); // set ICC.ZF=1 to indicate success
1812 jmpb (DONE_LABEL);
1813 }
1814
1815 bind (Stacked);
1816 // It's not inflated, not recursively stack-locked, and not biased:
1817 // it must be stack-locked.
1818 // Try to reset the header to displaced header.
1819 // The "box" value on the stack is stable, so we can reload
1820 // and be assured we observe the same value as above.
1821 movptr(tmpReg, Address(boxReg, 0));
1822 if (os::is_MP()) {
1823 lock();
1824 }
1825 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
1826 // Intentional fall-through into DONE_LABEL
1827
1828 // DONE_LABEL is a hot target - we'd really like to place it at the
1829 // start of a cache line by padding with NOPs.
1830 // See the AMD and Intel software optimization manuals for the
1831 // most efficient "long" NOP encodings.
1832 // Unfortunately none of our alignment mechanisms suffice.
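// For reference, the recommended long NOPs build on the 0F 1F /0 form,
// e.g. 66 0F 1F 44 00 00 (6 bytes) and 0F 1F 84 00 00 00 00 00 (8 bytes).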
1833 if ((EmitSync & 65536) == 0) {
1834 bind (CheckSucc);
1835 }
1836 #else // _LP64
1837 // It's inflated
1838 movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1839 xorptr(boxReg, r15_thread);
1840 orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
1841 jccb (Assembler::notZero, DONE_LABEL);
1842 movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
1843 orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
1844 jccb (Assembler::notZero, CheckSucc);
1845 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
1846 jmpb (DONE_LABEL);
1847
1848 if ((EmitSync & 65536) == 0) {
1849 Label LSuccess, LGoSlowPath ;
1850 bind (CheckSucc);
1851 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
1852 jccb (Assembler::zero, LGoSlowPath);
1853
1854 // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
1855 // explicit ST;MEMBAR combination, but masm doesn't currently support
1856 // "ANDQ M,IMM". Don't use MFENCE here: lock:add to TOS, xchg, etc.
1857 // are all faster when the write buffer is populated.
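// Were the encoding available, the release and the fence would fuse into
// one instruction, roughly (sketch only; masm can't emit this form):
//   LOCK: ANDQ [tmpReg + Offset(_owner)-2], 0
// Instead we emit the plain ST below followed by a lock:add to TOS.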
1858 movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
1859 if (os::is_MP()) {
1860 lock (); addl (Address(rsp, 0), 0);
1861 }
1862 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
1863 jccb (Assembler::notZero, LSuccess);
1864
1865 movptr (boxReg, (int32_t)NULL_WORD); // box is really EAX
1866 if (os::is_MP()) { lock(); }
1867 cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1868 jccb (Assembler::notEqual, LSuccess);
1869 // Intentional fall-through into slow-path
1870
1871 bind (LGoSlowPath);
1872 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
1873 jmpb (DONE_LABEL);
1874
1875 bind (LSuccess);
1876 testl (boxReg, 0); // set ICC.ZF=1 to indicate success
1877 jmpb (DONE_LABEL);
1878 }
1879
1880 bind (Stacked);
1881 movptr(tmpReg, Address (boxReg, 0)); // re-fetch
1882 if (os::is_MP()) { lock(); }
1883 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
1884
1885 if (EmitSync & 65536) {
1886 bind (CheckSucc);
1887 }
1888 #endif
1889 bind(DONE_LABEL);
1890 // Avoid branch-to-branch on AMD processors
1891 if (EmitSync & 32768) {
1892 nop();
1893 }
1894 }
1895 }
1896 #endif // COMPILER2
1897
1898 void MacroAssembler::c2bool(Register x) {
1899 // implements x == 0 ? 0 : 1
1900 // Note: we must look only at the least-significant byte of x,
1901 // since C-style booleans are stored in a single byte.
1902 // (Examining the full word was once a bug.)
1903 andl(x, 0xFF);
1904 setb(Assembler::notZero, x);
1905 }
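// For example, x == 0x100 yields 0 (only the low byte, 0x00, is examined),
// while x == 0x101 yields 1.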
1906
1907 // These call() forwarders wouldn't be needed if the AddressLiteral version had a distinct name.
1908 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
1909 Assembler::call(L, rtype);
1910 }
1911
1912 void MacroAssembler::call(Register entry) {
1913 Assembler::call(entry);
1914 }
1915
1916 void MacroAssembler::call(AddressLiteral entry) {
1917 if (reachable(entry)) {
|