Print this page
rev 6890 : 8062950: Bug in locking code when UseOptoBiasInlining is disabled: assert(dmw->is_neutral()) failed: invariant
Reviewed-by: dholmes, kvn
Split |
Split |
Close |
Expand all |
Collapse all |
--- old/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
+++ new/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
1 1 /*
2 2 * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
3 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 4 *
5 5 * This code is free software; you can redistribute it and/or modify it
6 6 * under the terms of the GNU General Public License version 2 only, as
7 7 * published by the Free Software Foundation.
8 8 *
9 9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 12 * version 2 for more details (a copy is included in the LICENSE file that
13 13 * accompanied this code).
14 14 *
15 15 * You should have received a copy of the GNU General Public License version
16 16 * 2 along with this work; if not, write to the Free Software Foundation,
17 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 18 *
19 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 20 * or visit www.oracle.com if you need additional information or have any
21 21 * questions.
22 22 *
23 23 */
24 24
25 25 #include "precompiled.hpp"
26 26 #include "asm/assembler.hpp"
27 27 #include "asm/assembler.inline.hpp"
28 28 #include "compiler/disassembler.hpp"
29 29 #include "gc_interface/collectedHeap.inline.hpp"
30 30 #include "interpreter/interpreter.hpp"
31 31 #include "memory/cardTableModRefBS.hpp"
32 32 #include "memory/resourceArea.hpp"
33 33 #include "memory/universe.hpp"
34 34 #include "prims/methodHandles.hpp"
35 35 #include "runtime/biasedLocking.hpp"
36 36 #include "runtime/interfaceSupport.hpp"
37 37 #include "runtime/objectMonitor.hpp"
38 38 #include "runtime/os.hpp"
39 39 #include "runtime/sharedRuntime.hpp"
40 40 #include "runtime/stubRoutines.hpp"
41 41 #include "utilities/macros.hpp"
42 42 #if INCLUDE_ALL_GCS
43 43 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
44 44 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
45 45 #include "gc_implementation/g1/heapRegion.hpp"
46 46 #endif // INCLUDE_ALL_GCS
47 47
48 48 #ifdef PRODUCT
49 49 #define BLOCK_COMMENT(str) /* nothing */
50 50 #define STOP(error) stop(error)
51 51 #else
52 52 #define BLOCK_COMMENT(str) block_comment(str)
53 53 #define STOP(error) block_comment(error); stop(error)
54 54 #endif
55 55
56 56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
57 57
58 58 PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
59 59
60 60 #ifdef ASSERT
// Debug-build hook: on x86 instruction marks are maintained, so always check them.
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
62 62 #endif
63 63
// Maps each Assembler::Condition (used as the array index — the raw 4-bit
// x86 condition-code encoding) to its logical negation.
static Assembler::Condition reverse[] = {
    Assembler::noOverflow     /* overflow      = 0x0 */ ,
    Assembler::overflow       /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual     /* carrySet      = 0x2, below         = 0x2 */ ,
    Assembler::below          /* aboveEqual    = 0x3, carryClear    = 0x3 */ ,
    Assembler::notZero        /* zero          = 0x4, equal         = 0x4 */ ,
    Assembler::zero           /* notZero       = 0x5, notEqual      = 0x5 */ ,
    Assembler::above          /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual     /* above         = 0x7 */ ,
    Assembler::positive       /* negative      = 0x8 */ ,
    Assembler::negative       /* positive      = 0x9 */ ,
    Assembler::noParity       /* parity        = 0xa */ ,
    Assembler::parity         /* noParity      = 0xb */ ,
    Assembler::greaterEqual   /* less          = 0xc */ ,
    Assembler::less           /* greaterEqual  = 0xd */ ,
    Assembler::greater        /* lessEqual     = 0xe */ ,
    Assembler::lessEqual      /* greater       = 0xf, */

};
83 83
84 84
85 85 // Implementation of MacroAssembler
86 86
87 87 // First all the versions that have distinct versions depending on 32/64 bit
88 88 // Unless the difference is trivial (1 line or so).
89 89
90 90 #ifndef _LP64
91 91
92 92 // 32bit versions
93 93
// Convert an AddressLiteral into a plain Address.  On 32-bit the literal's
// absolute target can be encoded directly as a displacement.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

// Convert an ArrayAddress (base literal + scaled index) into an Address.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

// Call a C runtime leaf routine.  Arguments were pushed on the stack by the
// caller; pop them after the call (cdecl: caller cleans up the stack).
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}

// Compare a memory operand against a Metadata* immediate; a metadata
// relocation records the pointer embedded in the instruction stream.
void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Compare a register against a Metadata* immediate.
void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Compare a memory operand against an oop immediate; the oop relocation
// lets the GC find and update the embedded pointer.
void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Compare a register against an oop immediate.
void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}
123 123
// Sign-extend lo into hi, producing the 64-bit value hi:lo.
void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();           // one-instruction eax -> edx:eax sign extension
  } else {
    movl(hi, lo);
    sarl(hi, 31);     // replicate the sign bit through hi
  }
}

// Jump to L if FPU condition flag C2 is set.  fnstsw_ax copies the FPU
// status word into ax and sahf transfers ah into EFLAGS, which places C2
// in the parity flag; rax is preserved via the caller-supplied temporary.
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

// Jump to L if FPU condition flag C2 is clear (same technique as jC2).
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}
153 153
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class, so go through as_Address.
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}
159 159
// Note: y_lo will be destroyed
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  // Leaves -1, 0 or +1 in x_hi for x < y, x == y, x > y respectively.
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);      // high words decide: x < y
  jcc(Assembler::greater, high);  // high words decide: x > y
  // x_hi is the return register
  xorl(x_hi, x_hi);               // tentative result: equal
  cmpl(x_lo, y_lo);               // low words compared as unsigned
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);                // result = +1
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);               // result = -1

  bind(done);
}
185 185
// Load the literal's address (not its contents) into dst, with relocation.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

// Store the literal's address into memory.
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}

// Tear down the current frame: restore the caller's rsp and rbp.
void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}
200 200
void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  // NOTE: clobbers rax, rbx, rcx and rdx.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset  |
  //          [ y_lo ] /  (in bytes)    | x_rsp_offset
  //          [ y_hi ]                  | (in bytes)
  //            ....                    |
  //          [ x_lo ]                 /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}
240 240
// Two's-complement negation of the 64-bit value hi:lo.
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);      // lo := -lo; CF set iff lo was non-zero
  adcl(hi, 0);   // fold the borrow into hi
  negl(hi);      // hi := -(hi + borrow)
}

void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                          // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                             // if (s < n)
  jcc(Assembler::less, L);                // else (s >= n)
  movl(hi, lo);                           // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                // s (mod n) < n
  shldl(hi, lo);                          // x := x << s
  shll(lo);
}


void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  // sign_extension selects arithmetic (lshr of a signed long) vs logical shift.
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                          // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                             // if (s < n)
  jcc(Assembler::less, L);                // else (s >= n)
  movl(lo, hi);                           // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                // s (mod n) < n
  shrdl(lo, hi);                          // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
288 288
// Load an oop immediate into a register; the oop relocation lets the GC
// find and update the pointer embedded in the instruction stream.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Store an oop immediate to memory (same relocation handling).
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Load a Metadata* immediate (metadata relocation).
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Load either the literal's address (lval) or the word it designates.
void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  // scratch register is not used,
  // it is defined to match parameters of 64-bit version of this method.
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}
327 327
328 328
// Pop the registers saved by push_callee_saved_registers(); the pop order
// must mirror the push order exactly.
void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

// Reload the FPU top-of-stack value saved by push_fTOS() and free its slot.
void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);   // a double occupies two 32-bit words
}

void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}

// Spill the FPU top-of-stack value (a double) to the stack, popping the FPU stack.
void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}
352 352
353 353
// Push an oop immediate, recording an oop relocation for the GC.
void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Push a Metadata* immediate with a metadata relocation.
void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Push either the literal's address (lval) or the word it points to.
void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}
369 369
370 370 void MacroAssembler::set_word_if_not_zero(Register dst) {
371 371 xorl(dst, dst);
372 372 set_byte_if_not_zero(dst);
373 373 }
374 374
// On 32-bit, C arguments are passed on the stack: each pass_argN simply
// pushes its argument (callers invoke these in reverse argument order).
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}
390 390
391 391 #ifndef PRODUCT
392 392 extern "C" void findpc(intptr_t x);
393 393 #endif
394 394
395 395 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
396 396 // In order to get locks to work, we need to fake a in_VM state
397 397 JavaThread* thread = JavaThread::current();
398 398 JavaThreadState saved_state = thread->thread_state();
399 399 thread->set_thread_state(_thread_in_vm);
400 400 if (ShowMessageBoxOnError) {
401 401 JavaThread* thread = JavaThread::current();
402 402 JavaThreadState saved_state = thread->thread_state();
403 403 thread->set_thread_state(_thread_in_vm);
404 404 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
405 405 ttyLocker ttyl;
406 406 BytecodeCounter::print();
407 407 }
408 408 // To see where a verify_oop failed, get $ebx+40/X for this frame.
409 409 // This is the value of eip which points to where verify_oop will return.
410 410 if (os::message_box(msg, "Execution stopped, print registers?")) {
411 411 print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
412 412 BREAKPOINT;
413 413 }
414 414 } else {
415 415 ttyLocker ttyl;
416 416 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
417 417 }
418 418 // Don't assert holding the ttyLock
419 419 assert(false, err_msg("DEBUG MESSAGE: %s", msg));
420 420 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
421 421 }
422 422
// Runtime target of print_state()/debug32(): prints eip, the eight GP
// registers, a window of stack words and the code around eip.
void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);   // suppress GC/safepoint asserts while dumping
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  // First eight words: one per line, decoded via print_location.
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  // Then sixteen rows of eight raw words each.
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}
463 463
// Emit code that halts the VM with `msg`: pushes the message address, the
// current eip and all registers (matching debug32's parameter list), then
// calls debug32 and hlt's.
void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}

// Emit code that calls warning(msg) and resumes; the full CPU state is
// saved and restored around the call.
void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address) msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}

// Emit code that dumps the current register state via print_state32 and resumes.
void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);       // discard the saved eip
}
497 497
498 498 #else // _LP64
499 499
500 500 // 64 bit versions
501 501
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());

}

// Materialize the array base in rscratch1 and fold the (displacement-free)
// index into an Address over it.  NOTE: clobbers rscratch1.
Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}
520 520
// Call a C runtime leaf with num_args arguments already in their registers.
// Keeps rsp 16-byte aligned at the call site; on Windows also reserves the
// register-argument home area required by the x64 calling convention.
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp,  frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);                 // misaligned: push 8 filler bytes
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);                      // already 16-byte aligned
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif

}
554 554
555 555 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
556 556 assert(!src2.is_lval(), "should use cmpptr");
557 557
558 558 if (reachable(src2)) {
559 559 cmpq(src1, as_Address(src2));
560 560 } else {
561 561 lea(rscratch1, src2);
562 562 Assembler::cmpq(src1, Address(rscratch1, 0));
563 563 }
564 564 }
565 565
int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271. The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor   (may not be eax/edx)   -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  //
  // min_long / -1 is filtered out because idivq would overflow and raise a
  // hardware divide exception; Java defines the result as min_long rem 0.
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();                        // sign-extend rax into rdx:rax
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}
602 602
603 603 void MacroAssembler::decrementq(Register reg, int value) {
604 604 if (value == min_jint) { subq(reg, value); return; }
605 605 if (value < 0) { incrementq(reg, -value); return; }
606 606 if (value == 0) { ; return; }
607 607 if (value == 1 && UseIncDec) { decq(reg) ; return; }
608 608 /* else */ { subq(reg, value) ; return; }
609 609 }
610 610
611 611 void MacroAssembler::decrementq(Address dst, int value) {
612 612 if (value == min_jint) { subq(dst, value); return; }
613 613 if (value < 0) { incrementq(dst, -value); return; }
614 614 if (value == 0) { ; return; }
615 615 if (value == 1 && UseIncDec) { decq(dst) ; return; }
616 616 /* else */ { subq(dst, value) ; return; }
617 617 }
618 618
619 619 void MacroAssembler::incrementq(AddressLiteral dst) {
620 620 if (reachable(dst)) {
621 621 incrementq(as_Address(dst));
622 622 } else {
623 623 lea(rscratch1, dst);
624 624 incrementq(Address(rscratch1, 0));
625 625 }
626 626 }
627 627
628 628 void MacroAssembler::incrementq(Register reg, int value) {
629 629 if (value == min_jint) { addq(reg, value); return; }
630 630 if (value < 0) { decrementq(reg, -value); return; }
631 631 if (value == 0) { ; return; }
632 632 if (value == 1 && UseIncDec) { incq(reg) ; return; }
633 633 /* else */ { addq(reg, value) ; return; }
634 634 }
635 635
636 636 void MacroAssembler::incrementq(Address dst, int value) {
637 637 if (value == min_jint) { addq(dst, value); return; }
638 638 if (value < 0) { decrementq(dst, -value); return; }
639 639 if (value == 0) { ; return; }
640 640 if (value == 1 && UseIncDec) { incq(dst) ; return; }
641 641 /* else */ { addq(dst, value) ; return; }
642 642 }
643 643
// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;   // graft the materialized base onto the index
  jmp(dispatch);
}

// Two-register long compare is a 32-bit-only concept; must not be called here.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}
658 658
// Materialize the literal's full 64-bit address (with relocation) in dst.
void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

// Store the literal's address into memory, staged through rscratch1
// (x86 has no 64-bit immediate-to-memory move).
void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}

// Tear down the current frame with the one-byte LEAVE instruction.
void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}

// Two-register long negate is a 32-bit-only concept; must not be called here.
void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}
677 677
// Load an oop immediate into a register; the oop relocation lets the GC
// find and update the pointer embedded in the instruction stream.
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

// Memory destination: stage the 64-bit immediate in rscratch1 first
// (x86 has no 64-bit immediate-to-memory move).
void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

// Load a Metadata* immediate (metadata relocation).
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

// Load either the literal's address (lval) or the word it designates;
// `scratch` is only clobbered when the target is out of RIP-relative range.
void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(scratch, src);
      movq(dst, Address(scratch, 0));
    }
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);     // store the sign-extended 32-bit immediate
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}
731 731
// Push an oop immediate, staged through rscratch1 (no 64-bit push-immediate).
void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

// Push a Metadata* immediate, staged through rscratch1.
void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

// Push the literal's address (lval) or the word it points to.  The address
// is materialized in rscratch1 up front in both cases.
void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}
750 750
// Clear the JavaThread's last-Java-frame anchor: sp is always cleared,
// fp and pc only on request.
void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}

// Record the last Java frame (sp/fp/pc) in the thread's frame anchor so the
// VM can walk the stack.  NOTE(review): sp is stored last — presumably so
// the anchor only appears populated once fp/pc are in place; confirm against
// JavaFrameAnchor's walkability protocol.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
790 790
// On 64-bit, C arguments travel in registers: move into c_rargN unless the
// value is already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}
814 814
// Emit code that halts the VM with `msg`: saves all registers with pusha,
// loads the C arguments (msg, faulting rip, pointer to the saved regs),
// aligns the stack per the ABI and calls debug64, then hlt's.
void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}

// Emit code that prints a warning and continues: sets up a temporary frame,
// saves the full CPU state and calls warning(msg).
void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

// Emit code that dumps the register state via print_state64 and resumes.
void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}
855 855
856 856 #ifndef PRODUCT
857 857 extern "C" void findpc(intptr_t x);
858 858 #endif
859 859
// Runtime target of stop(): reports msg with the register state captured at
// pc; regs[] points at the GP registers saved by MacroAssembler::pusha
// (see print_state64 for the index mapping).
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
      assert(false, "start up GDB");
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}
888 888
// Runtime target of print_state()/debug64(): prints rip, all 16 GP
// registers, a window of stack words and the code around rip.  The regs[]
// indices reflect the save order used by MacroAssembler::pusha (regs[15]
// is rax, regs[0] is r15).
// NOTE(review): the 0x%016lx formats assume an LP64 `long`; warnings are
// muted by PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC at the top of this file.
void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);   // suppress GC/safepoint asserts while dumping
  tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  PRINT_REG(rsp, regs[11]);
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near top of stack.
  int64_t* rsp = (int64_t*) regs[11];
  int64_t* dump_sp = rsp;
  // First eight words: one per line, decoded via print_location.
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  // Then 25 rows of four raw words each.
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}
936 936
937 937 #endif // _LP64
938 938
939 939 // Now versions that are common to 32/64 bit
940 940
941 941 void MacroAssembler::addptr(Register dst, int32_t imm32) {
942 942 LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
943 943 }
944 944
945 945 void MacroAssembler::addptr(Register dst, Register src) {
946 946 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
947 947 }
948 948
949 949 void MacroAssembler::addptr(Address dst, Register src) {
950 950 LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
951 951 }
952 952
953 953 void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
954 954 if (reachable(src)) {
955 955 Assembler::addsd(dst, as_Address(src));
956 956 } else {
957 957 lea(rscratch1, src);
958 958 Assembler::addsd(dst, Address(rscratch1, 0));
959 959 }
960 960 }
961 961
962 962 void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
963 963 if (reachable(src)) {
964 964 addss(dst, as_Address(src));
965 965 } else {
966 966 lea(rscratch1, src);
967 967 addss(dst, Address(rscratch1, 0));
968 968 }
969 969 }
970 970
971 971 void MacroAssembler::align(int modulus) {
972 972 if (offset() % modulus != 0) {
973 973 nop(modulus - (offset() % modulus));
974 974 }
975 975 }
976 976
977 977 void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
978 978 // Used in sign-masking with aligned address.
979 979 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
980 980 if (reachable(src)) {
981 981 Assembler::andpd(dst, as_Address(src));
982 982 } else {
983 983 lea(rscratch1, src);
984 984 Assembler::andpd(dst, Address(rscratch1, 0));
985 985 }
986 986 }
987 987
988 988 void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
989 989 // Used in sign-masking with aligned address.
990 990 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
991 991 if (reachable(src)) {
992 992 Assembler::andps(dst, as_Address(src));
993 993 } else {
994 994 lea(rscratch1, src);
995 995 Assembler::andps(dst, Address(rscratch1, 0));
996 996 }
997 997 }
998 998
999 999 void MacroAssembler::andptr(Register dst, int32_t imm32) {
1000 1000 LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
1001 1001 }
1002 1002
// Emit an atomic 32-bit increment of the counter at counter_addr.
// A lock prefix is emitted directly in front of the increment, and only
// on MP systems where it is needed.
void MacroAssembler::atomic_incl(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementl(counter_addr);
}
1008 1008
1009 1009 void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
1010 1010 if (reachable(counter_addr)) {
1011 1011 atomic_incl(as_Address(counter_addr));
1012 1012 } else {
1013 1013 lea(scr, counter_addr);
1014 1014 atomic_incl(Address(scr, 0));
1015 1015 }
1016 1016 }
1017 1017
1018 1018 #ifdef _LP64
// Emit an atomic 64-bit increment of the counter at counter_addr.
// A lock prefix is emitted directly in front of the increment, and only
// on MP systems where it is needed.
void MacroAssembler::atomic_incq(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementq(counter_addr);
}
1024 1024
1025 1025 void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
1026 1026 if (reachable(counter_addr)) {
1027 1027 atomic_incq(as_Address(counter_addr));
1028 1028 } else {
1029 1029 lea(scr, counter_addr);
1030 1030 atomic_incq(Address(scr, 0));
1031 1031 }
1032 1032 }
1033 1033 #endif
1034 1034
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. This clobbers tmp.
// On return, 'size' has been counted down page by page; any pending
// stack overflow manifests as a fault on one of the bang stores.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 1; i < StackShadowPages; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}
1061 1061
// Emit the biased-locking fast path for monitor enter.
//
// Falls through to the CAS-based locking code (cas_label) when biasing
// does not apply; jumps to 'done' on successful bias acquisition and,
// when slow_case is non-NULL, to *slow_case when runtime revocation is
// required.  Returns the code offset of the mark-word load so the caller
// can register an implicit null check.
//
// swap_reg must be rax (the cmpxchg comparand).  On x86_32, tmp_reg may
// be noreg, in which case lock_reg is borrowed as the temp and preserved
// with push/pop around each use.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  LP64_ONLY( assert(tmp_reg != noreg, "tmp_reg must be supplied"); )
  bool need_tmp_reg = false;
  if (tmp_reg == noreg) {
    // No temp supplied (x86_32 only): reuse lock_reg, preserving its
    // value with push/pop around every use below.
    need_tmp_reg = true;
    tmp_reg = lock_reg;
    assert_different_registers(lock_reg, obj_reg, swap_reg);
  } else {
    assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  }
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movptr(swap_reg, mark_addr);
  }
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  // Check the low lock bits for the biased-lock pattern.
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movptr(saved_mark_addr, swap_reg);
#endif
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  // XOR of (prototype | thread) against the mark: zero iff bias owner
  // and epoch both match (ignoring age bits, masked below).
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testptr(header_reg, markOopDesc::biased_lock_mask_in_place);
  jccb(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testptr(header_reg, markOopDesc::epoch_mask_in_place);
  jccb(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  andptr(swap_reg,
         markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
  if (need_tmp_reg) {
    push(tmp_reg);
  }
#ifdef _LP64
  movptr(tmp_reg, swap_reg);
  orptr(tmp_reg, r15_thread);
#else
  get_thread(tmp_reg);
  orptr(tmp_reg, swap_reg);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
#else
  get_thread(swap_reg);
  orptr(tmp_reg, swap_reg);
  movptr(swap_reg, saved_mark_addr);
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  if (need_tmp_reg) {
    push(tmp_reg);
  }
  load_prototype_header(tmp_reg, obj_reg);
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  if (need_tmp_reg) {
    pop(tmp_reg);
  }
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}
1289 1289
// Emit the biased-locking fast path for monitor exit: if the mark word
// still carries the biased-lock pattern, unlocking is a no-op and we
// jump straight to 'done'.  temp_reg is clobbered.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(temp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::equal, done);
}
1304 1304
1305 1305 #ifdef COMPILER2
1306 1306
1307 1307 #if INCLUDE_RTM_OPT
1308 1308
// Update rtm_counters based on abort status
// input:  abort_status
//         rtm_counters (RTMLockingCounters*)
// flags are killed
// Always bumps the total abort count; with PrintPreciseRTMLockingStatistics
// it additionally bumps one per-reason counter for every abort-status bit
// that is set (bit i -> abortX counter i).
void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      // Skip the per-reason increment when bit i is not set.
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}
1326 1326
// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
// The low bits of the time-stamp counter serve as the "random" value,
// so on average 1 out of count executions falls through.
void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}
1336 1336
// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
// May also set the UseRTM ("always rtm") bit in the MDO once enough
// transactions have been observed with an acceptable abort ratio.
void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                 Register rtm_counters_Reg,
                                                 RTMLockingCounters* rtm_counters,
                                                 Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    // Skip everything until the runtime sets the calculation flag.
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count >= RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  // rtm_counters_Reg is free to be reused as scratch once the counter
  // base address is no longer needed on this path.
  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    if (os::is_MP()) {
      lock();
    }
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address (it was clobbered as scrReg above).
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    if (os::is_MP()) {
      lock();
    }
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}
1394 1394
// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
// abort_status_Reg is preserved across the ratio calculation (via
// push/pop) when retries are enabled, since the caller still needs it.
void MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                   Register rtm_counters_Reg,
                                   RTMLockingCounters* rtm_counters,
                                   Metadata* method_data,
                                   bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}
1422 1422
// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
// Note: abort_status_Reg is clobbered (masked to the retryable bits).
void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();  // back off briefly before retrying
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}
1443 1443
// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
// Spins up to RTMSpinLoopCount iterations waiting for the monitor owner
// field to become NULL before jumping back to retryLabel.
void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                            Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  // Clean monitor_value bit to get valid pointer
  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);  // spin budget exhausted; retry anyway
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);    // still owned: keep spinning

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}
1474 1474
// Use RTM for normal stack locks
// Input: objReg (object to lock)
// On success, execution continues inside the transaction and falls to
// DONE_LABEL with the lock elided; inflated monitors branch to IsInflated
// before the transaction begins.  tmpReg must be rax, scrReg must be rdx.
void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                       Register retry_on_abort_count_Reg,
                                       RTMLockingCounters* stack_rtm_counters,
                                       Metadata* method_data, bool profile_rtm,
                                       Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, 0));
  testptr(tmpReg, markOopDesc::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      // Sample only ~1/RTMTotalCountIncrRate executions.
      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  // Start the transaction; aborts resume at L_on_abort with status in rax.
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, 0));                 // fetch markword
  andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
  cmpptr(tmpReg, markOopDesc::unlocked_value);            // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);        // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    // Lock is busy: commit the (empty) transaction instead of aborting,
    // then fabricate a retryable status and take the retry path.
    xend();
    movptr(abort_status_Reg, 0x2);   // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
1531 1531
// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + 2(monitor_value))
// On exit, ZF communicates success to the caller (set on DONE_LABEL paths
// where the lock was acquired or elided).  tmpReg must be rax, scrReg rdx.
void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                          Register scrReg, Register retry_on_busy_count_Reg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  // Clean monitor_value bit to get valid pointer
  int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;

  // Without cast to int32_t a movptr will destroy r10 which is typically obj
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      // Sample only ~1/RTMTotalCountIncrRate executions.
      branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  // Start the transaction; aborts resume at L_on_abort with status in rax.
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, 0));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);   // owner is NULL: lock elided inside the transaction
  if (UseRTMXendForLockBusy) {
    // Lock is busy: commit the (empty) transaction and take the spin/retry path.
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset)) ;
  testptr(tmpReg, tmpReg) ;
  jccb(Assembler::notZero, L_decrement_retry) ;

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  if (os::is_MP()) {
    lock();
  }
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL) ;
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}
1618 1618
1619 1619 #endif // INCLUDE_RTM_OPT
1620 1620
1621 1621 // Fast_Lock and Fast_Unlock used by C2
1622 1622
1623 1623 // Because the transitions from emitted code to the runtime
1624 1624 // monitorenter/exit helper stubs are so slow it's critical that
1625 1625 // we inline both the stack-locking fast-path and the inflated fast path.
1626 1626 //
1627 1627 // See also: cmpFastLock and cmpFastUnlock.
1628 1628 //
1629 1629 // What follows is a specialized inline transliteration of the code
1630 1630 // in slow_enter() and slow_exit(). If we're concerned about I$ bloat
1631 1631 // another option would be to emit TrySlowEnter and TrySlowExit methods
1632 1632 // at startup-time. These methods would accept arguments as
1633 1633 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
1634 1634 // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply
1635 1635 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
1636 1636 // In practice, however, the # of lock sites is bounded and is usually small.
1637 1637 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
1638 1638 // if the processor uses simple bimodal branch predictors keyed by EIP,
1639 1639 // since the helper routines would be called from multiple synchronization
1640 1640 // sites.
1641 1641 //
1642 1642 // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
1643 1643 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
1644 1644 // to those specialized methods. That'd give us a mostly platform-independent
1645 1645 // implementation that the JITs could optimize and inline at their pleasure.
1646 1646 // Done correctly, the only time we'd need to cross to native code would be
1647 1647 // to park() or unpark() threads. We'd also need a few more unsafe operators
1648 1648 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
1649 1649 // (b) explicit barriers or fence operations.
1650 1650 //
1651 1651 // TODO:
1652 1652 //
1653 1653 // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
1654 1654 // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
1655 1655 // Given TLAB allocation, Self is usually manifested in a register, so passing it into
1656 1656 // the lock operators would typically be faster than reifying Self.
1657 1657 //
1658 1658 // * Ideally I'd define the primitives as:
1659 1659 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
1660 1660 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
1661 1661 // Unfortunately ADLC bugs prevent us from expressing the ideal form.
1662 1662 // Instead, we're stuck with a rather awkward and brittle register assignments below.
1663 1663 // Furthermore the register assignments are overconstrained, possibly resulting in
1664 1664 // sub-optimal code near the synchronization site.
1665 1665 //
1666 1666 // * Eliminate the sp-proximity tests and just use "== Self" tests instead.
1667 1667 // Alternately, use a better sp-proximity test.
1668 1668 //
1669 1669 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
1670 1670 // Either one is sufficient to uniquely identify a thread.
1671 1671 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
1672 1672 //
1673 1673 // * Intrinsify notify() and notifyAll() for the common cases where the
1674 1674 // object is locked by the calling thread but the waitlist is empty.
1675 1675 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
1676 1676 //
1677 1677 // * use jccb and jmpb instead of jcc and jmp to improve code density.
1678 1678 // But beware of excessive branch density on AMD Opterons.
1679 1679 //
1680 1680 // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
1681 1681 // or failure of the fast-path. If the fast-path fails then we pass
1682 1682 // control to the slow-path, typically in C. In Fast_Lock and
1683 1683 // Fast_Unlock we often branch to DONE_LABEL, just to find that C2
1684 1684 // will emit a conditional branch immediately after the node.
1685 1685 // So we have branches to branches and lots of ICC.ZF games.
1686 1686 // Instead, it might be better to have C2 pass a "FailureLabel"
1687 1687 // into Fast_Lock and Fast_Unlock. In the case of success, control
1688 1688 // will drop through the node. ICC.ZF is undefined at exit.
1689 1689 // In the case of failure, the node will branch directly to the
1690 1690 // FailureLabel
1691 1691
1692 1692
1693 1693 // obj: object to lock
1694 1694 // box: on-stack box address (displaced header location) - KILLED
1695 1695 // rax,: tmp -- KILLED
1696 1696 // scr: tmp -- KILLED
1697 1697 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
1698 1698 Register scrReg, Register cx1Reg, Register cx2Reg,
1699 1699 BiasedLockingCounters* counters,
1700 1700 RTMLockingCounters* rtm_counters,
1701 1701 RTMLockingCounters* stack_rtm_counters,
1702 1702 Metadata* method_data,
1703 1703 bool use_rtm, bool profile_rtm) {
1704 1704 // Ensure the register assignments are disjoint
1705 1705 assert(tmpReg == rax, "");
1706 1706
1707 1707 if (use_rtm) {
1708 1708 assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
1709 1709 } else {
1710 1710 assert(cx1Reg == noreg, "");
1711 1711 assert(cx2Reg == noreg, "");
1712 1712 assert_different_registers(objReg, boxReg, tmpReg, scrReg);
1713 1713 }
1714 1714
1715 1715 if (counters != NULL) {
1716 1716 atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
1717 1717 }
1718 1718 if (EmitSync & 1) {
1719 1719 // set box->dhw = unused_mark (3)
1720 1720 // Force all sync thru slow-path: slow_enter() and slow_exit()
1721 1721 movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1722 1722 cmpptr (rsp, (int32_t)NULL_WORD);
1723 1723 } else
1724 1724 if (EmitSync & 2) {
1725 1725 Label DONE_LABEL ;
1726 1726 if (UseBiasedLocking) {
1727 1727 // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
1728 1728 biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1729 1729 }
1730 1730
1731 1731 movptr(tmpReg, Address(objReg, 0)); // fetch markword
1732 1732 orptr (tmpReg, 0x1);
1733 1733 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
1734 1734 if (os::is_MP()) {
1735 1735 lock();
1736 1736 }
1737 1737 cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
1738 1738 jccb(Assembler::equal, DONE_LABEL);
1739 1739 // Recursive locking
1740 1740 subptr(tmpReg, rsp);
1741 1741 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1742 1742 movptr(Address(boxReg, 0), tmpReg);
1743 1743 bind(DONE_LABEL);
1744 1744 } else {
1745 1745 // Possible cases that we'll encounter in fast_lock
1746 1746 // ------------------------------------------------
1747 1747 // * Inflated
1748 1748 // -- unlocked
1749 1749 // -- Locked
1750 1750 // = by self
1751 1751 // = by other
1752 1752 // * biased
1753 1753 // -- by Self
1754 1754 // -- by other
1755 1755 // * neutral
1756 1756 // * stack-locked
1757 1757 // -- by self
1758 1758 // = sp-proximity test hits
1759 1759 // = sp-proximity test generates false-negative
1760 1760 // -- by other
1761 1761 //
↓ open down ↓ |
1761 lines elided |
↑ open up ↑ |
1762 1762
1763 1763 Label IsInflated, DONE_LABEL;
1764 1764
1765 1765 // it's stack-locked, biased or neutral
1766 1766 // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
1767 1767 // order to reduce the number of conditional branches in the most common cases.
1768 1768 // Beware -- there's a subtle invariant that fetch of the markword
1769 1769 // at [FETCH], below, will never observe a biased encoding (*101b).
1770 1770 // If this invariant is not held we risk exclusion (safety) failure.
1771 1771 if (UseBiasedLocking && !UseOptoBiasInlining) {
1772 - biased_locking_enter(boxReg, objReg, tmpReg, scrReg, true, DONE_LABEL, NULL, counters);
1772 + biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
1773 1773 }
1774 1774
1775 1775 #if INCLUDE_RTM_OPT
1776 1776 if (UseRTMForStackLocks && use_rtm) {
1777 1777 rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
1778 1778 stack_rtm_counters, method_data, profile_rtm,
1779 1779 DONE_LABEL, IsInflated);
1780 1780 }
1781 1781 #endif // INCLUDE_RTM_OPT
1782 1782
1783 1783 movptr(tmpReg, Address(objReg, 0)); // [FETCH]
1784 1784 testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
1785 1785 jccb(Assembler::notZero, IsInflated);
1786 1786
1787 1787 // Attempt stack-locking ...
1788 1788 orptr (tmpReg, markOopDesc::unlocked_value);
1789 1789 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
1790 1790 if (os::is_MP()) {
1791 1791 lock();
1792 1792 }
1793 1793 cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg
1794 1794 if (counters != NULL) {
1795 1795 cond_inc32(Assembler::equal,
1796 1796 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1797 1797 }
1798 1798 jcc(Assembler::equal, DONE_LABEL); // Success
1799 1799
1800 1800 // Recursive locking.
1801 1801 // The object is stack-locked: markword contains stack pointer to BasicLock.
1802 1802 // Locked by current thread if difference with current SP is less than one page.
1803 1803 subptr(tmpReg, rsp);
1804 1804 // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
1805 1805 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
1806 1806 movptr(Address(boxReg, 0), tmpReg);
1807 1807 if (counters != NULL) {
1808 1808 cond_inc32(Assembler::equal,
1809 1809 ExternalAddress((address)counters->fast_path_entry_count_addr()));
1810 1810 }
1811 1811 jmp(DONE_LABEL);
1812 1812
1813 1813 bind(IsInflated);
1814 1814 // The object is inflated. tmpReg contains pointer to ObjectMonitor* + 2(monitor_value)
1815 1815
1816 1816 #if INCLUDE_RTM_OPT
1817 1817 // Use the same RTM locking code in 32- and 64-bit VM.
1818 1818 if (use_rtm) {
1819 1819 rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
1820 1820 rtm_counters, method_data, profile_rtm, DONE_LABEL);
1821 1821 } else {
1822 1822 #endif // INCLUDE_RTM_OPT
1823 1823
1824 1824 #ifndef _LP64
1825 1825 // The object is inflated.
1826 1826 //
1827 1827 // TODO-FIXME: eliminate the ugly use of manifest constants:
1828 1828 // Use markOopDesc::monitor_value instead of "2".
1829 1829 // use markOop::unused_mark() instead of "3".
1830 1830 // The tmpReg value is an objectMonitor reference ORed with
1831 1831 // markOopDesc::monitor_value (2). We can either convert tmpReg to an
1832 1832 // objectmonitor pointer by masking off the "2" bit or we can just
1833 1833 // use tmpReg as an objectmonitor pointer but bias the objectmonitor
1834 1834 // field offsets with "-2" to compensate for and annul the low-order tag bit.
1835 1835 //
1836 1836 // I use the latter as it avoids AGI stalls.
1837 1837 // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
1838 1838 // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
1839 1839 //
1840 1840 #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)
1841 1841
1842 1842 // boxReg refers to the on-stack BasicLock in the current frame.
1843 1843 // We'd like to write:
1844 1844 // set box->_displaced_header = markOop::unused_mark(). Any non-0 value suffices.
1845 1845 // This is convenient but results a ST-before-CAS penalty. The following CAS suffers
1846 1846 // additional latency as we have another ST in the store buffer that must drain.
1847 1847
1848 1848 if (EmitSync & 8192) {
1849 1849 movptr(Address(boxReg, 0), 3); // results in ST-before-CAS penalty
1850 1850 get_thread (scrReg);
1851 1851 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
1852 1852 movptr(tmpReg, NULL_WORD); // consider: xor vs mov
1853 1853 if (os::is_MP()) {
1854 1854 lock();
1855 1855 }
1856 1856 cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1857 1857 } else
1858 1858 if ((EmitSync & 128) == 0) { // avoid ST-before-CAS
1859 1859 movptr(scrReg, boxReg);
1860 1860 movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
1861 1861
1862 1862 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1863 1863 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1864 1864 // prefetchw [eax + Offset(_owner)-2]
1865 1865 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1866 1866 }
1867 1867
1868 1868 if ((EmitSync & 64) == 0) {
1869 1869 // Optimistic form: consider XORL tmpReg,tmpReg
1870 1870 movptr(tmpReg, NULL_WORD);
1871 1871 } else {
1872 1872 // Can suffer RTS->RTO upgrades on shared or cold $ lines
1873 1873 // Test-And-CAS instead of CAS
1874 1874 movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); // rax, = m->_owner
1875 1875 testptr(tmpReg, tmpReg); // Locked ?
1876 1876 jccb (Assembler::notZero, DONE_LABEL);
1877 1877 }
1878 1878
1879 1879 // Appears unlocked - try to swing _owner from null to non-null.
1880 1880 // Ideally, I'd manifest "Self" with get_thread and then attempt
1881 1881 // to CAS the register containing Self into m->Owner.
1882 1882 // But we don't have enough registers, so instead we can either try to CAS
1883 1883 // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
1884 1884 // we later store "Self" into m->Owner. Transiently storing a stack address
1885 1885 // (rsp or the address of the box) into m->owner is harmless.
1886 1886 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
1887 1887 if (os::is_MP()) {
1888 1888 lock();
1889 1889 }
1890 1890 cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1891 1891 movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
1892 1892 jccb (Assembler::notZero, DONE_LABEL);
1893 1893 get_thread (scrReg); // beware: clobbers ICCs
1894 1894 movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg);
1895 1895 xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success
1896 1896
1897 1897 // If the CAS fails we can either retry or pass control to the slow-path.
1898 1898 // We use the latter tactic.
1899 1899 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1900 1900 // If the CAS was successful ...
1901 1901 // Self has acquired the lock
1902 1902 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1903 1903 // Intentional fall-through into DONE_LABEL ...
1904 1904 } else {
1905 1905 movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark())); // results in ST-before-CAS penalty
1906 1906 movptr(boxReg, tmpReg);
1907 1907
1908 1908 // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
1909 1909 if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
1910 1910 // prefetchw [eax + Offset(_owner)-2]
1911 1911 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
1912 1912 }
1913 1913
1914 1914 if ((EmitSync & 64) == 0) {
1915 1915 // Optimistic form
1916 1916 xorptr (tmpReg, tmpReg);
1917 1917 } else {
1918 1918 // Can suffer RTS->RTO upgrades on shared or cold $ lines
1919 1919 movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)); // rax, = m->_owner
1920 1920 testptr(tmpReg, tmpReg); // Locked ?
1921 1921 jccb (Assembler::notZero, DONE_LABEL);
1922 1922 }
1923 1923
1924 1924 // Appears unlocked - try to swing _owner from null to non-null.
1925 1925 // Use either "Self" (in scr) or rsp as thread identity in _owner.
1926 1926 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
1927 1927 get_thread (scrReg);
1928 1928 if (os::is_MP()) {
1929 1929 lock();
1930 1930 }
1931 1931 cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1932 1932
1933 1933 // If the CAS fails we can either retry or pass control to the slow-path.
1934 1934 // We use the latter tactic.
1935 1935 // Pass the CAS result in the icc.ZFlag into DONE_LABEL
1936 1936 // If the CAS was successful ...
1937 1937 // Self has acquired the lock
1938 1938 // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
1939 1939 // Intentional fall-through into DONE_LABEL ...
1940 1940 }
1941 1941 #else // _LP64
1942 1942 // It's inflated
1943 1943
1944 1944 // TODO: someday avoid the ST-before-CAS penalty by
1945 1945 // relocating (deferring) the following ST.
1946 1946 // We should also think about trying a CAS without having
1947 1947 // fetched _owner. If the CAS is successful we may
1948 1948 // avoid an RTO->RTS upgrade on the $line.
1949 1949
1950 1950 // Without cast to int32_t a movptr will destroy r10 which is typically obj
1951 1951 movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
1952 1952
1953 1953 movptr (boxReg, tmpReg);
1954 1954 movptr (tmpReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1955 1955 testptr(tmpReg, tmpReg);
1956 1956 jccb (Assembler::notZero, DONE_LABEL);
1957 1957
1958 1958 // It's inflated and appears unlocked
1959 1959 if (os::is_MP()) {
1960 1960 lock();
1961 1961 }
1962 1962 cmpxchgptr(r15_thread, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2));
1963 1963 // Intentional fall-through into DONE_LABEL ...
1964 1964 #endif // _LP64
1965 1965
1966 1966 #if INCLUDE_RTM_OPT
1967 1967 } // use_rtm()
1968 1968 #endif
1969 1969 // DONE_LABEL is a hot target - we'd really like to place it at the
1970 1970 // start of cache line by padding with NOPs.
1971 1971 // See the AMD and Intel software optimization manuals for the
1972 1972 // most efficient "long" NOP encodings.
1973 1973 // Unfortunately none of our alignment mechanisms suffice.
1974 1974 bind(DONE_LABEL);
1975 1975
1976 1976 // At DONE_LABEL the icc ZFlag is set as follows ...
1977 1977 // Fast_Unlock uses the same protocol.
1978 1978 // ZFlag == 1 -> Success
1979 1979 // ZFlag == 0 -> Failure - force control through the slow-path
1980 1980 }
1981 1981 }
1982 1982
1983 1983 // obj: object to unlock
1984 1984 // box: box address (displaced header location), killed. Must be EAX.
1985 1985 // tmp: killed, cannot be obj nor box.
1986 1986 //
1987 1987 // Some commentary on balanced locking:
1988 1988 //
1989 1989 // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
1990 1990 // Methods that don't have provably balanced locking are forced to run in the
1991 1991 // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
1992 1992 // The interpreter provides two properties:
1993 1993 // I1: At return-time the interpreter automatically and quietly unlocks any
1994 1994 // objects acquired by the current activation (frame). Recall that the
1995 1995 // interpreter maintains an on-stack list of locks currently held by
1996 1996 // a frame.
1997 1997 // I2: If a method attempts to unlock an object that is not held by the
1998 1998 // frame, the interpreter throws IMSX.
1999 1999 //
2000 2000 // Lets say A(), which has provably balanced locking, acquires O and then calls B().
2001 2001 // B() doesn't have provably balanced locking so it runs in the interpreter.
2002 2002 // Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
2003 2003 // is still locked by A().
2004 2004 //
2005 2005 // The only other source of unbalanced locking would be JNI. The "Java Native Interface:
2006 2006 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
2007 2007 // should not be unlocked by "normal" java-level locking and vice-versa. The specification
2008 2008 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
2009 2009
2010 2010 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
2011 2011 assert(boxReg == rax, "");
2012 2012 assert_different_registers(objReg, boxReg, tmpReg);
2013 2013
2014 2014 if (EmitSync & 4) {
2015 2015 // Disable - inhibit all inlining. Force control through the slow-path
2016 2016 cmpptr (rsp, 0);
2017 2017 } else
2018 2018 if (EmitSync & 8) {
2019 2019 Label DONE_LABEL;
2020 2020 if (UseBiasedLocking) {
2021 2021 biased_locking_exit(objReg, tmpReg, DONE_LABEL);
2022 2022 }
2023 2023 // Classic stack-locking code ...
2024 2024 // Check whether the displaced header is 0
2025 2025 //(=> recursive unlock)
2026 2026 movptr(tmpReg, Address(boxReg, 0));
2027 2027 testptr(tmpReg, tmpReg);
2028 2028 jccb(Assembler::zero, DONE_LABEL);
2029 2029 // If not recursive lock, reset the header to displaced header
2030 2030 if (os::is_MP()) {
2031 2031 lock();
2032 2032 }
2033 2033 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2034 2034 bind(DONE_LABEL);
2035 2035 } else {
2036 2036 Label DONE_LABEL, Stacked, CheckSucc;
2037 2037
2038 2038 // Critically, the biased locking test must have precedence over
2039 2039 // and appear before the (box->dhw == 0) recursive stack-lock test.
2040 2040 if (UseBiasedLocking && !UseOptoBiasInlining) {
2041 2041 biased_locking_exit(objReg, tmpReg, DONE_LABEL);
2042 2042 }
2043 2043
2044 2044 #if INCLUDE_RTM_OPT
2045 2045 if (UseRTMForStackLocks && use_rtm) {
2046 2046 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2047 2047 Label L_regular_unlock;
2048 2048 movptr(tmpReg, Address(objReg, 0)); // fetch markword
2049 2049 andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits
2050 2050 cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked
2051 2051 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock
2052 2052 xend(); // otherwise end...
2053 2053 jmp(DONE_LABEL); // ... and we're done
2054 2054 bind(L_regular_unlock);
2055 2055 }
2056 2056 #endif
2057 2057
2058 2058 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header
2059 2059 jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock
2060 2060 movptr(tmpReg, Address(objReg, 0)); // Examine the object's markword
2061 2061 testptr(tmpReg, markOopDesc::monitor_value); // Inflated?
2062 2062 jccb (Assembler::zero, Stacked);
2063 2063
2064 2064 // It's inflated.
2065 2065 #if INCLUDE_RTM_OPT
2066 2066 if (use_rtm) {
2067 2067 Label L_regular_inflated_unlock;
2068 2068 // Clean monitor_value bit to get valid pointer
2069 2069 int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markOopDesc::monitor_value;
2070 2070 movptr(boxReg, Address(tmpReg, owner_offset));
2071 2071 testptr(boxReg, boxReg);
2072 2072 jccb(Assembler::notZero, L_regular_inflated_unlock);
2073 2073 xend();
2074 2074 jmpb(DONE_LABEL);
2075 2075 bind(L_regular_inflated_unlock);
2076 2076 }
2077 2077 #endif
2078 2078
2079 2079 // Despite our balanced locking property we still check that m->_owner == Self
2080 2080 // as java routines or native JNI code called by this thread might
2081 2081 // have released the lock.
2082 2082 // Refer to the comments in synchronizer.cpp for how we might encode extra
2083 2083 // state in _succ so we can avoid fetching EntryList|cxq.
2084 2084 //
2085 2085 // I'd like to add more cases in fast_lock() and fast_unlock() --
2086 2086 // such as recursive enter and exit -- but we have to be wary of
2087 2087 // I$ bloat, T$ effects and BP$ effects.
2088 2088 //
2089 2089 // If there's no contention try a 1-0 exit. That is, exit without
2090 2090 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
2091 2091 // we detect and recover from the race that the 1-0 exit admits.
2092 2092 //
2093 2093 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
2094 2094 // before it STs null into _owner, releasing the lock. Updates
2095 2095 // to data protected by the critical section must be visible before
2096 2096 // we drop the lock (and thus before any other thread could acquire
2097 2097 // the lock and observe the fields protected by the lock).
2098 2098 // IA32's memory-model is SPO, so STs are ordered with respect to
2099 2099 // each other and there's no need for an explicit barrier (fence).
2100 2100 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
2101 2101 #ifndef _LP64
2102 2102 get_thread (boxReg);
2103 2103 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
2104 2104 // prefetchw [ebx + Offset(_owner)-2]
2105 2105 prefetchw(Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2106 2106 }
2107 2107
2108 2108 // Note that we could employ various encoding schemes to reduce
2109 2109 // the number of loads below (currently 4) to just 2 or 3.
2110 2110 // Refer to the comments in synchronizer.cpp.
2111 2111 // In practice the chain of fetches doesn't seem to impact performance, however.
2112 2112 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
2113 2113 // Attempt to reduce branch density - AMD's branch predictor.
2114 2114 xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2115 2115 orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
2116 2116 orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
2117 2117 orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
2118 2118 jccb (Assembler::notZero, DONE_LABEL);
2119 2119 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
2120 2120 jmpb (DONE_LABEL);
2121 2121 } else {
2122 2122 xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2123 2123 orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
2124 2124 jccb (Assembler::notZero, DONE_LABEL);
2125 2125 movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
2126 2126 orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
2127 2127 jccb (Assembler::notZero, CheckSucc);
2128 2128 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
2129 2129 jmpb (DONE_LABEL);
2130 2130 }
2131 2131
2132 2132 // The following code fragment (EmitSync & 65536) improves the performance of
2133 2133 // contended applications and contended synchronization microbenchmarks.
2134 2134 // Unfortunately the emission of the code - even though not executed - causes regressions
2135 2135 // in scimark and jetstream, evidently because of $ effects. Replacing the code
2136 2136 // with an equal number of never-executed NOPs results in the same regression.
2137 2137 // We leave it off by default.
2138 2138
2139 2139 if ((EmitSync & 65536) != 0) {
2140 2140 Label LSuccess, LGoSlowPath ;
2141 2141
2142 2142 bind (CheckSucc);
2143 2143
2144 2144 // Optional pre-test ... it's safe to elide this
2145 2145 if ((EmitSync & 16) == 0) {
2146 2146 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
2147 2147 jccb (Assembler::zero, LGoSlowPath);
2148 2148 }
2149 2149
2150 2150 // We have a classic Dekker-style idiom:
2151 2151 // ST m->_owner = 0 ; MEMBAR; LD m->_succ
2152 2152 // There are a number of ways to implement the barrier:
2153 2153 // (1) lock:andl &m->_owner, 0
2154 2154 // is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
2155 2155 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0
2156 2156 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
2157 2157 // (2) If supported, an explicit MFENCE is appealing.
2158 2158 // In older IA32 processors MFENCE is slower than lock:add or xchg
2159 2159 // particularly if the write-buffer is full as might be the case if
2160 2160 // if stores closely precede the fence or fence-equivalent instruction.
2161 2161 // In more modern implementations MFENCE appears faster, however.
2162 2162 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
2163 2163 // The $lines underlying the top-of-stack should be in M-state.
2164 2164 // The locked add instruction is serializing, of course.
2165 2165 // (4) Use xchg, which is serializing
2166 2166 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
2167 2167 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
2168 2168 // The integer condition codes will tell us if succ was 0.
2169 2169 // Since _succ and _owner should reside in the same $line and
2170 2170 // we just stored into _owner, it's likely that the $line
2171 2171 // remains in M-state for the lock:orl.
2172 2172 //
2173 2173 // We currently use (3), although it's likely that switching to (2)
2174 2174 // is correct for the future.
2175 2175
2176 2176 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD);
2177 2177 if (os::is_MP()) {
2178 2178 if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
2179 2179 mfence();
2180 2180 } else {
2181 2181 lock (); addptr(Address(rsp, 0), 0);
2182 2182 }
2183 2183 }
2184 2184 // Ratify _succ remains non-null
2185 2185 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0);
2186 2186 jccb (Assembler::notZero, LSuccess);
2187 2187
2188 2188 xorptr(boxReg, boxReg); // box is really EAX
2189 2189 if (os::is_MP()) { lock(); }
2190 2190 cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2191 2191 jccb (Assembler::notEqual, LSuccess);
2192 2192 // Since we're low on registers we installed rsp as a placeholder in _owner.
2193 2193 // Now install Self over rsp. This is safe as we're transitioning from
2194 2194 // non-null to non-null
2195 2195 get_thread (boxReg);
2196 2196 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg);
2197 2197 // Intentional fall-through into LGoSlowPath ...
2198 2198
2199 2199 bind (LGoSlowPath);
2200 2200 orptr(boxReg, 1); // set ICC.ZF=0 to indicate failure
2201 2201 jmpb (DONE_LABEL);
2202 2202
2203 2203 bind (LSuccess);
2204 2204 xorptr(boxReg, boxReg); // set ICC.ZF=1 to indicate success
2205 2205 jmpb (DONE_LABEL);
2206 2206 }
2207 2207
2208 2208 bind (Stacked);
2209 2209 // It's not inflated and it's not recursively stack-locked and it's not biased.
2210 2210 // It must be stack-locked.
2211 2211 // Try to reset the header to displaced header.
2212 2212 // The "box" value on the stack is stable, so we can reload
2213 2213 // and be assured we observe the same value as above.
2214 2214 movptr(tmpReg, Address(boxReg, 0));
2215 2215 if (os::is_MP()) {
2216 2216 lock();
2217 2217 }
2218 2218 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2219 2219 // Intentional fall-through into DONE_LABEL
2220 2220
2221 2221 // DONE_LABEL is a hot target - we'd really like to place it at the
2222 2222 // start of cache line by padding with NOPs.
2223 2223 // See the AMD and Intel software optimization manuals for the
2224 2224 // most efficient "long" NOP encodings.
2225 2225 // Unfortunately none of our alignment mechanisms suffice.
2226 2226 if ((EmitSync & 65536) == 0) {
2227 2227 bind (CheckSucc);
2228 2228 }
2229 2229 #else // _LP64
2230 2230 // It's inflated
2231 2231 movptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2232 2232 xorptr(boxReg, r15_thread);
2233 2233 orptr (boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2));
2234 2234 jccb (Assembler::notZero, DONE_LABEL);
2235 2235 movptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2));
2236 2236 orptr (boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2));
2237 2237 jccb (Assembler::notZero, CheckSucc);
2238 2238 movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
2239 2239 jmpb (DONE_LABEL);
2240 2240
2241 2241 if ((EmitSync & 65536) == 0) {
2242 2242 Label LSuccess, LGoSlowPath ;
2243 2243 bind (CheckSucc);
2244 2244 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
2245 2245 jccb (Assembler::zero, LGoSlowPath);
2246 2246
2247 2247 // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
2248 2248 // the explicit ST;MEMBAR combination, but masm doesn't currently support
2249 2249 // "ANDQ M,IMM". Don't use MFENCE here. lock:add to TOS, xchg, etc
2250 2250 // are all faster when the write buffer is populated.
2251 2251 movptr (Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), (int32_t)NULL_WORD);
2252 2252 if (os::is_MP()) {
2253 2253 lock (); addl (Address(rsp, 0), 0);
2254 2254 }
2255 2255 cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), (int32_t)NULL_WORD);
2256 2256 jccb (Assembler::notZero, LSuccess);
2257 2257
2258 2258 movptr (boxReg, (int32_t)NULL_WORD); // box is really EAX
2259 2259 if (os::is_MP()) { lock(); }
2260 2260 cmpxchgptr(r15_thread, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
2261 2261 jccb (Assembler::notEqual, LSuccess);
2262 2262 // Intentional fall-through into slow-path
2263 2263
2264 2264 bind (LGoSlowPath);
2265 2265 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
2266 2266 jmpb (DONE_LABEL);
2267 2267
2268 2268 bind (LSuccess);
2269 2269 testl (boxReg, 0); // set ICC.ZF=1 to indicate success
2270 2270 jmpb (DONE_LABEL);
2271 2271 }
2272 2272
2273 2273 bind (Stacked);
2274 2274 movptr(tmpReg, Address (boxReg, 0)); // re-fetch
2275 2275 if (os::is_MP()) { lock(); }
2276 2276 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box
2277 2277
2278 2278 if (EmitSync & 65536) {
2279 2279 bind (CheckSucc);
2280 2280 }
2281 2281 #endif
2282 2282 bind(DONE_LABEL);
2283 2283 // Avoid branch to branch on AMD processors
2284 2284 if (EmitSync & 32768) {
2285 2285 nop();
2286 2286 }
2287 2287 }
2288 2288 }
2289 2289 #endif // COMPILER2
2290 2290
// Normalize a C-style boolean held in x to a canonical 0/1 value.
void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);                    // isolate the low byte
  setb(Assembler::notZero, x);      // x = (low byte != 0) ? 1 : 0
}
2299 2299
// Wouldn't need if AddressLiteral version had new name
// Emit a call to a label (forward or backward) with the given reloc type.
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}
2304 2304
// Emit an indirect call through the given register.
void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}
2308 2308
2309 2309 void MacroAssembler::call(AddressLiteral entry) {
2310 2310 if (reachable(entry)) {
2311 2311 Assembler::call_literal(entry.target(), entry.rspec());
2312 2312 } else {
2313 2313 lea(rscratch1, entry);
2314 2314 Assembler::call(rscratch1);
2315 2315 }
2316 2316 }
2317 2317
// Emit an inline-cache call: load the non-oop sentinel into rax (the
// inline-cache register) and call the entry with a virtual-call reloc
// anchored at the current pc.
void MacroAssembler::ic_call(address entry) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc());
  movptr(rax, (intptr_t)Universe::non_oop_word());   // IC cache sentinel in rax
  call(AddressLiteral(entry, rh));
}
2323 2323
2324 2324 // Implementation of call_VM versions
2325 2325
// call_VM with no Java arguments. The call(C)/jmp(E) trampoline pushes a
// return address just below rsp, which call_VM_helper later uses as
// last_Java_pc (see comment in call_VM_helper).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);   // pushes return address used as last_Java_pc
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}
2339 2339
// call_VM with one Java argument; uses the same trampoline trick as the
// zero-argument variant to capture last_Java_pc.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);   // pushes return address used as last_Java_pc
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}
2355 2355
// call_VM with two Java arguments. Arguments are passed in reverse order
// so a later pass_arg cannot clobber a register an earlier argument still
// needs (the asserts check for such smashing on LP64).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);   // pushes return address used as last_Java_pc
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}
2376 2376
// call_VM with three Java arguments; reverse-order passing as above so no
// earlier argument register is smashed by a later pass_arg.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);   // pushes return address used as last_Java_pc
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}
2402 2402
// call_VM variant with an explicit last_java_sp; delegates directly to
// call_VM_base with the platform thread register (r15 on LP64, resolved
// inside call_VM_base on 32-bit).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
2411 2411
// Explicit-last_java_sp call_VM with one Java argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
2420 2420
// Explicit-last_java_sp call_VM with two Java arguments; reverse-order
// passing so pass_arg2 cannot clobber arg_1's register.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
2433 2433
// Explicit-last_java_sp call_VM with three Java arguments; reverse-order
// passing with smash checks as in the other multi-arg variants.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
2449 2449
// Like call_VM but invokes MacroAssembler::call_VM_base explicitly,
// bypassing any virtual override in a subclass assembler.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
2458 2458
// super_call_VM with one Java argument.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
2467 2467
// super_call_VM with two Java arguments; reverse-order passing with a
// smash check, as in call_VM.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
2480 2480
// super_call_VM with three Java arguments; reverse-order passing with
// smash checks, as in call_VM.
void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
2496 2496
// Core of every call_VM variant. Establishes the last Java frame anchor,
// calls the VM entry point with the current JavaThread as the implicit
// first C argument, then tears the anchor down, forwards any pending
// exception, and fetches the oop result if one was requested.
//   oop_result          - register to receive the VM's oop result (noreg if none)
//   java_thread         - thread register; noreg means "resolve it here"
//   last_java_sp        - sp value recorded in the frame anchor; noreg => rsp
//   number_of_arguments - C arguments already staged by pass_arg*
//   check_exceptions    - whether to jump to forward_exception on a pending one
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address entry_point,
                                  int number_of_arguments,
                                  bool check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true, false);

#ifndef CC_INTERP
   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);
#endif /* CC_INTERP */

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}
2595 2595
// Compute last_Java_sp for the trampoline-style call_VM variants and
// forward to call_VM_base. rax receives the sp value the frame anchor
// should record.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp
  // somewhat subtle. call_VM does an intermediate call
  // which places a return address on the stack just under the
  // stack pointer as the user finished with it. This allows
  // use to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM only can use register args
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.

#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);

}
2618 2618
// Leaf call into the VM: no frame anchor, no exception check.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
2622 2622
// Leaf call with one argument.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}
2627 2627
// Leaf call with two arguments; reverse-order passing so pass_arg1
// cannot clobber arg_0's register (asserted on LP64).
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}
2635 2635
// Leaf call with three arguments; reverse-order passing with smash checks.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}
2645 2645
// Leaf call bypassing subclass overrides of call_VM_leaf_base; one argument.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
2650 2650
// super_call_VM_leaf with two arguments; reverse-order passing.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
2658 2658
// super_call_VM_leaf with three arguments; reverse-order passing with
// smash checks.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
2668 2668
// super_call_VM_leaf with four arguments; reverse-order passing with
// smash checks.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
2682 2682
// Fetch the oop result the VM stored in the thread, clear the slot so a
// GC cannot see a stale value, and verify the oop in debug builds.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
  verify_oop(oop_result, "broken oop in call_VM_base");
}
2688 2688
// Fetch the metadata result the VM stored in the thread and clear the
// slot. No oop verification: the value is a Metadata*, not an oop.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD);
}
2693 2693
// Intentionally empty here; overridden by the interpreter's assembler to
// handle early-return requests after a VM call.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) {
}
2696 2696
// Intentionally empty here; overridden by the interpreter's assembler to
// handle pop-frame requests after a VM call.
void MacroAssembler::check_and_handle_popframe(Register java_thread) {
}
2699 2699
2700 2700 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
2701 2701 if (reachable(src1)) {
2702 2702 cmpl(as_Address(src1), imm);
2703 2703 } else {
2704 2704 lea(rscratch1, src1);
2705 2705 cmpl(Address(rscratch1, 0), imm);
2706 2706 }
2707 2707 }
2708 2708
2709 2709 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
2710 2710 assert(!src2.is_lval(), "use cmpptr");
2711 2711 if (reachable(src2)) {
2712 2712 cmpl(src1, as_Address(src2));
2713 2713 } else {
2714 2714 lea(rscratch1, src2);
2715 2715 cmpl(src1, Address(rscratch1, 0));
2716 2716 }
2717 2717 }
2718 2718
// 32-bit register/immediate compare.
void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}
2722 2722
// 32-bit register/memory compare.
void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}
2726 2726
// Compare two doubles and materialize the three-way result (-1/0/1) in
// dst. unordered_is_less selects how a NaN comparison is folded: into -1
// or into +1. The jcc chain resolves parity (unordered), then the
// less/greater case, then equality.
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);    // unordered => -1
    jcc(Assembler::below , L);    // opr1 < opr2 => -1
    movl(dst, 0);
    jcc(Assembler::equal , L);    // equal => 0
    increment(dst);               // otherwise greater => 1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);    // unordered => 1
    jcc(Assembler::above , L);    // opr1 > opr2 => 1
    movl(dst, 0);
    jcc(Assembler::equal , L);    // equal => 0
    decrementl(dst);              // otherwise less => -1
  }
  bind(L);
}
2748 2748
// Float analogue of cmpsd2int: compare two floats and materialize the
// three-way result (-1/0/1) in dst, with NaN folded per unordered_is_less.
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);    // unordered => -1
    jcc(Assembler::below , L);    // opr1 < opr2 => -1
    movl(dst, 0);
    jcc(Assembler::equal , L);    // equal => 0
    increment(dst);               // otherwise greater => 1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);    // unordered => 1
    jcc(Assembler::above , L);    // opr1 > opr2 => 1
    movl(dst, 0);
    jcc(Assembler::equal , L);    // equal => 0
    decrementl(dst);              // otherwise less => -1
  }
  bind(L);
}
2770 2770
2771 2771
2772 2772 void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
2773 2773 if (reachable(src1)) {
2774 2774 cmpb(as_Address(src1), imm);
2775 2775 } else {
2776 2776 lea(rscratch1, src1);
2777 2777 cmpb(Address(rscratch1, 0), imm);
2778 2778 }
2779 2779 }
2780 2780
// Pointer-width compare of a register against an AddressLiteral. An lval
// literal is compared as a value (the address itself); otherwise the
// literal names a memory location to compare against, reached either
// directly or through rscratch1 on LP64.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
#ifdef _LP64
  if (src2.is_lval()) {
    movptr(rscratch1, src2);          // compare against the literal address itself
    Assembler::cmpq(src1, rscratch1);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);             // far target: indirect through scratch
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
#else
  if (src2.is_lval()) {
    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
  } else {
    cmpl(src1, as_Address(src2));
  }
#endif // _LP64
}
2800 2800
// Compare a memory word against a literal address (must be an lval:
// true mem-mem compares are not encodable on x86).
void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
  assert(src2.is_lval(), "not a mem-mem compare");
#ifdef _LP64
  // moves src2's literal address
  movptr(rscratch1, src2);
  Assembler::cmpq(src1, rscratch1);
#else
  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
#endif // _LP64
}
2811 2811
2812 2812 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
2813 2813 if (reachable(adr)) {
2814 2814 if (os::is_MP())
2815 2815 lock();
2816 2816 cmpxchgptr(reg, as_Address(adr));
2817 2817 } else {
2818 2818 lea(rscratch1, adr);
2819 2819 if (os::is_MP())
2820 2820 lock();
2821 2821 cmpxchgptr(reg, Address(rscratch1, 0));
2822 2822 }
2823 2823 }
2824 2824
// Pointer-width compare-and-exchange: cmpxchgq on LP64, cmpxchgl on 32-bit.
void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
}
2828 2828
2829 2829 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
2830 2830 if (reachable(src)) {
2831 2831 Assembler::comisd(dst, as_Address(src));
2832 2832 } else {
2833 2833 lea(rscratch1, src);
2834 2834 Assembler::comisd(dst, Address(rscratch1, 0));
2835 2835 }
2836 2836 }
2837 2837
2838 2838 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
2839 2839 if (reachable(src)) {
2840 2840 Assembler::comiss(dst, as_Address(src));
2841 2841 } else {
2842 2842 lea(rscratch1, src);
2843 2843 Assembler::comiss(dst, Address(rscratch1, 0));
2844 2844 }
2845 2845 }
2846 2846
2847 2847
// Conditionally increment a 32-bit counter in memory: branch around the
// increment on the negated condition, preserving EFLAGS across the
// atomic increment with pushf/popf.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  pushf(); // Preserve flags
  atomic_incl(counter_addr);
  popf();
  bind(L);
}
2857 2857
// Emit a correctness-checked 32-bit idiv/irem sequence and return the pc
// offset of the idivl instruction (for implicit divide-by-zero traps).
// The special case min_int / -1 must not execute idivl (it would #DE on
// x86), so it is routed around it with rdx pre-set to 0 and rax already
// holding min_int.
int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)       min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdql();                    // sign-extend rax into rdx:rax
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}
2893 2893
2894 2894
2895 2895
2896 2896 void MacroAssembler::decrementl(Register reg, int value) {
2897 2897 if (value == min_jint) {subl(reg, value) ; return; }
2898 2898 if (value < 0) { incrementl(reg, -value); return; }
2899 2899 if (value == 0) { ; return; }
2900 2900 if (value == 1 && UseIncDec) { decl(reg) ; return; }
2901 2901 /* else */ { subl(reg, value) ; return; }
2902 2902 }
2903 2903
2904 2904 void MacroAssembler::decrementl(Address dst, int value) {
2905 2905 if (value == min_jint) {subl(dst, value) ; return; }
2906 2906 if (value < 0) { incrementl(dst, -value); return; }
2907 2907 if (value == 0) { ; return; }
2908 2908 if (value == 1 && UseIncDec) { decl(dst) ; return; }
2909 2909 /* else */ { subl(dst, value) ; return; }
2910 2910 }
2911 2911
// Signed division by a power of two via arithmetic shift. A plain sar
// rounds toward negative infinity, so for negative dividends the bias
// (2^shift - 1) is added first to get round-toward-zero semantics.
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}
2928 2928
2929 2929 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
2930 2930 if (reachable(src)) {
2931 2931 Assembler::divsd(dst, as_Address(src));
2932 2932 } else {
2933 2933 lea(rscratch1, src);
2934 2934 Assembler::divsd(dst, Address(rscratch1, 0));
2935 2935 }
2936 2936 }
2937 2937
2938 2938 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
2939 2939 if (reachable(src)) {
2940 2940 Assembler::divss(dst, as_Address(src));
2941 2941 } else {
2942 2942 lea(rscratch1, src);
2943 2943 Assembler::divss(dst, Address(rscratch1, 0));
2944 2944 }
2945 2945 }
2946 2946
2947 2947 // !defined(COMPILER2) is because of stupid core builds
2948 2948 #if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
2949 2949 void MacroAssembler::empty_FPU_stack() {
2950 2950 if (VM_Version::supports_mmx()) {
2951 2951 emms();
2952 2952 } else {
2953 2953 for (int i = 8; i-- > 0; ) ffree(i);
2954 2954 }
2955 2955 }
2956 2956 #endif // !LP64 || C1 || !C2
2957 2957
2958 2958
// Defines obj, preserves var_size_in_bytes
// Inline eden allocation via a CAS retry loop on the shared heap top.
// On success obj holds the start of the newly claimed space; jumps to
// slow_case when inline allocation is unsupported or the heap end would
// be exceeded. obj must be rax because cmpxchg implicitly uses it.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert(obj == rax, "obj must be in rax, for cmpxchg");
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    jmp(slow_case);
  } else {
    Register end = t1;
    Label retry;
    bind(retry);
    ExternalAddress heap_top((address) Universe::heap()->top_addr());
    movptr(obj, heap_top);          // obj = current top
    if (var_size_in_bytes == noreg) {
      lea(end, Address(obj, con_size_in_bytes));
    } else {
      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
    }
    // if end < obj then we wrapped around => object too long => slow case
    cmpptr(end, obj);
    jcc(Assembler::below, slow_case);
    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
    jcc(Assembler::above, slow_case);
    // Compare obj with the top addr, and if still equal, store the new top addr in
    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
    // it otherwise. Use lock prefix for atomicity on MPs.
    locked_cmpxchgptr(end, heap_top);
    jcc(Assembler::notEqual, retry);   // lost the race: reload top and retry
  }
}
2992 2992
// Standard frame prologue: save the caller's rbp and establish the new
// frame pointer.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}
2997 2997
// A 5 byte nop that is safe for patching (see patch_verified_entry)
// When address NOPs are unavailable, emits four segment-override
// prefixes followed by a 1-byte nop — a single 5-byte instruction.
void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
    emit_int8(0x26); // es:
    emit_int8(0x2e); // cs:
    emit_int8(0x64); // fs:
    emit_int8(0x65); // gs:
    emit_int8((unsigned char)0x90);  // nop
  }
}
3010 3010
// Compare st(0) with st(1) and pop both (the common case).
void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}
3014 3014
// x87 compare of st(0) against st(index), with optional popping of one
// or both operands. With CMOV support the fucomi family sets EFLAGS
// directly (tmp unused); otherwise the FPU status word is routed through
// rax via fnstsw/sahf, so a temp register is required.
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}
3050 3050
// Three-way x87 compare into dst with both operands popped (common case).
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}
3054 3054
// Perform an x87 compare (via fcmp) and materialize the three-way result
// (-1/0/1) in dst, folding a NaN comparison into -1 or +1 according to
// unordered_is_less — same flag-resolution pattern as cmpsd2int.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);    // unordered => -1
    jcc(Assembler::below , L);    // x < y => -1
    movl(dst, 0);
    jcc(Assembler::equal , L);    // x == y => 0
    increment(dst);               // otherwise => 1
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);    // unordered => 1
    jcc(Assembler::above , L);    // x > y => 1
    movl(dst, 0);
    jcc(Assembler::equal , L);    // x == y => 0
    decrementl(dst);              // otherwise => -1
  }
  bind(L);
}
3075 3075
// Push a double from the literal address onto the x87 stack.
void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}
3079 3079
// Push a float from the literal address onto the x87 stack.
void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}
3083 3083
// Push an 80-bit extended-precision value from the literal address onto
// the x87 stack.
void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
}
3087 3087
// Load the x87 control word from the literal address.
void MacroAssembler::fldcw(AddressLiteral src) {
  Assembler::fldcw(as_Address(src));
}
3091 3091
// Core of fast_pow/fast_exp: with X on the x87 stack, compute 2^X by
// splitting X into int(X) + frac, using f2xm1 for the fractional part
// and building 2^int(X) directly in a double's exponent bits on the
// stack. Out-of-range exponents produce NaN so callers fall back to the
// slow path. Clobbers rax, rcx, rdx and a jdouble of stack space.
void MacroAssembler::pow_exp_core_encoding() {
  // kills rax, rcx, rdx
  subptr(rsp,sizeof(jdouble));
  // computes 2^X. Stack: X ...
  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
  // keep it on the thread's stack to compute 2^int(X) later
  // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
  fld_s(0);                 // Stack: X X ...
  frndint();                // Stack: int(X) X ...
  fsuba(1);                 // Stack: int(X) X-int(X) ...
  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
  faddp(1);                 // Stack: 2^(X-int(X))
  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
  // shift int(X)+1023 to exponent position.
  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
  // values so detect them and set result to NaN.
  movl(rax,Address(rsp,0));
  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
  addl(rax, 1023);
  movl(rdx,rax);
  shll(rax,20);
  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
  addl(rdx,1);
  // Check that 1 < int(X)+1023+1 < 2048
  // in 3 steps:
  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
  // 2- (int(X)+1023+1)&-2048 != 0
  // 3- (int(X)+1023+1)&-2048 != 1
  // Do 2- first because addl just updated the flags.
  cmov32(Assembler::equal,rax,rcx);
  cmpl(rdx,1);
  cmov32(Assembler::equal,rax,rcx);
  testl(rdx,rcx);
  cmov32(Assembler::notEqual,rax,rcx);
  movl(Address(rsp,4),rax);         // high word: sign/exponent of 2^int(X)
  movl(Address(rsp,0),0);           // low word: zero mantissa bits
  fmul_d(Address(rsp,0));           // Stack: 2^X ...
  addptr(rsp,sizeof(jdouble));
}
3135 3135
// Raise the x87 precision-control field by OR-ing 0x300 into the control
// word.  The original control word is left at (rsp, 0) so
// restore_precision() can reload it.  Clobbers rax.
void MacroAssembler::increase_precision() {
  subptr(rsp, BytesPerWord);
  fnstcw(Address(rsp, 0));   // save current control word for restore_precision()
  movl(rax, Address(rsp, 0));
  orl(rax, 0x300);           // set the precision-control bits
  push(rax);
  fldcw(Address(rsp, 0));    // activate the modified control word
  pop(rax);
}
3145 3145
// Reload the FPU control word saved by increase_precision() and release
// its stack slot.  Must be paired with a preceding increase_precision().
void MacroAssembler::restore_precision() {
  fldcw(Address(rsp, 0));
  addptr(rsp, BytesPerWord);
}
3150 3150
// Compute X^Y with Y*log2(X) already implied by the operands on the FPU
// stack (fyl2x consumes ST0/ST1), then exponentiate via
// pow_exp_core_encoding() (which kills rax, rcx, rdx).
void MacroAssembler::fast_pow() {
  // computes X^Y = 2^(Y * log2(X))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  BLOCK_COMMENT("fast_pow {");
  increase_precision();
  fyl2x();                 // Stack: (Y*log2(X)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
  restore_precision();
  BLOCK_COMMENT("} fast_pow");
}
3163 3163
// Compute exp(X) for X on top of the FPU stack; result replaces X.
// Produces NaN when the fast path cannot compute the result, so the
// caller must provide a fallback.  Kills rax, rcx, rdx (via
// pow_exp_core_encoding).
void MacroAssembler::fast_exp() {
  // computes exp(X) = 2^(X * log2(e))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  increase_precision();
  fldl2e();                // Stack: log2(e) X ...
  fmulp(1);                // Stack: (X*log2(e)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
  restore_precision();
}
3175 3175
// Compute X^Y (is_exp == false; FPU stack on entry: X Y, X on top) or
// exp(X) (is_exp == true; stack: X) using the fast x87 sequences above.
// Falls back to SharedRuntime::dpow/dexp whenever the fast path yields
// NaN, or when X < 0 and Y is not an integer.  Result is left in F-TOS.
void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
  // kills rax, rcx, rdx
  // pow and exp needs 2 extra registers on the fpu stack.
  Label slow_case, done;
  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rdx,
    tmp = rdx;
  }
  Register tmp2 = rax;
  Register tmp3 = rcx;

  if (is_exp) {
    // Stack: X
    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
    fast_exp();                 // Stack: exp(X) X
    fcmp(tmp, 0, false, false); // Stack: exp(X) X
    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate argument. Stack: exp(X)
    if (num_fpu_regs_in_use > 0) {
      fxch();
      fpop();
    } else {
      ffree(1);
    }
    jmp(done);
  } else {
    // Stack: X Y
    Label x_negative, y_odd;

    fldz();                     // Stack: 0 X Y
    fcmp(tmp, 1, true, false);  // Stack: X Y
    jcc(Assembler::above, x_negative);

    // X >= 0

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fast_pow();                 // Stack: X^Y X Y
    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
    // X^Y not equal to itself: X^Y is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }
    jmp(done);

    // X <= 0
    bind(x_negative);

    fld_s(1);                   // Stack: Y X Y
    frndint();                  // Stack: int(Y) X Y
    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
    jcc(Assembler::notEqual, slow_case);

    subptr(rsp, 8);

    // For X^Y, when X < 0, Y has to be an integer and the final
    // result depends on whether it's odd or even. We just checked
    // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit
    // integer to test its parity. If int(Y) is huge and doesn't fit
    // in the 64 bit integer range, the integer indefinite value will
    // end up in the gp registers. Huge numbers are all even, the
    // integer indefinite number is even so it's fine.

#ifdef ASSERT
    // Let's check we don't end up with an integer indefinite number
    // when not expected. First test for huge numbers: check whether
    // int(Y)+1 == int(Y) which is true for very large numbers and
    // those are all even. A 64 bit integer is guaranteed to not
    // overflow for numbers where y+1 != y (when precision is set to
    // double precision).
    Label y_not_huge;

    fld1();                     // Stack: 1 int(Y) X Y
    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y

#ifdef _LP64
    // trip to memory to force the precision down from double extended
    // precision
    fstp_d(Address(rsp, 0));
    fld_d(Address(rsp, 0));
#endif

    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
#endif

    // move int(Y) as 64 bit integer to thread's stack
    fistp_d(Address(rsp,0));    // Stack: X Y

#ifdef ASSERT
    jcc(Assembler::notEqual, y_not_huge);

    // Y is huge so we know it's even. It may not fit in a 64 bit
    // integer and we don't want the debug code below to see the
    // integer indefinite value so overwrite int(Y) on the thread's
    // stack with 0.
    movl(Address(rsp, 0), 0);
    movl(Address(rsp, 4), 0);

    bind(y_not_huge);
#endif

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fabs();                     // Stack: abs(X) Y X Y
    fast_pow();                 // Stack: abs(X)^Y X Y
    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.

    pop(tmp2);
    NOT_LP64(pop(tmp3));
    jcc(Assembler::parity, slow_case);

#ifdef ASSERT
    // Check that int(Y) is not integer indefinite value (int
    // overflow). Shouldn't happen because for values that would
    // overflow, 1+int(Y)==Y which was tested earlier.
#ifndef _LP64
    {
      Label integer;
      testl(tmp2, tmp2);
      jcc(Assembler::notZero, integer);
      cmpl(tmp3, 0x80000000);
      jcc(Assembler::notZero, integer);
      STOP("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#else
    {
      Label integer;
      mov(tmp3, tmp2); // preserve tmp2 for parity check below
      shlq(tmp3, 1);
      jcc(Assembler::carryClear, integer);
      jcc(Assembler::notZero, integer);
      STOP("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#endif
#endif

    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }

    testl(tmp2, 1);
    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
    // X <= 0, Y odd: X^Y = -abs(X)^Y

    fchs();                     // Stack: -abs(X)^Y Y
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  fpop();                      // pop incorrect result or int(Y)

  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
                      is_exp ? 1 : 2, num_fpu_regs_in_use);

  // Come here with result in F-TOS
  bind(done);
}
3351 3351
// Pop the x87 register stack: free ST0, then advance the FPU stack top.
void MacroAssembler::fpop() {
  ffree();
  fincstp();
}
3356 3356
// Remainder of ST0 / ST1, looping on fprem until the FPU status word
// reports the reduction is complete.  ST1 is removed afterwards so the
// FPU stack does not grow across calls.  rax is preserved through tmp
// (or the stack) via save_rax/restore_rax.
void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
#ifdef _LP64
    testl(rax, 0x400);          // C2 set => partial remainder, iterate
    jcc(Assembler::notEqual, L);
#else
    sahf();
    jcc(Assembler::parity, L);  // C2 lands in PF after sahf on 32-bit
#endif // _LP64
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}
3378 3378
3379 3379
3380 3380 void MacroAssembler::incrementl(AddressLiteral dst) {
3381 3381 if (reachable(dst)) {
3382 3382 incrementl(as_Address(dst));
3383 3383 } else {
3384 3384 lea(rscratch1, dst);
3385 3385 incrementl(Address(rscratch1, 0));
3386 3386 }
3387 3387 }
3388 3388
3389 3389 void MacroAssembler::incrementl(ArrayAddress dst) {
3390 3390 incrementl(as_Address(dst));
3391 3391 }
3392 3392
3393 3393 void MacroAssembler::incrementl(Register reg, int value) {
3394 3394 if (value == min_jint) {addl(reg, value) ; return; }
3395 3395 if (value < 0) { decrementl(reg, -value); return; }
3396 3396 if (value == 0) { ; return; }
3397 3397 if (value == 1 && UseIncDec) { incl(reg) ; return; }
3398 3398 /* else */ { addl(reg, value) ; return; }
3399 3399 }
3400 3400
3401 3401 void MacroAssembler::incrementl(Address dst, int value) {
3402 3402 if (value == min_jint) {addl(dst, value) ; return; }
3403 3403 if (value < 0) { decrementl(dst, -value); return; }
3404 3404 if (value == 0) { ; return; }
3405 3405 if (value == 1 && UseIncDec) { incl(dst) ; return; }
3406 3406 /* else */ { addl(dst, value) ; return; }
3407 3407 }
3408 3408
3409 3409 void MacroAssembler::jump(AddressLiteral dst) {
3410 3410 if (reachable(dst)) {
3411 3411 jmp_literal(dst.target(), dst.rspec());
3412 3412 } else {
3413 3413 lea(rscratch1, dst);
3414 3414 jmp(rscratch1);
3415 3415 }
3416 3416 }
3417 3417
// Conditional jump to a literal destination.  When the target is
// reachable, hand-emit the shortest jcc encoding (8-bit displacement if
// there is no relocation and the offset fits, else 32-bit).  Otherwise
// materialize the address in rscratch1 and branch around an indirect
// jmp using the reversed condition.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_int8(0x70 | cc);
      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_int8(0x0F);
      emit_int8((unsigned char)(0x80 | cc));
      emit_int32(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}
3446 3446
3447 3447 void MacroAssembler::ldmxcsr(AddressLiteral src) {
3448 3448 if (reachable(src)) {
3449 3449 Assembler::ldmxcsr(as_Address(src));
3450 3450 } else {
3451 3451 lea(rscratch1, src);
3452 3452 Assembler::ldmxcsr(Address(rscratch1, 0));
3453 3453 }
3454 3454 }
3455 3455
// Sign-extending byte load: dst = (signed byte at src), widened to 32
// bits.  Returns the code offset of the emitted load instruction.  On
// 32-bit VMs that are not P6-class, emulate movsx with a zero-extending
// load followed by a shift pair.
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}
3468 3468
// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
// Sign-extending 16-bit load; returns the code offset of the first
// emitted load instruction.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    // Non-P6 32-bit path: zero-extend then shift to sign-extend.
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}
3488 3488
// Zero-extending byte load; returns the code offset of the load
// instruction itself (note: in the xorl path, the offset taken AFTER
// the xorl so it points at the movb).
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    // xorl first is only safe because src does not use dst here.
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}
3503 3503
// Note: load_unsigned_short used to be called load_unsigned_word.
// Zero-extending 16-bit load; returns the code offset of the load
// instruction itself.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    // xorl first is only safe because src does not use dst here.
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}
3519 3519
// Load a value of size_in_bytes (1, 2, 4 or 8) from src into dst,
// sign- or zero-extending sub-word sizes per is_signed.  On 32-bit VMs
// an 8-byte value is split: low word into dst, high word into dst2.
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case 8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst,  src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case 8:  movq(dst, src); break;
#endif
  case 4:  movl(dst, src); break;
  case 2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case 1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default: ShouldNotReachHere();
  }
}
3537 3537
// Store a value of size_in_bytes (1, 2, 4 or 8) from src to dst.  On
// 32-bit VMs an 8-byte value is written as two words: src (low) and
// src2 (high).
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case 8:
    assert(src2 != noreg, "second source register required");
    movl(dst,                        src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case 8:  movq(dst, src); break;
#endif
  case 4:  movl(dst, src); break;
  case 2:  movw(dst, src); break;
  case 1:  movb(dst, src); break;
  default: ShouldNotReachHere();
  }
}
3555 3555
3556 3556 void MacroAssembler::mov32(AddressLiteral dst, Register src) {
3557 3557 if (reachable(dst)) {
3558 3558 movl(as_Address(dst), src);
3559 3559 } else {
3560 3560 lea(rscratch1, dst);
3561 3561 movl(Address(rscratch1, 0), src);
3562 3562 }
3563 3563 }
3564 3564
3565 3565 void MacroAssembler::mov32(Register dst, AddressLiteral src) {
3566 3566 if (reachable(src)) {
3567 3567 movl(dst, as_Address(src));
3568 3568 } else {
3569 3569 lea(rscratch1, src);
3570 3570 movl(dst, Address(rscratch1, 0));
3571 3571 }
3572 3572 }
3573 3573
3574 3574 // C++ bool manipulation
3575 3575
3576 3576 void MacroAssembler::movbool(Register dst, Address src) {
3577 3577 if(sizeof(bool) == 1)
3578 3578 movb(dst, src);
3579 3579 else if(sizeof(bool) == 2)
3580 3580 movw(dst, src);
3581 3581 else if(sizeof(bool) == 4)
3582 3582 movl(dst, src);
3583 3583 else
3584 3584 // unsupported
3585 3585 ShouldNotReachHere();
3586 3586 }
3587 3587
3588 3588 void MacroAssembler::movbool(Address dst, bool boolconst) {
3589 3589 if(sizeof(bool) == 1)
3590 3590 movb(dst, (int) boolconst);
3591 3591 else if(sizeof(bool) == 2)
3592 3592 movw(dst, (int) boolconst);
3593 3593 else if(sizeof(bool) == 4)
3594 3594 movl(dst, (int) boolconst);
3595 3595 else
3596 3596 // unsupported
3597 3597 ShouldNotReachHere();
3598 3598 }
3599 3599
3600 3600 void MacroAssembler::movbool(Address dst, Register src) {
3601 3601 if(sizeof(bool) == 1)
3602 3602 movb(dst, src);
3603 3603 else if(sizeof(bool) == 2)
3604 3604 movw(dst, src);
3605 3605 else if(sizeof(bool) == 4)
3606 3606 movl(dst, src);
3607 3607 else
3608 3608 // unsupported
3609 3609 ShouldNotReachHere();
3610 3610 }
3611 3611
3612 3612 void MacroAssembler::movbyte(ArrayAddress dst, int src) {
3613 3613 movb(as_Address(dst), src);
3614 3614 }
3615 3615
3616 3616 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
3617 3617 if (reachable(src)) {
3618 3618 movdl(dst, as_Address(src));
3619 3619 } else {
3620 3620 lea(rscratch1, src);
3621 3621 movdl(dst, Address(rscratch1, 0));
3622 3622 }
3623 3623 }
3624 3624
3625 3625 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
3626 3626 if (reachable(src)) {
3627 3627 movq(dst, as_Address(src));
3628 3628 } else {
3629 3629 lea(rscratch1, src);
3630 3630 movq(dst, Address(rscratch1, 0));
3631 3631 }
3632 3632 }
3633 3633
3634 3634 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
3635 3635 if (reachable(src)) {
3636 3636 if (UseXmmLoadAndClearUpper) {
3637 3637 movsd (dst, as_Address(src));
3638 3638 } else {
3639 3639 movlpd(dst, as_Address(src));
3640 3640 }
3641 3641 } else {
3642 3642 lea(rscratch1, src);
3643 3643 if (UseXmmLoadAndClearUpper) {
3644 3644 movsd (dst, Address(rscratch1, 0));
3645 3645 } else {
3646 3646 movlpd(dst, Address(rscratch1, 0));
3647 3647 }
3648 3648 }
3649 3649 }
3650 3650
3651 3651 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
3652 3652 if (reachable(src)) {
3653 3653 movss(dst, as_Address(src));
3654 3654 } else {
3655 3655 lea(rscratch1, src);
3656 3656 movss(dst, Address(rscratch1, 0));
3657 3657 }
3658 3658 }
3659 3659
3660 3660 void MacroAssembler::movptr(Register dst, Register src) {
3661 3661 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3662 3662 }
3663 3663
3664 3664 void MacroAssembler::movptr(Register dst, Address src) {
3665 3665 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3666 3666 }
3667 3667
3668 3668 // src should NEVER be a real pointer. Use AddressLiteral for true pointers
3669 3669 void MacroAssembler::movptr(Register dst, intptr_t src) {
3670 3670 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
3671 3671 }
3672 3672
3673 3673 void MacroAssembler::movptr(Address dst, Register src) {
3674 3674 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
3675 3675 }
3676 3676
3677 3677 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
3678 3678 if (reachable(src)) {
3679 3679 Assembler::movdqu(dst, as_Address(src));
3680 3680 } else {
3681 3681 lea(rscratch1, src);
3682 3682 Assembler::movdqu(dst, Address(rscratch1, 0));
3683 3683 }
3684 3684 }
3685 3685
3686 3686 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
3687 3687 if (reachable(src)) {
3688 3688 Assembler::movdqa(dst, as_Address(src));
3689 3689 } else {
3690 3690 lea(rscratch1, src);
3691 3691 Assembler::movdqa(dst, Address(rscratch1, 0));
3692 3692 }
3693 3693 }
3694 3694
3695 3695 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
3696 3696 if (reachable(src)) {
3697 3697 Assembler::movsd(dst, as_Address(src));
3698 3698 } else {
3699 3699 lea(rscratch1, src);
3700 3700 Assembler::movsd(dst, Address(rscratch1, 0));
3701 3701 }
3702 3702 }
3703 3703
3704 3704 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
3705 3705 if (reachable(src)) {
3706 3706 Assembler::movss(dst, as_Address(src));
3707 3707 } else {
3708 3708 lea(rscratch1, src);
3709 3709 Assembler::movss(dst, Address(rscratch1, 0));
3710 3710 }
3711 3711 }
3712 3712
3713 3713 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
3714 3714 if (reachable(src)) {
3715 3715 Assembler::mulsd(dst, as_Address(src));
3716 3716 } else {
3717 3717 lea(rscratch1, src);
3718 3718 Assembler::mulsd(dst, Address(rscratch1, 0));
3719 3719 }
3720 3720 }
3721 3721
3722 3722 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
3723 3723 if (reachable(src)) {
3724 3724 Assembler::mulss(dst, as_Address(src));
3725 3725 } else {
3726 3726 lea(rscratch1, src);
3727 3727 Assembler::mulss(dst, Address(rscratch1, 0));
3728 3728 }
3729 3729 }
3730 3730
// Emit an explicit null check when the eventual access at 'offset'
// falls outside the OS-guarded page (needs_explicit_null_check);
// otherwise rely on the later access itself to fault.  The probe read
// changes no general registers (only condition codes).
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    //       may be shorter code (however, this version of
    //       testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}
3745 3745
// Emit a call to os::breakpoint() rather than an inline int3.
void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
3751 3751
// Restore FPU then integer state, in the reverse order of push_CPU_state().
void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}
3756 3756
// Restore the FPU state saved by push_FPU_state() (frstor on 32-bit,
// fxrstor on 64-bit) and pop its stack frame.
void MacroAssembler::pop_FPU_state() {
  NOT_LP64(frstor(Address(rsp, 0));)
  LP64_ONLY(fxrstor(Address(rsp, 0));)
  addptr(rsp, FPUStateSizeInWords * wordSize);
}
3762 3762
// Restore the integer registers and flags saved by push_IU_state().
void MacroAssembler::pop_IU_state() {
  popa();
  LP64_ONLY(addq(rsp, 8));  // drop the alignment slot pushed by push_IU_state
  popf();
}
3768 3768
// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
// Reversed by pop_CPU_state().
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}
3775 3775
// Save the full FPU state into a freshly-reserved stack area:
// fnsave on 32-bit, fxsave on 64-bit.  Reversed by pop_FPU_state().
void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
#endif // _LP64
}
3785 3785
// Save flags and all integer registers.  Reversed by pop_IU_state().
void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  LP64_ONLY(subq(rsp, 8));
  pusha();
}
3793 3793
// Clear the frame anchor in the JavaThread: last_Java_sp always, and
// optionally last_Java_fp / last_Java_pc.  When java_thread is not a
// valid register, rdi is clobbered to hold the current thread.
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // we must set sp to zero to clear frame
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  if (clear_fp) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc)
    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);

}
3810 3810
3811 3811 void MacroAssembler::restore_rax(Register tmp) {
3812 3812 if (tmp == noreg) pop(rax);
3813 3813 else if (tmp != rax) mov(rax, tmp);
3814 3814 }
3815 3815
// Round reg up to the next multiple of 'modulus'.
// NOTE(review): the andptr(reg, -modulus) mask assumes modulus is a
// power of two — confirm at call sites.
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}
3820 3820
3821 3821 void MacroAssembler::save_rax(Register tmp) {
3822 3822 if (tmp == noreg) push(rax);
3823 3823 else if (tmp != rax) mov(tmp, rax);
3824 3824 }
3825 3825
// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
// Clobbers tmp.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  movl(tmp, thread);
  shrl(tmp, os::get_serialize_page_shift_count());
  andl(tmp, (os::vm_page_size() - sizeof(int)));  // keep the offset inside the page

  Address index(noreg, tmp, Address::times_1);
  ExternalAddress page(os::get_memory_serialize_page());

  // Size of store must match masking code above
  movl(as_Address(ArrayAddress(page, index)), tmp);
}
3841 3841
// Calls to C land
//
// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
// last_Java_fp and last_java_pc are optional; last_Java_sp defaults to
// rsp and java_thread defaults to the current thread (clobbering rdi
// on the get_thread path).  last_Java_sp is stored LAST so the anchor
// only becomes walkable once the other fields are in place.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional

  if (last_java_fp->is_valid()) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }

  // last_java_pc is optional

  if (last_java_pc != NULL) {
    lea(Address(java_thread,
                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
        InternalAddress(last_java_pc));

  }
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
3877 3877
3878 3878 void MacroAssembler::shlptr(Register dst, int imm8) {
3879 3879 LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
3880 3880 }
3881 3881
3882 3882 void MacroAssembler::shrptr(Register dst, int imm8) {
3883 3883 LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
3884 3884 }
3885 3885
// Sign-extend the low byte of reg into the full 32-bit register.
// movsbl needs a byte-addressable register; otherwise (pre-P6 32-bit
// or no byte register) use the shift pair.
void MacroAssembler::sign_extend_byte(Register reg) {
  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
    movsbl(reg, reg); // movsxb
  } else {
    shll(reg, 24);
    sarl(reg, 24);
  }
}
3894 3894
// Sign-extend the low 16 bits of reg into the full 32-bit register,
// falling back to a shift pair on non-P6 32-bit CPUs.
void MacroAssembler::sign_extend_short(Register reg) {
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    movswl(reg, reg); // movsxw
  } else {
    shll(reg, 16);
    sarl(reg, 16);
  }
}
3903 3903
// testl against a literal address.  Unlike the other AddressLiteral
// wrappers there is no scratch-register fallback, so the target must
// be reachable.
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}
3908 3908
3909 3909 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
3910 3910 if (reachable(src)) {
3911 3911 Assembler::sqrtsd(dst, as_Address(src));
3912 3912 } else {
3913 3913 lea(rscratch1, src);
3914 3914 Assembler::sqrtsd(dst, Address(rscratch1, 0));
3915 3915 }
3916 3916 }
3917 3917
3918 3918 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
3919 3919 if (reachable(src)) {
3920 3920 Assembler::sqrtss(dst, as_Address(src));
3921 3921 } else {
3922 3922 lea(rscratch1, src);
3923 3923 Assembler::sqrtss(dst, Address(rscratch1, 0));
3924 3924 }
3925 3925 }
3926 3926
3927 3927 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
3928 3928 if (reachable(src)) {
3929 3929 Assembler::subsd(dst, as_Address(src));
3930 3930 } else {
3931 3931 lea(rscratch1, src);
3932 3932 Assembler::subsd(dst, Address(rscratch1, 0));
3933 3933 }
3934 3934 }
3935 3935
3936 3936 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
3937 3937 if (reachable(src)) {
3938 3938 Assembler::subss(dst, as_Address(src));
3939 3939 } else {
3940 3940 lea(rscratch1, src);
3941 3941 Assembler::subss(dst, Address(rscratch1, 0));
3942 3942 }
3943 3943 }
3944 3944
3945 3945 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
3946 3946 if (reachable(src)) {
3947 3947 Assembler::ucomisd(dst, as_Address(src));
3948 3948 } else {
3949 3949 lea(rscratch1, src);
3950 3950 Assembler::ucomisd(dst, Address(rscratch1, 0));
3951 3951 }
3952 3952 }
3953 3953
3954 3954 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
3955 3955 if (reachable(src)) {
3956 3956 Assembler::ucomiss(dst, as_Address(src));
3957 3957 } else {
3958 3958 lea(rscratch1, src);
3959 3959 Assembler::ucomiss(dst, Address(rscratch1, 0));
3960 3960 }
3961 3961 }
3962 3962
3963 3963 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
3964 3964 // Used in sign-bit flipping with aligned address.
3965 3965 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3966 3966 if (reachable(src)) {
3967 3967 Assembler::xorpd(dst, as_Address(src));
3968 3968 } else {
3969 3969 lea(rscratch1, src);
3970 3970 Assembler::xorpd(dst, Address(rscratch1, 0));
3971 3971 }
3972 3972 }
3973 3973
3974 3974 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
3975 3975 // Used in sign-bit flipping with aligned address.
3976 3976 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
3977 3977 if (reachable(src)) {
3978 3978 Assembler::xorps(dst, as_Address(src));
3979 3979 } else {
3980 3980 lea(rscratch1, src);
3981 3981 Assembler::xorps(dst, Address(rscratch1, 0));
3982 3982 }
3983 3983 }
3984 3984
3985 3985 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
3986 3986 // Used in sign-bit flipping with aligned address.
3987 3987 bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
3988 3988 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
3989 3989 if (reachable(src)) {
3990 3990 Assembler::pshufb(dst, as_Address(src));
3991 3991 } else {
3992 3992 lea(rscratch1, src);
3993 3993 Assembler::pshufb(dst, Address(rscratch1, 0));
3994 3994 }
3995 3995 }
3996 3996
3997 3997 // AVX 3-operands instructions
3998 3998
3999 3999 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4000 4000 if (reachable(src)) {
4001 4001 vaddsd(dst, nds, as_Address(src));
4002 4002 } else {
4003 4003 lea(rscratch1, src);
4004 4004 vaddsd(dst, nds, Address(rscratch1, 0));
4005 4005 }
4006 4006 }
4007 4007
4008 4008 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4009 4009 if (reachable(src)) {
4010 4010 vaddss(dst, nds, as_Address(src));
4011 4011 } else {
4012 4012 lea(rscratch1, src);
4013 4013 vaddss(dst, nds, Address(rscratch1, 0));
4014 4014 }
4015 4015 }
4016 4016
4017 4017 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4018 4018 if (reachable(src)) {
4019 4019 vandpd(dst, nds, as_Address(src), vector256);
4020 4020 } else {
4021 4021 lea(rscratch1, src);
4022 4022 vandpd(dst, nds, Address(rscratch1, 0), vector256);
4023 4023 }
4024 4024 }
4025 4025
4026 4026 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4027 4027 if (reachable(src)) {
4028 4028 vandps(dst, nds, as_Address(src), vector256);
4029 4029 } else {
4030 4030 lea(rscratch1, src);
4031 4031 vandps(dst, nds, Address(rscratch1, 0), vector256);
4032 4032 }
4033 4033 }
4034 4034
4035 4035 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4036 4036 if (reachable(src)) {
4037 4037 vdivsd(dst, nds, as_Address(src));
4038 4038 } else {
4039 4039 lea(rscratch1, src);
4040 4040 vdivsd(dst, nds, Address(rscratch1, 0));
4041 4041 }
4042 4042 }
4043 4043
4044 4044 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4045 4045 if (reachable(src)) {
4046 4046 vdivss(dst, nds, as_Address(src));
4047 4047 } else {
4048 4048 lea(rscratch1, src);
4049 4049 vdivss(dst, nds, Address(rscratch1, 0));
4050 4050 }
4051 4051 }
4052 4052
4053 4053 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4054 4054 if (reachable(src)) {
4055 4055 vmulsd(dst, nds, as_Address(src));
4056 4056 } else {
4057 4057 lea(rscratch1, src);
4058 4058 vmulsd(dst, nds, Address(rscratch1, 0));
4059 4059 }
4060 4060 }
4061 4061
4062 4062 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4063 4063 if (reachable(src)) {
4064 4064 vmulss(dst, nds, as_Address(src));
4065 4065 } else {
4066 4066 lea(rscratch1, src);
4067 4067 vmulss(dst, nds, Address(rscratch1, 0));
4068 4068 }
4069 4069 }
4070 4070
4071 4071 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4072 4072 if (reachable(src)) {
4073 4073 vsubsd(dst, nds, as_Address(src));
4074 4074 } else {
4075 4075 lea(rscratch1, src);
4076 4076 vsubsd(dst, nds, Address(rscratch1, 0));
4077 4077 }
4078 4078 }
4079 4079
4080 4080 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
4081 4081 if (reachable(src)) {
4082 4082 vsubss(dst, nds, as_Address(src));
4083 4083 } else {
4084 4084 lea(rscratch1, src);
4085 4085 vsubss(dst, nds, Address(rscratch1, 0));
4086 4086 }
4087 4087 }
4088 4088
4089 4089 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4090 4090 if (reachable(src)) {
4091 4091 vxorpd(dst, nds, as_Address(src), vector256);
4092 4092 } else {
4093 4093 lea(rscratch1, src);
4094 4094 vxorpd(dst, nds, Address(rscratch1, 0), vector256);
4095 4095 }
4096 4096 }
4097 4097
4098 4098 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, bool vector256) {
4099 4099 if (reachable(src)) {
4100 4100 vxorps(dst, nds, as_Address(src), vector256);
4101 4101 } else {
4102 4102 lea(rscratch1, src);
4103 4103 vxorps(dst, nds, Address(rscratch1, 0), vector256);
4104 4104 }
4105 4105 }
4106 4106
4107 4107
4108 4108 //////////////////////////////////////////////////////////////////////////////////
4109 4109 #if INCLUDE_ALL_GCS
4110 4110
// G1 SATB pre-barrier: if concurrent marking is active, record the field's
// previous value in the current thread's SATB mark queue. Fast path pushes
// pre_val into the thread-local buffer; when the buffer index is 0 (full)
// the slow path calls SharedRuntime::g1_wb_pre.
//
//   obj         - address of the field being overwritten; may be noreg when
//                 the caller has already loaded the old value into pre_val
//   pre_val     - previous field value (input, or loaded here from obj)
//   thread      - current thread register (must be r15_thread on 64-bit)
//   tmp         - scratch register
//   tosca_live  - rax holds a live value and must be preserved across the call
//   expand_call - expand call_VM_leaf inline (see comment below; needed when
//                 no full interpreter frame is on the stack)
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {

  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg) {
    assert_different_registers(obj, pre_val, tmp);
    assert(pre_val != rax, "check this code");
  }

  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));

  // Is marking active? Width of the 'active' flag is configuration-dependent.
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    cmpl(in_progress, 0);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    cmpb(in_progress, 0);
  }
  jcc(Assembler::equal, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null? Nothing to record then.
  cmpptr(pre_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)

  movptr(tmp, index);                   // tmp := *index_adr
  cmpptr(tmp, 0);                       // tmp == 0?
  jcc(Assembler::equal, runtime);       // If yes, goto runtime

  subptr(tmp, wordSize);                // tmp := tmp - wordSize
  movptr(index, tmp);                   // *index_adr := tmp
  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr

  // Record the previous value
  movptr(Address(tmp, 0), pre_val);
  jmp(done);

  bind(runtime);
  // save the live input values
  if(tosca_live) push(rax);

  if (obj != noreg && obj != rax)
    push(obj);

  if (pre_val != rax)
    push(pre_val);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we care generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then ebp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have have a full interpreter frame on the stack
  // expand_call should be passed true.

  NOT_LP64( push(thread); )

  if (expand_call) {
    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  NOT_LP64( pop(thread); )

  // restore the live input values (pushed above in reverse order)
  if (pre_val != rax)
    pop(pre_val);

  if (obj != noreg && obj != rax)
    pop(obj);

  if(tosca_live) pop(rax);

  bind(done);
}
4224 4224
// G1 post-barrier: after storing new_val at store_addr, dirty the covering
// card and enqueue its address on the thread's dirty card queue.
// Filtered out (no work): stores within a single heap region, stores of
// NULL, young cards, and cards that are already dirty. When the queue
// buffer is full (index == 0) the slow path calls SharedRuntime::g1_wb_post.
//
//   store_addr - address stored to
//   new_val    - value that was stored
//   thread     - current thread register (must be r15_thread on 64-bit)
//   tmp, tmp2  - scratch registers (tmp is reused as card_addr,
//                tmp2 as the card table base and then the buffer pointer)
void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
                                           Register thread,
                                           Register tmp,
                                           Register tmp2) {
#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));

  BarrierSet* bs = Universe::heap()->barrier_set();
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  Label done;
  Label runtime;

  // Does store cross heap regions? (xor + shift by region size leaves zero
  // iff both addresses are in the same region)

  movptr(tmp, store_addr);
  xorptr(tmp, new_val);
  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
  jcc(Assembler::equal, done);

  // crosses regions, storing NULL?

  cmpptr(new_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // storing region crossing non-NULL, is card already dirty?

  const Register card_addr = tmp;
  const Register cardtable = tmp2;

  movptr(card_addr, store_addr);
  shrptr(card_addr, CardTableModRefBS::card_shift);
  // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
  // a valid address and therefore is not properly handled by the relocation code.
  movptr(cardtable, (intptr_t)ct->byte_map_base);
  addptr(card_addr, cardtable);

  cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val());
  jcc(Assembler::equal, done);

  // StoreLoad fence before re-reading the card, then re-check for dirty.
  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
  cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
  jcc(Assembler::equal, done);


  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.

  movb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());

  cmpl(queue_index, 0);
  jcc(Assembler::equal, runtime);
  subl(queue_index, wordSize);
  movptr(tmp2, buffer);
#ifdef _LP64
  movslq(rscratch1, queue_index);
  addq(tmp2, rscratch1);
  movq(Address(tmp2, 0), card_addr);
#else
  addl(tmp2, queue_index);
  movl(Address(tmp2, 0), card_addr);
#endif
  jmp(done);

  bind(runtime);
  // save the live input values
  push(store_addr);
  push(new_val);
#ifdef _LP64
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
#else
  push(thread);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(thread);
#endif
  pop(new_val);
  pop(store_addr);

  bind(done);
}
4313 4313
4314 4314 #endif // INCLUDE_ALL_GCS
4315 4315 //////////////////////////////////////////////////////////////////////////////////
4316 4316
4317 4317
void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.
  // The operation is split into two parts so that other instructions can be
  // scheduled in between (see store_check_part_1/part_2).
  store_check_part_1(obj);
  store_check_part_2(obj);
}
4324 4324
// Store check taking the destination address as well; dst is currently
// unused and the check is performed on obj alone.
void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}
4328 4328
4329 4329
// split the store check operation so that other instructions can be scheduled inbetween
void MacroAssembler::store_check_part_1(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  // Turn the oop address into a card table index; obj is clobbered.
  shrptr(obj, CardTableModRefBS::card_shift);
}
4336 4336
// Second half of the store check: store a zero byte into the card table
// entry indexed by obj (already shifted by store_check_part_1).
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  // The calculation for byte_map_base is as follows:
  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
  // So this essentially converts an address to a displacement and it will
  // never need to be relocated. On 64bit however the value may be too
  // large for a 32bit displacement.
  intptr_t disp = (intptr_t) ct->byte_map_base;
  if (is_simm32(disp)) {
    // Base fits in a 32-bit displacement: one store with obj as index.
    Address cardtable(noreg, obj, Address::times_1, disp);
    movb(cardtable, 0);
  } else {
    // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
    // displacement and done in a single instruction given favorable mapping and a
    // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
    // entry and that entry is not properly handled by the relocation code.
    AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
    Address index(noreg, obj, Address::times_1);
    movb(as_Address(ArrayAddress(cardtable, index)), 0);
  }
}
4362 4362
// Pointer-sized subtract of an immediate: subq on 64-bit, subl on 32-bit.
void MacroAssembler::subptr(Register dst, int32_t imm32) {
  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
}
4366 4366
// Force generation of a 4 byte immediate value even if it fits into 8bit
// (keeps the instruction length fixed, e.g. for patchable code).
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
}
4371 4371
// Pointer-sized register subtract: subq on 64-bit, subl on 32-bit.
void MacroAssembler::subptr(Register dst, Register src) {
  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
}
4375 4375
4376 4376 // C++ bool manipulation
4377 4377 void MacroAssembler::testbool(Register dst) {
4378 4378 if(sizeof(bool) == 1)
4379 4379 testb(dst, 0xff);
4380 4380 else if(sizeof(bool) == 2) {
4381 4381 // testw implementation needed for two byte bools
4382 4382 ShouldNotReachHere();
4383 4383 } else if(sizeof(bool) == 4)
4384 4384 testl(dst, dst);
4385 4385 else
4386 4386 // unsupported
4387 4387 ShouldNotReachHere();
4388 4388 }
4389 4389
// Pointer-sized test: testq on 64-bit, testl on 32-bit.
void MacroAssembler::testptr(Register dst, Register src) {
  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
}
4393 4393
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Bump-pointer allocation from the current thread's TLAB: obj receives the
// old TLAB top, and top is advanced by var_size_in_bytes (when that register
// is valid) or by the constant con_size_in_bytes. Jumps to slow_case when
// the new top would exceed the TLAB end; top is only updated on success.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  assert_different_registers(obj, t1, t2);
  assert_different_registers(obj, var_size_in_bytes, t1);
  Register end = t2;
  Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread);

  verify_tlab();

  NOT_LP64(get_thread(thread));

  movptr(obj, Address(thread, JavaThread::tlab_top_offset()));
  // end := obj + allocation size
  if (var_size_in_bytes == noreg) {
    lea(end, Address(obj, con_size_in_bytes));
  } else {
    lea(end, Address(obj, var_size_in_bytes, Address::times_1));
  }
  // Would the allocation overflow the TLAB? Take the slow path.
  cmpptr(end, Address(thread, JavaThread::tlab_end_offset()));
  jcc(Assembler::above, slow_case);

  // update the tlab top pointer
  movptr(Address(thread, JavaThread::tlab_top_offset()), end);

  // recover var_size_in_bytes if necessary (end aliased it above)
  if (var_size_in_bytes == end) {
    subptr(var_size_in_bytes, obj);
  }
  verify_tlab();
}
4428 4428
// Preserves rbx, and rdx.
// Refill the current thread's TLAB from eden. If the space left in the old
// TLAB exceeds the thread's refill waste limit, the TLAB is retained and
// control goes to try_eden (allocate this object directly in eden);
// otherwise the old TLAB remainder is filled with a dummy int array, a new
// TLAB is allocated from eden, and control jumps to retry. slow_case is
// taken when inline contiguous eden allocation is unavailable or eden is
// exhausted. Returns the register holding the current thread.
Register MacroAssembler::tlab_refill(Label& retry,
                                     Label& try_eden,
                                     Label& slow_case) {
  Register top = rax;
  Register t1 = rcx;
  Register t2 = rsi;
  Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread);
  assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx);
  Label do_refill, discard_tlab;

  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    // No allocation in the shared eden.
    jmp(slow_case);
  }

  NOT_LP64(get_thread(thread_reg));

  movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));

  // calculate amount of free space (in heap words)
  subptr(t1, top);
  shrptr(t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.
  cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  jcc(Assembler::lessEqual, discard_tlab);

  // Retain
  // %%% yuck as movptr...
  movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
  addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2);
  if (TLABStats) {
    // increment number of slow_allocations
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1);
  }
  jmp(try_eden);

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1);
    // accumulate wastage -- t1 is amount free in tlab
    addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1);
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  testptr(top, top);
  jcc(Assembler::zero, do_refill);

  // set up the mark word
  movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
  // set the length to the remaining space
  subptr(t1, typeArrayOopDesc::header_size(T_INT));
  addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
  shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint)));
  movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1);
  // set klass to intArrayKlass
  // dubious reloc why not an oop reloc?
  movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
  // store klass last.  concurrent gcs assumes klass length is valid if
  // klass field is not null.
  store_klass(top, t1);

  // account the filler object's bytes against the thread's allocation counter
  movptr(t1, top);
  subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
  incr_allocated_bytes(thread_reg, t1, 0);

  // refill the tlab with an eden allocation
  bind(do_refill);
  movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
  shlptr(t1, LogHeapWordSize);
  // allocate new tlab, address returned in top
  eden_allocate(top, t1, 0, t2, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    Register tsize = rsi;
    assert_different_registers(tsize, thread_reg, t1);
    push(tsize);
    movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset())));
    shlptr(tsize, LogHeapWordSize);
    cmpptr(t1, tsize);
    jcc(Assembler::equal, ok);
    STOP("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
    pop(tsize);
  }
#endif
  // install the new TLAB: start = top = allocation, end = top + size - reserve
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top);
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top);
  addptr(top, t1);
  subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top);
  verify_tlab();
  jmp(retry);

  return thread_reg; // for use by caller
}
4535 4535
// Add the allocation size (var_size_in_bytes when valid, otherwise the
// constant con_size_in_bytes) to the thread's allocated_bytes counter.
// When thread is not a valid register it is resolved here (r15_thread on
// 64-bit; via get_thread into t1 on 32-bit). On 32-bit the 64-bit counter
// is updated with an addl/adcl pair.
void MacroAssembler::incr_allocated_bytes(Register thread,
                                          Register var_size_in_bytes,
                                          int con_size_in_bytes,
                                          Register t1) {
  if (!thread->is_valid()) {
#ifdef _LP64
    thread = r15_thread;
#else
    assert(t1->is_valid(), "need temp reg");
    thread = t1;
    get_thread(thread);
#endif
  }

#ifdef _LP64
  if (var_size_in_bytes->is_valid()) {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
#else
  if (var_size_in_bytes->is_valid()) {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes);
  } else {
    addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes);
  }
  // carry into the high word of the 64-bit counter
  adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
#endif
}
4565 4565
// Call a floating-point runtime routine (e.g. SharedRuntime::dsin) while
// preserving all GP registers (pusha/popa), the live XMM state (including
// YMM upper halves when MaxVectorSize > 16 under COMPILER2) and the x87
// FPU stack. nb_args arguments are taken from the x87 stack and spilled to
// memory (on 64-bit they are also moved to xmm0/xmm1 per the ABI); on
// 64-bit the xmm0 result is pushed back onto the x87 stack before the
// saved state is restored. num_fpu_regs_in_use is the number of live x87
// registers that must survive the call.
void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
  pusha();

  // if we are coming from c1, xmm registers may be live
  int off = 0;
  if (UseSSE == 1)  {
    // Spill xmm0-7 as singles.
    subptr(rsp, sizeof(jdouble)*8);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
  } else if (UseSSE >= 2)  {
#ifdef COMPILER2
    if (MaxVectorSize > 16) {
      assert(UseAVX > 0, "256bit vectors are supported only with AVX");
      // Save upper half of YMM registes
      subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
      vextractf128h(Address(rsp,  0),xmm0);
      vextractf128h(Address(rsp, 16),xmm1);
      vextractf128h(Address(rsp, 32),xmm2);
      vextractf128h(Address(rsp, 48),xmm3);
      vextractf128h(Address(rsp, 64),xmm4);
      vextractf128h(Address(rsp, 80),xmm5);
      vextractf128h(Address(rsp, 96),xmm6);
      vextractf128h(Address(rsp,112),xmm7);
#ifdef _LP64
      vextractf128h(Address(rsp,128),xmm8);
      vextractf128h(Address(rsp,144),xmm9);
      vextractf128h(Address(rsp,160),xmm10);
      vextractf128h(Address(rsp,176),xmm11);
      vextractf128h(Address(rsp,192),xmm12);
      vextractf128h(Address(rsp,208),xmm13);
      vextractf128h(Address(rsp,224),xmm14);
      vextractf128h(Address(rsp,240),xmm15);
#endif
    }
#endif
    // Save whole 128bit (16 bytes) XMM regiters
    subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
    movdqu(Address(rsp,off++*16),xmm0);
    movdqu(Address(rsp,off++*16),xmm1);
    movdqu(Address(rsp,off++*16),xmm2);
    movdqu(Address(rsp,off++*16),xmm3);
    movdqu(Address(rsp,off++*16),xmm4);
    movdqu(Address(rsp,off++*16),xmm5);
    movdqu(Address(rsp,off++*16),xmm6);
    movdqu(Address(rsp,off++*16),xmm7);
#ifdef _LP64
    movdqu(Address(rsp,off++*16),xmm8);
    movdqu(Address(rsp,off++*16),xmm9);
    movdqu(Address(rsp,off++*16),xmm10);
    movdqu(Address(rsp,off++*16),xmm11);
    movdqu(Address(rsp,off++*16),xmm12);
    movdqu(Address(rsp,off++*16),xmm13);
    movdqu(Address(rsp,off++*16),xmm14);
    movdqu(Address(rsp,off++*16),xmm15);
#endif
  }

  // Preserve registers across runtime call
  int incoming_argument_and_return_value_offset = -1;
  if (num_fpu_regs_in_use > 1) {
    // Must preserve all other FPU regs (could alternatively convert
    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
    // FPU state, but can not trust C compiler)
    NEEDS_CLEANUP;
    // NOTE that in this case we also push the incoming argument(s) to
    // the stack and restore it later; we also use this stack slot to
    // hold the return value from dsin, dcos etc.
    for (int i = 0; i < num_fpu_regs_in_use; i++) {
      subptr(rsp, sizeof(jdouble));
      fstp_d(Address(rsp, 0));
    }
    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
    // reload the arguments so they sit on top of the x87 stack again
    for (int i = nb_args-1; i >= 0; i--) {
      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
    }
  }

  // spill the nb_args arguments to the stack for the call
  subptr(rsp, nb_args*sizeof(jdouble));
  for (int i = 0; i < nb_args; i++) {
    fstp_d(Address(rsp, i*sizeof(jdouble)));
  }

#ifdef _LP64
  // 64-bit ABI passes FP args in xmm0/xmm1
  if (nb_args > 0) {
    movdbl(xmm0, Address(rsp, 0));
  }
  if (nb_args > 1) {
    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
  }
  assert(nb_args <= 2, "unsupported number of args");
#endif // _LP64

  // NOTE: we must not use call_VM_leaf here because that requires a
  // complete interpreter frame in debug mode -- same bug as 4387334
  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
  // do proper 64bit abi

  NEEDS_CLEANUP;
  // Need to add stack banging before this runtime call if it needs to
  // be taken; however, there is no generic stack banging routine at
  // the MacroAssembler level

  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);

#ifdef _LP64
  // move the xmm0 result back onto the x87 stack
  movsd(Address(rsp, 0), xmm0);
  fld_d(Address(rsp, 0));
#endif // _LP64
  addptr(rsp, sizeof(jdouble) * nb_args);
  if (num_fpu_regs_in_use > 1) {
    // Must save return value to stack and then restore entire FPU
    // stack except incoming arguments
    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
      fld_d(Address(rsp, 0));
      addptr(rsp, sizeof(jdouble));
    }
    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
    addptr(rsp, sizeof(jdouble) * nb_args);
  }

  // restore the XMM state saved above (mirror of the save sequence)
  off = 0;
  if (UseSSE == 1)  {
    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
    addptr(rsp, sizeof(jdouble)*8);
  } else if (UseSSE >= 2)  {
    // Restore whole 128bit (16 bytes) XMM regiters
    movdqu(xmm0, Address(rsp,off++*16));
    movdqu(xmm1, Address(rsp,off++*16));
    movdqu(xmm2, Address(rsp,off++*16));
    movdqu(xmm3, Address(rsp,off++*16));
    movdqu(xmm4, Address(rsp,off++*16));
    movdqu(xmm5, Address(rsp,off++*16));
    movdqu(xmm6, Address(rsp,off++*16));
    movdqu(xmm7, Address(rsp,off++*16));
#ifdef _LP64
    movdqu(xmm8, Address(rsp,off++*16));
    movdqu(xmm9, Address(rsp,off++*16));
    movdqu(xmm10, Address(rsp,off++*16));
    movdqu(xmm11, Address(rsp,off++*16));
    movdqu(xmm12, Address(rsp,off++*16));
    movdqu(xmm13, Address(rsp,off++*16));
    movdqu(xmm14, Address(rsp,off++*16));
    movdqu(xmm15, Address(rsp,off++*16));
#endif
    addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
#ifdef COMPILER2
    if (MaxVectorSize > 16) {
      // Restore upper half of YMM registes.
      vinsertf128h(xmm0, Address(rsp,  0));
      vinsertf128h(xmm1, Address(rsp, 16));
      vinsertf128h(xmm2, Address(rsp, 32));
      vinsertf128h(xmm3, Address(rsp, 48));
      vinsertf128h(xmm4, Address(rsp, 64));
      vinsertf128h(xmm5, Address(rsp, 80));
      vinsertf128h(xmm6, Address(rsp, 96));
      vinsertf128h(xmm7, Address(rsp,112));
#ifdef _LP64
      vinsertf128h(xmm8, Address(rsp,128));
      vinsertf128h(xmm9, Address(rsp,144));
      vinsertf128h(xmm10, Address(rsp,160));
      vinsertf128h(xmm11, Address(rsp,176));
      vinsertf128h(xmm12, Address(rsp,192));
      vinsertf128h(xmm13, Address(rsp,208));
      vinsertf128h(xmm14, Address(rsp,224));
      vinsertf128h(xmm15, Address(rsp,240));
#endif
      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
    }
#endif
  }
  popa();
}
4752 4752
// pi/4: trigfunc() compares |x| against this bound to decide whether the
// fsin/fcos/ftan fast path is usable.
static const double pi_4 = 0.7853981633974483;
4754 4754
// Emit code computing sin/cos/tan (trig = 's'/'c'/'t') of the value on the
// x87 top-of-stack; the result is left in F-TOS. Uses fsin/fcos/ftan
// directly when |x| <= pi/4, otherwise calls the corresponding
// SharedRuntime routine via fp_runtime_fallback, preserving
// num_fpu_regs_in_use live x87 registers across the call.
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rbx,
    tmp = rbx;
    push(tmp);
  }

  Label slow_case, done;

  ExternalAddress pi4_adr = (address)&pi_4;
  if (reachable(pi4_adr)) {
    // x ?<= pi/4
    fld_d(pi4_adr);
    fld_s(1);                // Stack:  X  PI/4  X
    fabs();                  // Stack: |X|  PI/4  X
    fcmp(tmp);
    jcc(Assembler::above, slow_case);

    // fastest case: -pi/4 <= x <= pi/4
    switch(trig) {
    case 's':
      fsin();
      break;
    case 'c':
      fcos();
      break;
    case 't':
      ftan();
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  switch(trig) {
  case 's':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
    }
    break;
  case 'c':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
    }
    break;
  case 't':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
    }
    break;
  default:
    assert(false, "bad intrinsic");
    break;
  }

  // Come here with result in F-TOS
  bind(done);

  if (tmp != noreg) {
    pop(tmp);
  }
}
4829 4829
4830 4830
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size() * wordSize;
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  // Load the vtable length so we can skip over the vtable to the itable.
  movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for InstanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The scan loop is emitted twice (peeled once): the peel==1 copy handles a
  // hit on the very first itable entry without taking the backward branch.
  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.  Load the itable offset of the matching entry, then fetch the
  // Method* from the corresponding itableMethodEntry.
  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
}
4904 4904
4905 4905
4906 4906 // virtual method calling
4907 4907 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4908 4908 RegisterOrConstant vtable_index,
4909 4909 Register method_result) {
4910 4910 const int base = InstanceKlass::vtable_start_offset() * wordSize;
4911 4911 assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
4912 4912 Address vtable_entry_addr(recv_klass,
4913 4913 vtable_index, Address::times_ptr,
4914 4914 base + vtableEntry::method_offset_in_bytes());
4915 4915 movptr(method_result, vtable_entry_addr);
4916 4916 }
4917 4917
4918 4918
4919 4919 void MacroAssembler::check_klass_subtype(Register sub_klass,
4920 4920 Register super_klass,
4921 4921 Register temp_reg,
4922 4922 Label& L_success) {
4923 4923 Label L_failure;
4924 4924 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
4925 4925 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
4926 4926 bind(L_failure);
4927 4927 }
4928 4928
4929 4929
// Fast-path part of a subtype check: decides via pointer equality and the
// supertype display.  At most one of L_success/L_failure/L_slow_path may be
// NULL, meaning "fall through".  Cases the display cannot decide (secondary
// supers, i.e. a hit on the super-cache slot) go to L_slow_path.  A
// super_check_offset of -1 means it must be loaded from super_klass at run
// time, which requires temp_reg.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  // -1 marker: the super-check offset is not known statically.
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  // NULL labels are redirected to the local fall-through point.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Offset known only at run time: compare it against the cache slot
    // offset to distinguish "definite miss" from "needs slow path".
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}
5031 5031
5032 5032
// Slow-path part of a subtype check: linearly scans sub_klass's
// secondary-supers array for super_klass using repne_scan.  On a hit the
// result is also stored into the secondary-super cache.  At most one of
// L_success/L_failure may be NULL, meaning "fall through".  If
// set_cond_codes, callers may rely on Z/NZ reflecting succeed/failure.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  // NULL labels are redirected to the local fall-through point.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax || UseCompressedOops) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Bump the partial-subtype-check statistics counter (non-product only).
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

  testptr(rax,rax); // Set Z = 0
  repne_scan();

  // Unspill the temp. registers:
  if (pushed_rdi) pop(rdi);
  if (pushed_rcx) pop(rcx);
  if (pushed_rax) pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-NULL");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  // Miss: Z clear after the scan (or forced clear for an empty array).
  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
5124 5124
5125 5125
5126 5126 void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
5127 5127 if (VM_Version::supports_cmov()) {
5128 5128 cmovl(cc, dst, src);
5129 5129 } else {
5130 5130 Label L;
5131 5131 jccb(negate_condition(cc), L);
5132 5132 movl(dst, src);
5133 5133 bind(L);
5134 5134 }
5135 5135 }
5136 5136
5137 5137 void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
5138 5138 if (VM_Version::supports_cmov()) {
5139 5139 cmovl(cc, dst, src);
5140 5140 } else {
5141 5141 Label L;
5142 5142 jccb(negate_condition(cc), L);
5143 5143 movl(dst, src);
5144 5144 bind(L);
5145 5145 }
5146 5146 }
5147 5147
// Debug check (enabled by -XX:+VerifyOops) that reg holds a valid oop.
// Pushes (oop, message) and calls the shared verify_oop subroutine; per the
// comments below, the callee pops the arguments and restores the saved
// registers, so the emitted sequence preserves all register state.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    // Build the failure message; code_string() keeps it alive beyond the
    // ResourceMark.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");
#ifdef _LP64
  push(rscratch1); // save r10, trashed by movptr()
#endif
  push(rax); // save rax,
  push(reg); // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}
5176 5176
5177 5177
// Materializes *delayed_value_addr (plus offset) as an operand.  If the
// delayed value is already resolved (non-zero) it is returned as a constant;
// otherwise code is emitted to load it into tmp at run time, with a debug
// check that it is non-zero by then.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  movptr(tmp, ExternalAddress((address) delayed_value_addr));

#ifdef ASSERT
  // Guard against running before the delayed value has been filled in:
  // STOP with a message in WizardMode, otherwise just halt.
  { Label L;
    testptr(tmp, tmp);
    if (WizardMode) {
      const char* buf = NULL;
      {
        ResourceMark rm;
        stringStream ss;
        ss.print("DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
        buf = code_string(ss.as_string());
      }
      jcc(Assembler::notZero, L);
      STOP(buf);
    } else {
      jccb(Assembler::notZero, L);
      hlt();
    }
    bind(L);
  }
#endif

  if (offset != 0)
    addptr(tmp, offset);

  return RegisterOrConstant(tmp);
}
5214 5214
5215 5215
5216 5216 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
5217 5217 int extra_slot_offset) {
5218 5218 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
5219 5219 int stackElementSize = Interpreter::stackElementSize;
5220 5220 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
5221 5221 #ifdef ASSERT
5222 5222 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
5223 5223 assert(offset1 - offset == stackElementSize, "correct arithmetic");
5224 5224 #endif
5225 5225 Register scale_reg = noreg;
5226 5226 Address::ScaleFactor scale_factor = Address::no_scale;
5227 5227 if (arg_slot.is_constant()) {
5228 5228 offset += arg_slot.as_constant() * stackElementSize;
5229 5229 } else {
5230 5230 scale_reg = arg_slot.as_register();
5231 5231 scale_factor = Address::times(stackElementSize);
5232 5232 }
5233 5233 offset += wordSize; // return PC is on stack
5234 5234 return Address(rsp, scale_reg, scale_factor, offset);
5235 5235 }
5236 5236
5237 5237
// Debug check (enabled by -XX:+VerifyOops) that the oop stored at addr is
// valid.  Like verify_oop(Register), but loads the oop from memory; if addr
// is rsp-relative it is rebased past the pushes performed here.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    // Build the failure message; code_string() keeps it alive beyond the
    // ResourceMark.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
#ifdef _LP64
  push(rscratch1); // save r10, trashed by movptr()
#endif
  push(rax); // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
5277 5277
// Debug-only check (requires UseTLAB and VerifyOops) that the current
// thread's TLAB pointers satisfy start <= top <= end; STOPs otherwise.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    // On 32 bit the thread pointer must be materialized into rbx; on LP64
    // it is permanently in r15.
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    // Check: tlab_top >= tlab_start (unsigned compare).
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    // Check: tlab_end >= tlab_top (unsigned compare).
    bind(next);
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}
5308 5308
5309 5309 class ControlWord {
5310 5310 public:
5311 5311 int32_t _value;
5312 5312
5313 5313 int rounding_control() const { return (_value >> 10) & 3 ; }
5314 5314 int precision_control() const { return (_value >> 8) & 3 ; }
5315 5315 bool precision() const { return ((_value >> 5) & 1) != 0; }
5316 5316 bool underflow() const { return ((_value >> 4) & 1) != 0; }
5317 5317 bool overflow() const { return ((_value >> 3) & 1) != 0; }
5318 5318 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
5319 5319 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
5320 5320 bool invalid() const { return ((_value >> 0) & 1) != 0; }
5321 5321
5322 5322 void print() const {
5323 5323 // rounding control
5324 5324 const char* rc;
5325 5325 switch (rounding_control()) {
5326 5326 case 0: rc = "round near"; break;
5327 5327 case 1: rc = "round down"; break;
5328 5328 case 2: rc = "round up "; break;
5329 5329 case 3: rc = "chop "; break;
5330 5330 };
5331 5331 // precision control
5332 5332 const char* pc;
5333 5333 switch (precision_control()) {
5334 5334 case 0: pc = "24 bits "; break;
5335 5335 case 1: pc = "reserved"; break;
5336 5336 case 2: pc = "53 bits "; break;
5337 5337 case 3: pc = "64 bits "; break;
5338 5338 };
5339 5339 // flags
5340 5340 char f[9];
5341 5341 f[0] = ' ';
5342 5342 f[1] = ' ';
5343 5343 f[2] = (precision ()) ? 'P' : 'p';
5344 5344 f[3] = (underflow ()) ? 'U' : 'u';
5345 5345 f[4] = (overflow ()) ? 'O' : 'o';
5346 5346 f[5] = (zero_divide ()) ? 'Z' : 'z';
5347 5347 f[6] = (denormalized()) ? 'D' : 'd';
5348 5348 f[7] = (invalid ()) ? 'I' : 'i';
5349 5349 f[8] = '\x0';
5350 5350 // output
5351 5351 printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
5352 5352 }
5353 5353
5354 5354 };
5355 5355
5356 5356 class StatusWord {
5357 5357 public:
5358 5358 int32_t _value;
5359 5359
5360 5360 bool busy() const { return ((_value >> 15) & 1) != 0; }
5361 5361 bool C3() const { return ((_value >> 14) & 1) != 0; }
5362 5362 bool C2() const { return ((_value >> 10) & 1) != 0; }
5363 5363 bool C1() const { return ((_value >> 9) & 1) != 0; }
5364 5364 bool C0() const { return ((_value >> 8) & 1) != 0; }
5365 5365 int top() const { return (_value >> 11) & 7 ; }
5366 5366 bool error_status() const { return ((_value >> 7) & 1) != 0; }
5367 5367 bool stack_fault() const { return ((_value >> 6) & 1) != 0; }
5368 5368 bool precision() const { return ((_value >> 5) & 1) != 0; }
5369 5369 bool underflow() const { return ((_value >> 4) & 1) != 0; }
5370 5370 bool overflow() const { return ((_value >> 3) & 1) != 0; }
5371 5371 bool zero_divide() const { return ((_value >> 2) & 1) != 0; }
5372 5372 bool denormalized() const { return ((_value >> 1) & 1) != 0; }
5373 5373 bool invalid() const { return ((_value >> 0) & 1) != 0; }
5374 5374
5375 5375 void print() const {
5376 5376 // condition codes
5377 5377 char c[5];
5378 5378 c[0] = (C3()) ? '3' : '-';
5379 5379 c[1] = (C2()) ? '2' : '-';
5380 5380 c[2] = (C1()) ? '1' : '-';
5381 5381 c[3] = (C0()) ? '0' : '-';
5382 5382 c[4] = '\x0';
5383 5383 // flags
5384 5384 char f[9];
5385 5385 f[0] = (error_status()) ? 'E' : '-';
5386 5386 f[1] = (stack_fault ()) ? 'S' : '-';
5387 5387 f[2] = (precision ()) ? 'P' : '-';
5388 5388 f[3] = (underflow ()) ? 'U' : '-';
5389 5389 f[4] = (overflow ()) ? 'O' : '-';
5390 5390 f[5] = (zero_divide ()) ? 'Z' : '-';
5391 5391 f[6] = (denormalized()) ? 'D' : '-';
5392 5392 f[7] = (invalid ()) ? 'I' : '-';
5393 5393 f[8] = '\x0';
5394 5394 // output
5395 5395 printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
5396 5396 }
5397 5397
5398 5398 };
5399 5399
5400 5400 class TagWord {
5401 5401 public:
5402 5402 int32_t _value;
5403 5403
5404 5404 int tag_at(int i) const { return (_value >> (i*2)) & 3; }
5405 5405
5406 5406 void print() const {
5407 5407 printf("%04x", _value & 0xFFFF);
5408 5408 }
5409 5409
5410 5410 };
5411 5411
5412 5412 class FPU_Register {
5413 5413 public:
5414 5414 int32_t _m0;
5415 5415 int32_t _m1;
5416 5416 int16_t _ex;
5417 5417
5418 5418 bool is_indefinite() const {
5419 5419 return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
5420 5420 }
5421 5421
5422 5422 void print() const {
5423 5423 char sign = (_ex < 0) ? '-' : '+';
5424 5424 const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : " ";
5425 5425 printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
5426 5426 };
5427 5427
5428 5428 };
5429 5429
// Snapshot of the full x87 FPU state (control/status/tag words, error and
// data pointers, and the eight 80-bit registers).
// NOTE(review): the field layout presumably matches the save image produced
// by push_CPU_state -- confirm against that code.
class FPU_State {
 public:
  enum {
    register_size       = 10,   // bytes per 80-bit x87 register
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord  _control_word;
  StatusWord   _status_word;
  TagWord      _tag_word;
  int32_t      _error_offset;
  int32_t      _error_selector;
  int32_t      _data_offset;
  int32_t      _data_selector;
  int8_t       _register[register_size * number_of_registers];

  // Tag for stack-relative register ST(i): maps through the TOP field of
  // the status word, modulo the register count.
  int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  // Raw 80-bit register slot i in the save area.
  FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }

  // Human-readable name for a two-bit tag value.
  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return NULL;
  }

  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        // j is the stack-relative index of physical register i.
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }

};
5479 5479
5480 5480 class Flag_Register {
5481 5481 public:
5482 5482 int32_t _value;
5483 5483
5484 5484 bool overflow() const { return ((_value >> 11) & 1) != 0; }
5485 5485 bool direction() const { return ((_value >> 10) & 1) != 0; }
5486 5486 bool sign() const { return ((_value >> 7) & 1) != 0; }
5487 5487 bool zero() const { return ((_value >> 6) & 1) != 0; }
5488 5488 bool auxiliary_carry() const { return ((_value >> 4) & 1) != 0; }
5489 5489 bool parity() const { return ((_value >> 2) & 1) != 0; }
5490 5490 bool carry() const { return ((_value >> 0) & 1) != 0; }
5491 5491
5492 5492 void print() const {
5493 5493 // flags
5494 5494 char f[8];
5495 5495 f[0] = (overflow ()) ? 'O' : '-';
5496 5496 f[1] = (direction ()) ? 'D' : '-';
5497 5497 f[2] = (sign ()) ? 'S' : '-';
5498 5498 f[3] = (zero ()) ? 'Z' : '-';
5499 5499 f[4] = (auxiliary_carry()) ? 'A' : '-';
5500 5500 f[5] = (parity ()) ? 'P' : '-';
5501 5501 f[6] = (carry ()) ? 'C' : '-';
5502 5502 f[7] = '\x0';
5503 5503 // output
5504 5504 printf("%08x flags = %s", _value, f);
5505 5505 }
5506 5506
5507 5507 };
5508 5508
// One 32-bit integer register value; printed as hex and signed decimal.
class IU_Register {
 public:
  int32_t _value;

  void print() const {
    printf("%08x %11d", _value, _value);
  }

};
5518 5518
// Integer-unit state: EFLAGS plus the eight general-purpose registers.
// NOTE(review): the field order presumably mirrors the save layout pushed
// by push_CPU_state (cf. print_CPU_state) -- confirm against that code.
class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register   _rdi;
  IU_Register   _rsi;
  IU_Register   _rbp;
  IU_Register   _rsp;
  IU_Register   _rbx;
  IU_Register   _rdx;
  IU_Register   _rcx;
  IU_Register   _rax;

  void print() const {
    // computation registers
    printf("rax, = "); _rax.print(); printf("\n");
    printf("rbx, = "); _rbx.print(); printf("\n");
    printf("rcx = "); _rcx.print(); printf("\n");
    printf("rdx = "); _rdx.print(); printf("\n");
    printf("rdi = "); _rdi.print(); printf("\n");
    printf("rsi = "); _rsi.print(); printf("\n");
    printf("rbp, = "); _rbp.print(); printf("\n");
    printf("rsp = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};
5546 5546
5547 5547
// Complete CPU snapshot (FPU state followed by integer state) as passed to
// the debug print/verify helpers below.
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }

};
5562 5562
5563 5563
// Runtime helper invoked from generated code (see print_CPU_state below):
// prints the saved CPU state.
static void _print_CPU_state(CPU_State* state) {
  state->print();
};
5567 5567
5568 5568
// Emits code that dumps the current CPU state: saves all registers with
// push_CPU_state, passes the save area (rsp) to _print_CPU_state, restores.
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);    // discard argument
  pop_CPU_state();
}
5576 5576
5577 5577
5578 5578 static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
5579 5579 static int counter = 0;
5580 5580 FPU_State* fs = &state->_fpu_state;
5581 5581 counter++;
5582 5582 // For leaf calls, only verify that the top few elements remain empty.
5583 5583 // We only need 1 empty at the top for C2 code.
5584 5584 if( stack_depth < 0 ) {
5585 5585 if( fs->tag_for_st(7) != 3 ) {
5586 5586 printf("FPR7 not empty\n");
5587 5587 state->print();
5588 5588 assert(false, "error");
5589 5589 return false;
5590 5590 }
5591 5591 return true; // All other stack states do not matter
5592 5592 }
5593 5593
5594 5594 assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
5595 5595 "bad FPU control word");
5596 5596
5597 5597 // compute stack depth
5598 5598 int i = 0;
5599 5599 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++;
5600 5600 int d = i;
5601 5601 while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
5602 5602 // verify findings
5603 5603 if (i != FPU_State::number_of_registers) {
5604 5604 // stack not contiguous
5605 5605 printf("%s: stack not contiguous at ST%d\n", s, i);
5606 5606 state->print();
5607 5607 assert(false, "error");
5608 5608 return false;
5609 5609 }
5610 5610 // check if computed stack depth corresponds to expected stack depth
5611 5611 if (stack_depth < 0) {
5612 5612 // expected stack depth is -stack_depth or less
5613 5613 if (d > -stack_depth) {
5614 5614 // too many elements on the stack
5615 5615 printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
5616 5616 state->print();
5617 5617 assert(false, "error");
5618 5618 return false;
5619 5619 }
5620 5620 } else {
5621 5621 // expected stack depth is stack_depth
5622 5622 if (d != stack_depth) {
5623 5623 // wrong stack depth
5624 5624 printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
5625 5625 state->print();
5626 5626 assert(false, "error");
5627 5627 return false;
5628 5628 }
5629 5629 }
5630 5630 // everything is cool
5631 5631 return true;
5632 5632 }
5633 5633
5634 5634
// Emits code (enabled by -XX:+VerifyFPU) that checks the FPU stack depth:
// saves the CPU state, calls _verify_FPU(stack_depth, s, state), and breaks
// into the debugger (int3) if the check reported failure.
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
5654 5654
// Bring the CPU control state (MXCSR, YMM upper halves, and on 32-bit the x87
// control word) back to the VM's expected defaults after a JNI call, or emit
// verification stubs instead when running with -Xcheck:jni.
void MacroAssembler::restore_cpu_control_state_after_jni() {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  if (VM_Version::supports_avx()) {
    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
    vzeroupper();
  }

#ifndef _LP64
  // Either restore the x87 floating pointer control word after returning
  // from the JNI call or verify that it wasn't changed.
  if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
  }
#endif // _LP64
}
5678 5678
5679 5679
// Load the klass pointer of the object in src into dst.
// On 64-bit with compressed class pointers, loads the 32-bit narrow klass
// and decodes it to a full Klass*.
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedClassPointers) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}
5689 5689
// Load the prototype mark word of src's klass into dst
// (used by biased locking to obtain the klass's prototype header).
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  movptr(dst, Address(dst, Klass::prototype_header_offset()));
}
5694 5694
// Store the Klass* in src into the object at dst.
// On 64-bit with compressed class pointers the klass is first encoded to its
// narrow form; note this clobbers src with the encoded value in that case.
void MacroAssembler::store_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}
5704 5704
// Load a (possibly compressed) heap oop from src into dst, decoding it to a
// full oop when compressed oops are in use. Handles NULL.
void MacroAssembler::load_heap_oop(Register dst, Address src) {
#ifdef _LP64
  // FIXME: Must change all places where we try to load the klass.
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop(dst);
  } else
#endif
    movptr(dst, src);
}
5715 5715
// Load a heap oop known to be non-NULL from src into dst.
// Doesn't do verification, generates fixed size code (callers rely on the
// emitted instruction count, e.g. unverified entry points).
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, src);
}
5726 5726
// Store the oop in src to the heap location dst, encoding it first when
// compressed oops are in use; note this clobbers src with the narrow oop in
// that case, so dst must not use src as a component register.
void MacroAssembler::store_heap_oop(Address dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    movl(dst, src);
  } else
#endif
    movptr(dst, src);
}
5737 5737
// Compare the full oop in src1 against the (possibly compressed) oop at src2.
// With compressed oops a scratch register is needed to decode src2; if the
// caller passes tmp == noreg, rax is pushed/popped around the compare.
void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
  assert_different_registers(src1, tmp);
#ifdef _LP64
  if (UseCompressedOops) {
    bool did_push = false;
    if (tmp == noreg) {
      tmp = rax;
      push(tmp);
      did_push = true;
      // src2 must not be rsp-relative, since we just moved rsp by pushing
      assert(!src2.uses(rsp), "can't push");
    }
    load_heap_oop(tmp, src2);
    cmpptr(src1, tmp);
    if (did_push)  pop(tmp);
  } else
#endif
    cmpptr(src1, src2);
}
5756 5756
// Used for storing NULLs only. With compressed oops a 32-bit zero is enough;
// otherwise the full 64-bit word is zeroed via movslq (sign-extended 32-bit
// immediate). On 32-bit a plain 32-bit store suffices.
void MacroAssembler::store_heap_oop_null(Address dst) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, (int32_t)NULL_WORD);
  } else {
    movslq(dst, (int32_t)NULL_WORD);
  }
#else
  movl(dst, (int32_t)NULL_WORD);
#endif
}
5769 5769
#ifdef _LP64
// Store src into the 32-bit klass gap of the object at dst. The gap only
// exists when compressed class pointers shrink the header; otherwise no-op.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}
5777 5777
#ifdef ASSERT
// Debug-only check that r12_heapbase still holds the current narrow-oop base;
// stops with msg if it was clobbered. Emitted only with -XX:+CheckCompressedOops.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    pop(rscratch1);
  }
}
#endif
5793 5793
// Compress the oop in r in place (NULL allowed).
// Algorithm must match oop.inline.hpp encode_heap_oop.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(r, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based mode: just shift (or nothing if unscaled).
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // Substitute heap base for NULL so the subtraction below yields 0,
  // preserving NULL == narrow NULL.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}
5812 5812
// Compress the oop in r in place; r must be non-NULL (no cmov needed, so
// shorter code than encode_heap_oop). Debug builds stop on a NULL input.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
5833 5833
// Two-register variant: compress the non-NULL oop in src into dst,
// leaving src unmodified (unless dst == src).
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (Universe::narrow_oop_base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
5857 5857
// Decompress the narrow oop in r in place (NULL allowed).
void MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // Zero-based mode: only the shift (if any) is needed.
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    shlq(r, LogMinObjAlignmentInBytes);
    // shlq set ZF if r was a narrow NULL; skip adding the heap base so
    // narrow NULL decodes to NULL.
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop(r, "broken oop in decode_heap_oop");
}
5876 5876
// Decompress the narrow oop in r in place; r must be non-NULL.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (Universe::narrow_oop_base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}
5894 5894
// Two-register variant: decompress the non-NULL narrow oop in src into dst.
// Uses a single leaq when the shift matches the times_8 scale factor.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // base + src*8 in one instruction (r12_heapbase is 0 in zero-based mode)
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (Universe::narrow_oop_base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}
5922 5922
// Compress the non-NULL Klass* in r in place. The klass base may differ from
// the oop base, so r12_heapbase is borrowed as scratch and restored afterwards
// via reinit_heapbase().
void MacroAssembler::encode_klass_not_null(Register r) {
  if (Universe::narrow_klass_base() != NULL) {
    // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
    assert(r != r12_heapbase, "Encoding a klass in r12");
    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    shrq(r, LogKlassAlignmentInBytes);
  }
  if (Universe::narrow_klass_base() != NULL) {
    reinit_heapbase();
  }
}
5938 5938
// Two-register variant: compress the non-NULL Klass* in src into dst.
// When dst != src, dst itself holds the (negated) base, so r12 is untouched.
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (dst == src) {
    encode_klass_not_null(src);
  } else {
    if (Universe::narrow_klass_base() != NULL) {
      // dst = src - base, computed as dst = (-base) + src
      mov64(dst, (int64_t)Universe::narrow_klass_base());
      negq(dst);
      addq(dst, src);
    } else {
      movptr(dst, src);
    }
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      shrq(dst, LogKlassAlignmentInBytes);
    }
  }
}
5956 5956
// Function instr_size_for_decode_klass_not_null() counts the instructions
// generated by decode_klass_not_null(register r) and reinit_heapbase(),
// when (Universe::heap() != NULL). Hence, if the instructions they
// generate change, then this method needs to be updated.
// Returns a byte count used for code-size budgeting (e.g. pd_code_size_limit).
int MacroAssembler::instr_size_for_decode_klass_not_null() {
  assert (UseCompressedClassPointers, "only for compressed klass ptrs");
  if (Universe::narrow_klass_base() != NULL) {
    // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
    return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
  } else {
    // longest load decode klass function, mov64, leaq
    return 16;
  }
}
5971 5971
// Decompress the non-NULL narrow klass in r in place.
// !!! If the instructions that get generated here change then function
// instr_size_for_decode_klass_not_null() needs to get updated.
void MacroAssembler::decode_klass_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert(r != r12_heapbase, "Decoding a klass in r12");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    shlq(r, LogKlassAlignmentInBytes);
  }
  // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
  if (Universe::narrow_klass_base() != NULL) {
    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
    addq(r, r12_heapbase);
    reinit_heapbase();   // restore r12 to the narrow-oop base
  }
}
5992 5992
// Two-register variant: decompress the non-NULL narrow klass in src into dst.
// When dst != src, dst holds the base so r12 is untouched.
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  if (dst == src) {
    decode_klass_not_null(dst);
  } else {
    // Cannot assert, unverified entry point counts instructions (see .ad file)
    // vtableStubs also counts instructions in pd_code_size_limit.
    // Also do not verify_oop as this is called by verify_oop.
    mov64(dst, (int64_t)Universe::narrow_klass_base());
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
      leaq(dst, Address(dst, src, Address::times_8, 0));
    } else {
      addq(dst, src);
    }
  }
}
6012 6012
// Load the narrow-oop encoding of obj into dst, recording an oop relocation
// so the GC can update the embedded value.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
6021 6021
// Store the narrow-oop encoding of obj to memory at dst, with relocation.
void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}
6030 6030
// Load the narrow-klass encoding of k into dst, with metadata relocation.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
}
6038 6038
// Store the narrow-klass encoding of k to memory at dst, with metadata relocation.
void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
}
6046 6046
// Compare the narrow oop in dst against the encoding of obj (relocated immediate).
void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
6055 6055
// Compare the narrow oop in memory at dst against the encoding of obj.
void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}
6064 6064
// Compare the narrow klass in dst against the encoding of k (relocated immediate).
void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
}
6072 6072
// Compare the narrow klass in memory at dst against the encoding of k.
void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
}
6080 6080
// Reload r12_heapbase with the narrow-oop/narrow-klass base. Before the heap
// exists the base must be read through its address (it may not be final yet).
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops || UseCompressedClassPointers) {
    if (Universe::heap() != NULL) {
      if (Universe::narrow_oop_base() == NULL) {
        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);  // zero-based mode
      } else {
        mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
      }
    } else {
      movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    }
  }
}
6094 6094
6095 6095 #endif // _LP64
6096 6096
6097 6097
// C2 compiled method's prolog code.
// Emits the stack-bang (if requested), pushes/saves rbp, allocates the frame,
// and optionally plants/validates stack-verification cookies.
//   framesize       - frame size in bytes, including the return address word
//   stack_bang_size - bytes to bang; <= 0 means the caller banged for us
//   fp_mode_24b     - (32-bit only) method requires 24-bit FPU precision mode
void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    // Check rsp is aligned to StackAlignmentInBytes minus the return-addr word.
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

}
6171 6171
// Zero-fill memory with rep stos.
//   cnt  - number of qwords (8-byte words) to clear
//   base - start address, qword aligned
//   tmp  - holds the fill value (zeroed here)
// Register choices are dictated by the rep stos instruction (rdi/rax/rcx).
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  assert(base==rdi, "base register must be edi for rep stos");
  assert(tmp==rax, "tmp register must be eax for rep stos");
  assert(cnt==rcx, "cnt register must be ecx for rep stos");

  xorptr(tmp, tmp);
  if (UseFastStosb) {
    shlptr(cnt,3); // convert to number of bytes
    rep_stosb();
  } else {
    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
    rep_stos();
  }
}
6188 6188
// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
// On exit, result holds the char index of the match in str1, or -1.
void MacroAssembler::string_indexofC8(Register str1, Register str2,
                                      Register cnt1, Register cnt2,
                                      int int_cnt2,  Register result,
                                      XMMRegister vec, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 is required");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= 8, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  movdqu(vec, Address(str2, 0));
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > 8) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movdqu(vec, Address(str2, 0));
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring

    addptr(result, 2);    // advance string pointer by one char (2 bytes)

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less then substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == 8) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == 8) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less then substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(EXIT);

  if (int_cnt2 > 8) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), 0x0d);
    // Reload only string if does not match
    jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, 8);

    bind(SCAN_SUBSTR);
    subl(cnt1, 8);
    cmpl(cnt2, -8); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, 8);
    movl(cnt2, 8);  negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2));
      pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      movdqu(vec, Address(str2, tmp, Address::times_2, 0));
      pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, 8);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // divide byte offset by 2 to get the char index
  bind(EXIT);

} // string_indexofC8
6340 6340
6341 6341 // Small strings are loaded through stack if they cross page boundary.
6342 6342 void MacroAssembler::string_indexof(Register str1, Register str2,
6343 6343 Register cnt1, Register cnt2,
6344 6344 int int_cnt2, Register result,
6345 6345 XMMRegister vec, Register tmp) {
6346 6346 ShortBranchVerifier sbv(this);
6347 6347 assert(UseSSE42Intrinsics, "SSE4.2 is required");
6348 6348 //
6349 6349 // int_cnt2 is length of small (< 8 chars) constant substring
6350 6350 // or (-1) for non constant substring in which case its length
6351 6351 // is in cnt2 register.
6352 6352 //
6353 6353 // Note, inline_string_indexOf() generates checks:
6354 6354 // if (substr.count > string.count) return -1;
6355 6355 // if (substr.count == 0) return 0;
6356 6356 //
6357 6357 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");
6358 6358
6359 6359 // This method uses pcmpestri inxtruction with bound registers
6360 6360 // inputs:
6361 6361 // xmm - substring
6362 6362 // rax - substring length (elements count)
6363 6363 // mem - scanned string
6364 6364 // rdx - string length (elements count)
6365 6365 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
6366 6366 // outputs:
6367 6367 // rcx - matched index in string
6368 6368 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6369 6369
6370 6370 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
6371 6371 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
6372 6372 FOUND_CANDIDATE;
6373 6373
6374 6374 { //========================================================
6375 6375 // We don't know where these strings are located
6376 6376 // and we can't read beyond them. Load them through stack.
6377 6377 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
6378 6378
6379 6379 movptr(tmp, rsp); // save old SP
6380 6380
6381 6381 if (int_cnt2 > 0) { // small (< 8 chars) constant substring
6382 6382 if (int_cnt2 == 1) { // One char
6383 6383 load_unsigned_short(result, Address(str2, 0));
6384 6384 movdl(vec, result); // move 32 bits
6385 6385 } else if (int_cnt2 == 2) { // Two chars
6386 6386 movdl(vec, Address(str2, 0)); // move 32 bits
6387 6387 } else if (int_cnt2 == 4) { // Four chars
6388 6388 movq(vec, Address(str2, 0)); // move 64 bits
6389 6389 } else { // cnt2 = { 3, 5, 6, 7 }
6390 6390 // Array header size is 12 bytes in 32-bit VM
6391 6391 // + 6 bytes for 3 chars == 18 bytes,
6392 6392 // enough space to load vec and shift.
6393 6393 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
6394 6394 movdqu(vec, Address(str2, (int_cnt2*2)-16));
6395 6395 psrldq(vec, 16-(int_cnt2*2));
6396 6396 }
6397 6397 } else { // not constant substring
6398 6398 cmpl(cnt2, 8);
6399 6399 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
6400 6400
6401 6401 // We can read beyond string if srt+16 does not cross page boundary
6402 6402 // since heaps are aligned and mapped by pages.
6403 6403 assert(os::vm_page_size() < (int)G, "default page should be small");
6404 6404 movl(result, str2); // We need only low 32 bits
6405 6405 andl(result, (os::vm_page_size()-1));
6406 6406 cmpl(result, (os::vm_page_size()-16));
6407 6407 jccb(Assembler::belowEqual, CHECK_STR);
6408 6408
6409 6409 // Move small strings to stack to allow load 16 bytes into vec.
6410 6410 subptr(rsp, 16);
6411 6411 int stk_offset = wordSize-2;
6412 6412 push(cnt2);
6413 6413
6414 6414 bind(COPY_SUBSTR);
6415 6415 load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
6416 6416 movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
6417 6417 decrement(cnt2);
6418 6418 jccb(Assembler::notZero, COPY_SUBSTR);
6419 6419
6420 6420 pop(cnt2);
6421 6421 movptr(str2, rsp); // New substring address
6422 6422 } // non constant
6423 6423
6424 6424 bind(CHECK_STR);
6425 6425 cmpl(cnt1, 8);
6426 6426 jccb(Assembler::aboveEqual, BIG_STRINGS);
6427 6427
6428 6428 // Check cross page boundary.
6429 6429 movl(result, str1); // We need only low 32 bits
6430 6430 andl(result, (os::vm_page_size()-1));
6431 6431 cmpl(result, (os::vm_page_size()-16));
6432 6432 jccb(Assembler::belowEqual, BIG_STRINGS);
6433 6433
6434 6434 subptr(rsp, 16);
6435 6435 int stk_offset = -2;
6436 6436 if (int_cnt2 < 0) { // not constant
6437 6437 push(cnt2);
6438 6438 stk_offset += wordSize;
6439 6439 }
6440 6440 movl(cnt2, cnt1);
6441 6441
6442 6442 bind(COPY_STR);
6443 6443 load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
6444 6444 movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
6445 6445 decrement(cnt2);
6446 6446 jccb(Assembler::notZero, COPY_STR);
6447 6447
6448 6448 if (int_cnt2 < 0) { // not constant
6449 6449 pop(cnt2);
6450 6450 }
6451 6451 movptr(str1, rsp); // New string address
6452 6452
6453 6453 bind(BIG_STRINGS);
6454 6454 // Load substring.
6455 6455 if (int_cnt2 < 0) { // -1
6456 6456 movdqu(vec, Address(str2, 0));
6457 6457 push(cnt2); // substr count
6458 6458 push(str2); // substr addr
6459 6459 push(str1); // string addr
6460 6460 } else {
6461 6461 // Small (< 8 chars) constant substrings are loaded already.
6462 6462 movl(cnt2, int_cnt2);
6463 6463 }
6464 6464 push(tmp); // original SP
6465 6465
6466 6466 } // Finished loading
6467 6467
6468 6468 //========================================================
6469 6469 // Start search
6470 6470 //
6471 6471
6472 6472 movptr(result, str1); // string addr
6473 6473
6474 6474 if (int_cnt2 < 0) { // Only for non constant substring
6475 6475 jmpb(SCAN_TO_SUBSTR);
6476 6476
6477 6477 // SP saved at sp+0
6478 6478 // String saved at sp+1*wordSize
6479 6479 // Substr saved at sp+2*wordSize
6480 6480 // Substr count saved at sp+3*wordSize
6481 6481
6482 6482 // Reload substr for rescan, this code
6483 6483 // is executed only for large substrings (> 8 chars)
6484 6484 bind(RELOAD_SUBSTR);
6485 6485 movptr(str2, Address(rsp, 2*wordSize));
6486 6486 movl(cnt2, Address(rsp, 3*wordSize));
6487 6487 movdqu(vec, Address(str2, 0));
6488 6488 // We came here after the beginning of the substring was
6489 6489 // matched but the rest of it was not so we need to search
6490 6490 // again. Start from the next element after the previous match.
6491 6491 subptr(str1, result); // Restore counter
6492 6492 shrl(str1, 1);
6493 6493 addl(cnt1, str1);
6494 6494 decrementl(cnt1); // Shift to next element
6495 6495 cmpl(cnt1, cnt2);
6496 6496 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring
6497 6497
6498 6498 addptr(result, 2);
6499 6499 } // non constant
6500 6500
6501 6501 // Scan string for start of substr in 16-byte vectors
6502 6502 bind(SCAN_TO_SUBSTR);
6503 6503 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
6504 6504 pcmpestri(vec, Address(result, 0), 0x0d);
6505 6505 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
6506 6506 subl(cnt1, 8);
6507 6507 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
6508 6508 cmpl(cnt1, cnt2);
6509 6509 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring
6510 6510 addptr(result, 16);
6511 6511
6512 6512 bind(ADJUST_STR);
6513 6513 cmpl(cnt1, 8); // Do not read beyond string
6514 6514 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
6515 6515 // Back-up string to avoid reading beyond string.
6516 6516 lea(result, Address(result, cnt1, Address::times_2, -16));
6517 6517 movl(cnt1, 8);
6518 6518 jmpb(SCAN_TO_SUBSTR);
6519 6519
6520 6520 // Found a potential substr
6521 6521 bind(FOUND_CANDIDATE);
6522 6522 // After pcmpestri tmp(rcx) contains matched element index
6523 6523
6524 6524 // Make sure string is still long enough
6525 6525 subl(cnt1, tmp);
6526 6526 cmpl(cnt1, cnt2);
6527 6527 jccb(Assembler::greaterEqual, FOUND_SUBSTR);
6528 6528 // Left less then substring.
6529 6529
6530 6530 bind(RET_NOT_FOUND);
6531 6531 movl(result, -1);
6532 6532 jmpb(CLEANUP);
6533 6533
6534 6534 bind(FOUND_SUBSTR);
6535 6535 // Compute start addr of substr
6536 6536 lea(result, Address(result, tmp, Address::times_2));
6537 6537
6538 6538 if (int_cnt2 > 0) { // Constant substring
6539 6539 // Repeat search for small substring (< 8 chars)
6540 6540 // from new point without reloading substring.
6541 6541 // Have to check that we don't read beyond string.
6542 6542 cmpl(tmp, 8-int_cnt2);
6543 6543 jccb(Assembler::greater, ADJUST_STR);
6544 6544 // Fall through if matched whole substring.
6545 6545 } else { // non constant
6546 6546 assert(int_cnt2 == -1, "should be != 0");
6547 6547
6548 6548 addl(tmp, cnt2);
6549 6549 // Found result if we matched whole substring.
6550 6550 cmpl(tmp, 8);
6551 6551 jccb(Assembler::lessEqual, RET_FOUND);
6552 6552
6553 6553 // Repeat search for small substring (<= 8 chars)
6554 6554 // from new point 'str1' without reloading substring.
6555 6555 cmpl(cnt2, 8);
6556 6556 // Have to check that we don't read beyond string.
6557 6557 jccb(Assembler::lessEqual, ADJUST_STR);
6558 6558
6559 6559 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
6560 6560 // Compare the rest of substring (> 8 chars).
6561 6561 movptr(str1, result);
6562 6562
6563 6563 cmpl(tmp, cnt2);
6564 6564 // First 8 chars are already matched.
6565 6565 jccb(Assembler::equal, CHECK_NEXT);
6566 6566
6567 6567 bind(SCAN_SUBSTR);
6568 6568 pcmpestri(vec, Address(str1, 0), 0x0d);
6569 6569 // Need to reload strings pointers if not matched whole vector
6570 6570 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
6571 6571
6572 6572 bind(CHECK_NEXT);
6573 6573 subl(cnt2, 8);
6574 6574 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
6575 6575 addptr(str1, 16);
6576 6576 addptr(str2, 16);
6577 6577 subl(cnt1, 8);
6578 6578 cmpl(cnt2, 8); // Do not read beyond substring
6579 6579 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
6580 6580 // Back-up strings to avoid reading beyond substring.
6581 6581 lea(str2, Address(str2, cnt2, Address::times_2, -16));
6582 6582 lea(str1, Address(str1, cnt2, Address::times_2, -16));
6583 6583 subl(cnt1, cnt2);
6584 6584 movl(cnt2, 8);
6585 6585 addl(cnt1, 8);
6586 6586 bind(CONT_SCAN_SUBSTR);
6587 6587 movdqu(vec, Address(str2, 0));
6588 6588 jmpb(SCAN_SUBSTR);
6589 6589
6590 6590 bind(RET_FOUND_LONG);
6591 6591 movptr(str1, Address(rsp, wordSize));
6592 6592 } // non constant
6593 6593
6594 6594 bind(RET_FOUND);
6595 6595 // Compute substr offset
6596 6596 subptr(result, str1);
6597 6597 shrl(result, 1); // index
6598 6598
6599 6599 bind(CLEANUP);
6600 6600 pop(rsp); // restore SP
6601 6601
6602 6602 } // string_indexof
6603 6603
6604 6604 // Compare strings.
          // Emits code that lexicographically compares two UTF-16 char sequences.
          //  Inputs : str1/str2 - addresses of the first characters,
          //           cnt1/cnt2 - element (char) counts of the two strings.
          //  Output : result    - <0 / 0 / >0 as str1 sorts before / equal to /
          //                       after str2 (difference of first mismatching
          //                       chars, or of the lengths when one is a prefix).
          // The signed length difference (cnt1 - cnt2) is pushed on the stack up
          // front; every exit path pops it (as the return value at
          // LENGTH_DIFF_LABEL, or discards it at POP_LABEL). vec1 is scratch.
          // NOTE(review): the pcmpestri-based paths require result==rax,
          // cnt2==rdx and cnt1==rcx (asserted below) because pcmpestri
          // implicitly reads rax/rdx as lengths and writes the index to rcx.
6605 6605 void MacroAssembler::string_compare(Register str1, Register str2,
6606 6606 Register cnt1, Register cnt2, Register result,
6607 6607 XMMRegister vec1) {
6608 6608 ShortBranchVerifier sbv(this);
6609 6609 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
6610 6610
6611 6611 // Compute the minimum of the string lengths and the
6612 6612 // difference of the string lengths (stack).
6613 6613 // Do the conditional move stuff
6614 6614 movl(result, cnt1);
6615 6615 subl(cnt1, cnt2);
6616 6616 push(cnt1);
6617 6617 cmov32(Assembler::lessEqual, cnt2, result);
6618 6618
6619 6619 // Is the minimum length zero?
6620 6620 testl(cnt2, cnt2);
6621 6621 jcc(Assembler::zero, LENGTH_DIFF_LABEL);
6622 6622
6623 6623 // Compare first characters
6624 6624 load_unsigned_short(result, Address(str1, 0));
6625 6625 load_unsigned_short(cnt1, Address(str2, 0));
6626 6626 subl(result, cnt1);
6627 6627 jcc(Assembler::notZero, POP_LABEL);
6628 6628 cmpl(cnt2, 1);
6629 6629 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6630 6630
6631 6631 // Check if the strings start at the same location.
6632 6632 cmpptr(str1, str2);
6633 6633 jcc(Assembler::equal, LENGTH_DIFF_LABEL);
6634 6634
6635 6635 Address::ScaleFactor scale = Address::times_2;
6636 6636 int stride = 8;
6637 6637
6638 6638 if (UseAVX >= 2 && UseSSE42Intrinsics) {
6639 6639 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
6640 6640 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
6641 6641 Label COMPARE_TAIL_LONG;
6642 6642 int pcmpmask = 0x19;
6643 6643
6644 6644 // Setup to compare 16-chars (32-bytes) vectors,
6645 6645 // start from first character again because it has aligned address.
6646 6646 int stride2 = 16;
6647 6647 int adr_stride = stride << scale;
6648 6648 int adr_stride2 = stride2 << scale;
6649 6649
6650 6650 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6651 6651 // rax and rdx are used by pcmpestri as elements counters
6652 6652 movl(result, cnt2);
6653 6653 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
6654 6654 jcc(Assembler::zero, COMPARE_TAIL_LONG);
6655 6655
6656 6656 // fast path : compare first 2 8-char vectors.
6657 6657 bind(COMPARE_16_CHARS);
6658 6658 movdqu(vec1, Address(str1, 0));
6659 6659 pcmpestri(vec1, Address(str2, 0), pcmpmask);
6660 6660 jccb(Assembler::below, COMPARE_INDEX_CHAR);
6661 6661
6662 6662 movdqu(vec1, Address(str1, adr_stride));
6663 6663 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
6664 6664 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
6665 6665 addl(cnt1, stride);
6666 6666
6667 6667 // Compare the characters at index in cnt1
6668 6668 bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
6669 6669 load_unsigned_short(result, Address(str1, cnt1, scale));
6670 6670 load_unsigned_short(cnt2, Address(str2, cnt1, scale));
6671 6671 subl(result, cnt2);
6672 6672 jmp(POP_LABEL);
6673 6673
6674 6674 // Setup the registers to start vector comparison loop
6675 6675 bind(COMPARE_WIDE_VECTORS);
6676 6676 lea(str1, Address(str1, result, scale));
6677 6677 lea(str2, Address(str2, result, scale));
6678 6678 subl(result, stride2);
6679 6679 subl(cnt2, stride2);
6680 6680 jccb(Assembler::zero, COMPARE_WIDE_TAIL);
6681 6681 negptr(result);
6682 6682
6683 6683 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
6684 6684 bind(COMPARE_WIDE_VECTORS_LOOP);
6685 6685 vmovdqu(vec1, Address(str1, result, scale));
6686 6686 vpxor(vec1, Address(str2, result, scale));
6687 6687 vptest(vec1, vec1);
6688 6688 jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
6689 6689 addptr(result, stride2);
6690 6690 subl(cnt2, stride2);
6691 6691 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
6692 6692 // clean upper bits of YMM registers
6693 6693 vzeroupper();
6694 6694
6695 6695 // compare wide vectors tail
6696 6696 bind(COMPARE_WIDE_TAIL);
6697 6697 testptr(result, result);
6698 6698 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6699 6699
6700 6700 movl(result, stride2);
6701 6701 movl(cnt2, result);
6702 6702 negptr(result);
6703 6703 jmpb(COMPARE_WIDE_VECTORS_LOOP);
6704 6704
6705 6705 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
6706 6706 bind(VECTOR_NOT_EQUAL);
6707 6707 // clean upper bits of YMM registers
6708 6708 vzeroupper();
6709 6709 lea(str1, Address(str1, result, scale));
6710 6710 lea(str2, Address(str2, result, scale));
6711 6711 jmp(COMPARE_16_CHARS);
6712 6712
6713 6713 // Compare tail chars, length between 1 to 15 chars
6714 6714 bind(COMPARE_TAIL_LONG);
6715 6715 movl(cnt2, result);
6716 6716 cmpl(cnt2, stride);
6717 6717 jccb(Assembler::less, COMPARE_SMALL_STR);
6718 6718
6719 6719 movdqu(vec1, Address(str1, 0));
6720 6720 pcmpestri(vec1, Address(str2, 0), pcmpmask);
6721 6721 jcc(Assembler::below, COMPARE_INDEX_CHAR);
6722 6722 subptr(cnt2, stride);
6723 6723 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6724 6724 lea(str1, Address(str1, result, scale));
6725 6725 lea(str2, Address(str2, result, scale));
6726 6726 negptr(cnt2);
6727 6727 jmpb(WHILE_HEAD_LABEL);
6728 6728
6729 6729 bind(COMPARE_SMALL_STR);
6730 6730 } else if (UseSSE42Intrinsics) {
6731 6731 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
6732 6732 int pcmpmask = 0x19;
6733 6733 // Setup to compare 8-char (16-byte) vectors,
6734 6734 // start from first character again because it has aligned address.
6735 6735 movl(result, cnt2);
6736 6736 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
6737 6737 jccb(Assembler::zero, COMPARE_TAIL);
6738 6738
6739 6739 lea(str1, Address(str1, result, scale));
6740 6740 lea(str2, Address(str2, result, scale));
6741 6741 negptr(result);
6742 6742
6743 6743 // pcmpestri
6744 6744 // inputs:
6745 6745 // vec1- substring
6746 6746 // rax - negative string length (elements count)
6747 6747 // mem - scanned string
6748 6748 // rdx - string length (elements count)
6749 6749 // pcmpmask - cmp mode: 11000 (string compare with negated result)
6750 6750 // + 00 (unsigned bytes) or + 01 (unsigned shorts)
6751 6751 // outputs:
6752 6752 // rcx - first mismatched element index
6753 6753 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
6754 6754
6755 6755 bind(COMPARE_WIDE_VECTORS);
6756 6756 movdqu(vec1, Address(str1, result, scale));
6757 6757 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6758 6758 // After pcmpestri cnt1(rcx) contains mismatched element index
6759 6759
6760 6760 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1
6761 6761 addptr(result, stride);
6762 6762 subptr(cnt2, stride);
6763 6763 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
6764 6764
6765 6765 // compare wide vectors tail
6766 6766 testptr(result, result);
6767 6767 jccb(Assembler::zero, LENGTH_DIFF_LABEL);
6768 6768
6769 6769 movl(cnt2, stride);
6770 6770 movl(result, stride);
6771 6771 negptr(result);
6772 6772 movdqu(vec1, Address(str1, result, scale));
6773 6773 pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
6774 6774 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);
6775 6775
6776 6776 // Mismatched characters in the vectors
6777 6777 bind(VECTOR_NOT_EQUAL);
6778 6778 addptr(cnt1, result);
6779 6779 load_unsigned_short(result, Address(str1, cnt1, scale));
6780 6780 load_unsigned_short(cnt2, Address(str2, cnt1, scale));
6781 6781 subl(result, cnt2);
6782 6782 jmpb(POP_LABEL);
6783 6783
6784 6784 bind(COMPARE_TAIL); // limit is zero
6785 6785 movl(cnt2, result);
6786 6786 // Fallthru to tail compare
6787 6787 }
6788 6788 // Shift str2 and str1 to the end of the arrays, negate min
6789 6789 lea(str1, Address(str1, cnt2, scale));
6790 6790 lea(str2, Address(str2, cnt2, scale));
6791 6791 decrementl(cnt2); // first character was compared already
6792 6792 negptr(cnt2);
6793 6793
6794 6794 // Compare the rest of the elements
6795 6795 bind(WHILE_HEAD_LABEL);
6796 6796 load_unsigned_short(result, Address(str1, cnt2, scale, 0));
6797 6797 load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
6798 6798 subl(result, cnt1);
6799 6799 jccb(Assembler::notZero, POP_LABEL);
6800 6800 increment(cnt2);
6801 6801 jccb(Assembler::notZero, WHILE_HEAD_LABEL);
6802 6802
6803 6803 // Strings are equal up to min length. Return the length difference.
6804 6804 bind(LENGTH_DIFF_LABEL);
6805 6805 pop(result);
6806 6806 jmpb(DONE_LABEL);
6807 6807
6808 6808 // Discard the stored length difference
6809 6809 bind(POP_LABEL);
6810 6810 pop(cnt1);
6811 6811
6812 6812 // That's it
6813 6813 bind(DONE_LABEL);
6814 6814 }
6815 6815
6816 6816 // Compare char[] arrays aligned to 4 bytes or substrings.
          // Emits code that tests two char sequences for element-wise equality.
          //  is_array_equ == true : ary1/ary2 are array oops; null checks, a
          //    length comparison (loaded into 'limit') and the base-offset
          //    adjustment are emitted here.
          //  is_array_equ == false: ary1/ary2 already point at the first char
          //    and 'limit' holds the element count (substring comparison).
          //  Output: result = 1 if equal, 0 otherwise. limit/chr/vec1/vec2 are
          //  scratch; ary1/ary2 are advanced and clobbered.
          // Strategy: widest available vector compare (32-byte AVX2 or 16-byte
          // SSE4.2) over the bulk, then 4-byte words, then the final 2-byte
          // char. The vector tails re-read the last (possibly overlapping)
          // vector from the end instead of looping, which is safe because the
          // whole range was validated to be in bounds.
6817 6817 void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
6818 6818 Register limit, Register result, Register chr,
6819 6819 XMMRegister vec1, XMMRegister vec2) {
6820 6820 ShortBranchVerifier sbv(this);
6821 6821 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;
6822 6822
6823 6823 int length_offset = arrayOopDesc::length_offset_in_bytes();
6824 6824 int base_offset = arrayOopDesc::base_offset_in_bytes(T_CHAR);
6825 6825
6826 6826 // Check the input args
6827 6827 cmpptr(ary1, ary2);
6828 6828 jcc(Assembler::equal, TRUE_LABEL);
6829 6829
6830 6830 if (is_array_equ) {
6831 6831 // Need additional checks for arrays_equals.
6832 6832 testptr(ary1, ary1);
6833 6833 jcc(Assembler::zero, FALSE_LABEL);
6834 6834 testptr(ary2, ary2);
6835 6835 jcc(Assembler::zero, FALSE_LABEL);
6836 6836
6837 6837 // Check the lengths
6838 6838 movl(limit, Address(ary1, length_offset));
6839 6839 cmpl(limit, Address(ary2, length_offset));
6840 6840 jcc(Assembler::notEqual, FALSE_LABEL);
6841 6841 }
6842 6842
6843 6843 // count == 0
6844 6844 testl(limit, limit);
6845 6845 jcc(Assembler::zero, TRUE_LABEL);
6846 6846
6847 6847 if (is_array_equ) {
6848 6848 // Load array address
6849 6849 lea(ary1, Address(ary1, base_offset));
6850 6850 lea(ary2, Address(ary2, base_offset));
6851 6851 }
6852 6852
6853 6853 shll(limit, 1); // byte count != 0
6854 6854 movl(result, limit); // copy
6855 6855
6856 6856 if (UseAVX >= 2) {
6857 6857 // With AVX2, use 32-byte vector compare
6858 6858 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6859 6859
6860 6860 // Compare 32-byte vectors
6861 6861 andl(result, 0x0000001e); // tail count (in bytes)
6862 6862 andl(limit, 0xffffffe0); // vector count (in bytes)
6863 6863 jccb(Assembler::zero, COMPARE_TAIL);
6864 6864
6865 6865 lea(ary1, Address(ary1, limit, Address::times_1));
6866 6866 lea(ary2, Address(ary2, limit, Address::times_1));
6867 6867 negptr(limit);
6868 6868
6869 6869 bind(COMPARE_WIDE_VECTORS);
6870 6870 vmovdqu(vec1, Address(ary1, limit, Address::times_1));
6871 6871 vmovdqu(vec2, Address(ary2, limit, Address::times_1));
6872 6872 vpxor(vec1, vec2);
6873 6873
6874 6874 vptest(vec1, vec1);
6875 6875 jccb(Assembler::notZero, FALSE_LABEL);
6876 6876 addptr(limit, 32);
6877 6877 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6878 6878
6879 6879 testl(result, result);
6880 6880 jccb(Assembler::zero, TRUE_LABEL);
6881 6881
6882 6882 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
6883 6883 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
6884 6884 vpxor(vec1, vec2);
6885 6885
6886 6886 vptest(vec1, vec1);
6887 6887 jccb(Assembler::notZero, FALSE_LABEL);
6888 6888 jmpb(TRUE_LABEL);
6889 6889
6890 6890 bind(COMPARE_TAIL); // limit is zero
6891 6891 movl(limit, result);
6892 6892 // Fallthru to tail compare
6893 6893 } else if (UseSSE42Intrinsics) {
6894 6894 // With SSE4.2, use double quad vector compare
6895 6895 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
6896 6896
6897 6897 // Compare 16-byte vectors
6898 6898 andl(result, 0x0000000e); // tail count (in bytes)
6899 6899 andl(limit, 0xfffffff0); // vector count (in bytes)
6900 6900 jccb(Assembler::zero, COMPARE_TAIL);
6901 6901
6902 6902 lea(ary1, Address(ary1, limit, Address::times_1));
6903 6903 lea(ary2, Address(ary2, limit, Address::times_1));
6904 6904 negptr(limit);
6905 6905
6906 6906 bind(COMPARE_WIDE_VECTORS);
6907 6907 movdqu(vec1, Address(ary1, limit, Address::times_1));
6908 6908 movdqu(vec2, Address(ary2, limit, Address::times_1));
6909 6909 pxor(vec1, vec2);
6910 6910
6911 6911 ptest(vec1, vec1);
6912 6912 jccb(Assembler::notZero, FALSE_LABEL);
6913 6913 addptr(limit, 16);
6914 6914 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
6915 6915
6916 6916 testl(result, result);
6917 6917 jccb(Assembler::zero, TRUE_LABEL);
6918 6918
6919 6919 movdqu(vec1, Address(ary1, result, Address::times_1, -16));
6920 6920 movdqu(vec2, Address(ary2, result, Address::times_1, -16));
6921 6921 pxor(vec1, vec2);
6922 6922
6923 6923 ptest(vec1, vec1);
6924 6924 jccb(Assembler::notZero, FALSE_LABEL);
6925 6925 jmpb(TRUE_LABEL);
6926 6926
6927 6927 bind(COMPARE_TAIL); // limit is zero
6928 6928 movl(limit, result);
6929 6929 // Fallthru to tail compare
6930 6930 }
6931 6931
6932 6932 // Compare 4-byte vectors
6933 6933 andl(limit, 0xfffffffc); // vector count (in bytes)
6934 6934 jccb(Assembler::zero, COMPARE_CHAR);
6935 6935
6936 6936 lea(ary1, Address(ary1, limit, Address::times_1));
6937 6937 lea(ary2, Address(ary2, limit, Address::times_1));
6938 6938 negptr(limit);
6939 6939
6940 6940 bind(COMPARE_VECTORS);
6941 6941 movl(chr, Address(ary1, limit, Address::times_1));
6942 6942 cmpl(chr, Address(ary2, limit, Address::times_1));
6943 6943 jccb(Assembler::notEqual, FALSE_LABEL);
6944 6944 addptr(limit, 4);
6945 6945 jcc(Assembler::notZero, COMPARE_VECTORS);
6946 6946
6947 6947 // Compare trailing char (final 2 bytes), if any
6948 6948 bind(COMPARE_CHAR);
6949 6949 testl(result, 0x2); // tail char
6950 6950 jccb(Assembler::zero, TRUE_LABEL);
6951 6951 load_unsigned_short(chr, Address(ary1, 0));
6952 6952 load_unsigned_short(limit, Address(ary2, 0));
6953 6953 cmpl(chr, limit);
6954 6954 jccb(Assembler::notEqual, FALSE_LABEL);
6955 6955
6956 6956 bind(TRUE_LABEL);
6957 6957 movl(result, 1); // return true
6958 6958 jmpb(DONE);
6959 6959
6960 6960 bind(FALSE_LABEL);
6961 6961 xorl(result, result); // return false
6962 6962
6963 6963 // That's it
6964 6964 bind(DONE);
6965 6965 if (UseAVX >= 2) {
6966 6966 // clean upper bits of YMM registers
6967 6967 vzeroupper();
6968 6968 }
6969 6969 }
6970 6970
          // Generate code that fills 'count' elements of primitive type 't'
          // (T_BYTE, T_SHORT or T_INT) starting at 'to' with 'value'.
          //  aligned - caller promises 'to' is suitably aligned, which lets the
          //            prologue skip the byte/word alignment steps.
          //  rtmp    - GPR scratch, xtmp - XMM scratch (SSE2+ paths only).
          // 'value' is first replicated across 32 bits so that every wider
          // store writes the same pattern; to/value/count are clobbered.
          // NOTE(review): 'shift' is log2(elements per 32-bit word) for t, so
          // 'n << shift' converts n words worth of data into an element count.
6971 6971 void MacroAssembler::generate_fill(BasicType t, bool aligned,
6972 6972 Register to, Register value, Register count,
6973 6973 Register rtmp, XMMRegister xtmp) {
6974 6974 ShortBranchVerifier sbv(this);
6975 6975 assert_different_registers(to, value, count, rtmp);
6976 6976 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
6977 6977 Label L_fill_2_bytes, L_fill_4_bytes;
6978 6978
6979 6979 int shift = -1;
6980 6980 switch (t) {
6981 6981 case T_BYTE:
6982 6982 shift = 2;
6983 6983 break;
6984 6984 case T_SHORT:
6985 6985 shift = 1;
6986 6986 break;
6987 6987 case T_INT:
6988 6988 shift = 0;
6989 6989 break;
6990 6990 default: ShouldNotReachHere();
6991 6991 }
6992 6992
          // Replicate the fill value across all 32 bits (byte -> x4, short -> x2).
6993 6993 if (t == T_BYTE) {
6994 6994 andl(value, 0xff);
6995 6995 movl(rtmp, value);
6996 6996 shll(rtmp, 8);
6997 6997 orl(value, rtmp);
6998 6998 }
6999 6999 if (t == T_SHORT) {
7000 7000 andl(value, 0xffff);
7001 7001 }
7002 7002 if (t == T_BYTE || t == T_SHORT) {
7003 7003 movl(rtmp, value);
7004 7004 shll(rtmp, 16);
7005 7005 orl(value, rtmp);
7006 7006 }
7007 7007
7008 7008 cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
7009 7009 jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
7010 7010 if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
7011 7011 // align source address at 4 bytes address boundary
7012 7012 if (t == T_BYTE) {
7013 7013 // One byte misalignment happens only for byte arrays
7014 7014 testptr(to, 1);
7015 7015 jccb(Assembler::zero, L_skip_align1);
7016 7016 movb(Address(to, 0), value);
7017 7017 increment(to);
7018 7018 decrement(count);
7019 7019 BIND(L_skip_align1);
7020 7020 }
7021 7021 // Two bytes misalignment happens only for byte and short (char) arrays
7022 7022 testptr(to, 2);
7023 7023 jccb(Assembler::zero, L_skip_align2);
7024 7024 movw(Address(to, 0), value);
7025 7025 addptr(to, 2);
7026 7026 subl(count, 1<<(shift-1));
7027 7027 BIND(L_skip_align2);
7028 7028 }
7029 7029 if (UseSSE < 2) {
          // Pre-SSE2 fallback: fill with plain 32-bit stores only.
7030 7030 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7031 7031 // Fill 32-byte chunks
7032 7032 subl(count, 8 << shift);
7033 7033 jcc(Assembler::less, L_check_fill_8_bytes);
7034 7034 align(16);
7035 7035
7036 7036 BIND(L_fill_32_bytes_loop);
7037 7037
7038 7038 for (int i = 0; i < 32; i += 4) {
7039 7039 movl(Address(to, i), value);
7040 7040 }
7041 7041
7042 7042 addptr(to, 32);
7043 7043 subl(count, 8 << shift);
7044 7044 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7045 7045 BIND(L_check_fill_8_bytes);
7046 7046 addl(count, 8 << shift);
7047 7047 jccb(Assembler::zero, L_exit);
7048 7048 jmpb(L_fill_8_bytes);
7049 7049
7050 7050 //
7051 7051 // length is too short, just fill qwords
7052 7052 //
7053 7053 BIND(L_fill_8_bytes_loop);
7054 7054 movl(Address(to, 0), value);
7055 7055 movl(Address(to, 4), value);
7056 7056 addptr(to, 8);
7057 7057 BIND(L_fill_8_bytes);
7058 7058 subl(count, 1 << (shift + 1));
7059 7059 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7060 7060 // fall through to fill 4 bytes
7061 7061 } else {
7062 7062 Label L_fill_32_bytes;
7063 7063 if (!UseUnalignedLoadStores) {
7064 7064 // align to 8 bytes, we know we are 4 byte aligned to start
7065 7065 testptr(to, 4);
7066 7066 jccb(Assembler::zero, L_fill_32_bytes);
7067 7067 movl(Address(to, 0), value);
7068 7068 addptr(to, 4);
7069 7069 subl(count, 1<<shift);
7070 7070 }
7071 7071 BIND(L_fill_32_bytes);
7072 7072 {
7073 7073 assert( UseSSE >= 2, "supported cpu only" );
7074 7074 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
7075 7075 movdl(xtmp, value);
7076 7076 if (UseAVX >= 2 && UseUnalignedLoadStores) {
7077 7077 // Fill 64-byte chunks
7078 7078 Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
7079 7079 vpbroadcastd(xtmp, xtmp);
7080 7080
7081 7081 subl(count, 16 << shift);
7082 7082 jcc(Assembler::less, L_check_fill_32_bytes);
7083 7083 align(16);
7084 7084
7085 7085 BIND(L_fill_64_bytes_loop);
7086 7086 vmovdqu(Address(to, 0), xtmp);
7087 7087 vmovdqu(Address(to, 32), xtmp);
7088 7088 addptr(to, 64);
7089 7089 subl(count, 16 << shift);
7090 7090 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
7091 7091
7092 7092 BIND(L_check_fill_32_bytes);
7093 7093 addl(count, 8 << shift);
7094 7094 jccb(Assembler::less, L_check_fill_8_bytes);
7095 7095 vmovdqu(Address(to, 0), xtmp);
7096 7096 addptr(to, 32);
7097 7097 subl(count, 8 << shift);
7098 7098
7099 7099 BIND(L_check_fill_8_bytes);
7100 7100 // clean upper bits of YMM registers
7101 7101 vzeroupper();
7102 7102 } else {
7103 7103 // Fill 32-byte chunks
7104 7104 pshufd(xtmp, xtmp, 0);
7105 7105
7106 7106 subl(count, 8 << shift);
7107 7107 jcc(Assembler::less, L_check_fill_8_bytes);
7108 7108 align(16);
7109 7109
7110 7110 BIND(L_fill_32_bytes_loop);
7111 7111
7112 7112 if (UseUnalignedLoadStores) {
7113 7113 movdqu(Address(to, 0), xtmp);
7114 7114 movdqu(Address(to, 16), xtmp);
7115 7115 } else {
7116 7116 movq(Address(to, 0), xtmp);
7117 7117 movq(Address(to, 8), xtmp);
7118 7118 movq(Address(to, 16), xtmp);
7119 7119 movq(Address(to, 24), xtmp);
7120 7120 }
7121 7121
7122 7122 addptr(to, 32);
7123 7123 subl(count, 8 << shift);
7124 7124 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
7125 7125
7126 7126 BIND(L_check_fill_8_bytes);
7127 7127 }
7128 7128 addl(count, 8 << shift);
7129 7129 jccb(Assembler::zero, L_exit);
7130 7130 jmpb(L_fill_8_bytes);
7131 7131
7132 7132 //
7133 7133 // length is too short, just fill qwords
7134 7134 //
7135 7135 BIND(L_fill_8_bytes_loop);
7136 7136 movq(Address(to, 0), xtmp);
7137 7137 addptr(to, 8);
7138 7138 BIND(L_fill_8_bytes);
7139 7139 subl(count, 1 << (shift + 1));
7140 7140 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
7141 7141 }
7142 7142 }
7143 7143 // fill trailing 4 bytes
7144 7144 BIND(L_fill_4_bytes);
7145 7145 testl(count, 1<<shift);
7146 7146 jccb(Assembler::zero, L_fill_2_bytes);
7147 7147 movl(Address(to, 0), value);
7148 7148 if (t == T_BYTE || t == T_SHORT) {
7149 7149 addptr(to, 4);
7150 7150 BIND(L_fill_2_bytes);
7151 7151 // fill trailing 2 bytes
7152 7152 testl(count, 1<<(shift-1));
7153 7153 jccb(Assembler::zero, L_fill_byte);
7154 7154 movw(Address(to, 0), value);
7155 7155 if (t == T_BYTE) {
7156 7156 addptr(to, 2);
7157 7157 BIND(L_fill_byte);
7158 7158 // fill trailing byte
7159 7159 testl(count, 1);
7160 7160 jccb(Assembler::zero, L_exit);
7161 7161 movb(Address(to, 0), value);
7162 7162 } else {
7163 7163 BIND(L_fill_byte);
7164 7164 }
7165 7165 } else {
7166 7166 BIND(L_fill_2_bytes);
7167 7167 }
7168 7168 BIND(L_exit);
7169 7169 }
7170 7170
7171 7171 // encode char[] to byte[] in ISO_8859_1
          // Narrows 'len' UTF-16 chars at 'src' to bytes at 'dst', as long as
          // every char fits in 8 bits (<= 0xFF).
          //  Output: result = number of chars actually encoded; equal to the
          //  original 'len' on full success, smaller if a char with a non-zero
          //  high byte was encountered (encoding stops there).
          // 'len' is negated and used as an index that counts up toward zero,
          // so all loads/stores address backward from the end of the arrays.
          // tmp1Reg holds the 0xff00-lane mask used by ptest/vptest to detect
          // chars that cannot be narrowed; tmp2-4 are vector scratch.
7172 7172 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
7173 7173 XMMRegister tmp1Reg, XMMRegister tmp2Reg,
7174 7174 XMMRegister tmp3Reg, XMMRegister tmp4Reg,
7175 7175 Register tmp5, Register result) {
7176 7176 // rsi: src
7177 7177 // rdi: dst
7178 7178 // rdx: len
7179 7179 // rcx: tmp5
7180 7180 // rax: result
7181 7181 ShortBranchVerifier sbv(this);
7182 7182 assert_different_registers(src, dst, len, tmp5, result);
7183 7183 Label L_done, L_copy_1_char, L_copy_1_char_exit;
7184 7184
7185 7185 // set result
7186 7186 xorl(result, result);
7187 7187 // check for zero length
7188 7188 testl(len, len);
7189 7189 jcc(Assembler::zero, L_done);
7190 7190 movl(result, len);
7191 7191
7192 7192 // Setup pointers
7193 7193 lea(src, Address(src, len, Address::times_2)); // char[]
7194 7194 lea(dst, Address(dst, len, Address::times_1)); // byte[]
7195 7195 negptr(len);
7196 7196
7197 7197 if (UseSSE42Intrinsics || UseAVX >= 2) {
7198 7198 Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
7199 7199 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;
7200 7200
7201 7201 if (UseAVX >= 2) {
7202 7202 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
7203 7203 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
7204 7204 movdl(tmp1Reg, tmp5);
7205 7205 vpbroadcastd(tmp1Reg, tmp1Reg);
7206 7206 jmpb(L_chars_32_check);
7207 7207
7208 7208 bind(L_copy_32_chars);
7209 7209 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
7210 7210 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
7211 7211 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7212 7212 vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7213 7213 jccb(Assembler::notZero, L_copy_32_chars_exit);
7214 7214 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector256 */ true);
7215 7215 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector256 */ true);
7216 7216 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);
7217 7217
7218 7218 bind(L_chars_32_check);
7219 7219 addptr(len, 32);
7220 7220 jccb(Assembler::lessEqual, L_copy_32_chars);
7221 7221
7222 7222 bind(L_copy_32_chars_exit);
7223 7223 subptr(len, 16);
7224 7224 jccb(Assembler::greater, L_copy_16_chars_exit);
7225 7225
7226 7226 } else if (UseSSE42Intrinsics) {
7227 7227 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector
7228 7228 movdl(tmp1Reg, tmp5);
7229 7229 pshufd(tmp1Reg, tmp1Reg, 0);
7230 7230 jmpb(L_chars_16_check);
7231 7231 }
7232 7232
7233 7233 bind(L_copy_16_chars);
7234 7234 if (UseAVX >= 2) {
7235 7235 vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
7236 7236 vptest(tmp2Reg, tmp1Reg);
7237 7237 jccb(Assembler::notZero, L_copy_16_chars_exit);
7238 7238 vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector256 */ true);
7239 7239 vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector256 */ true);
7240 7240 } else {
7241 7241 if (UseAVX > 0) {
7242 7242 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7243 7243 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7244 7244 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector256 */ false);
7245 7245 } else {
7246 7246 movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
7247 7247 por(tmp2Reg, tmp3Reg);
7248 7248 movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
7249 7249 por(tmp2Reg, tmp4Reg);
7250 7250 }
7251 7251 ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
7252 7252 jccb(Assembler::notZero, L_copy_16_chars_exit);
7253 7253 packuswb(tmp3Reg, tmp4Reg);
7254 7254 }
7255 7255 movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);
7256 7256
7257 7257 bind(L_chars_16_check);
7258 7258 addptr(len, 16);
7259 7259 jccb(Assembler::lessEqual, L_copy_16_chars);
7260 7260
7261 7261 bind(L_copy_16_chars_exit);
7262 7262 if (UseAVX >= 2) {
7263 7263 // clean upper bits of YMM registers
7264 7264 vzeroupper();
7265 7265 }
7266 7266 subptr(len, 8);
7267 7267 jccb(Assembler::greater, L_copy_8_chars_exit);
7268 7268
7269 7269 bind(L_copy_8_chars);
7270 7270 movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
7271 7271 ptest(tmp3Reg, tmp1Reg);
7272 7272 jccb(Assembler::notZero, L_copy_8_chars_exit);
7273 7273 packuswb(tmp3Reg, tmp1Reg);
7274 7274 movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
7275 7275 addptr(len, 8);
7276 7276 jccb(Assembler::lessEqual, L_copy_8_chars);
7277 7277
7278 7278 bind(L_copy_8_chars_exit);
7279 7279 subptr(len, 8);
7280 7280 jccb(Assembler::zero, L_done);
7281 7281 }
7282 7282
          // Scalar cleanup: encode remaining chars one at a time, stopping at
          // the first char whose high byte is non-zero.
7283 7283 bind(L_copy_1_char);
7284 7284 load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
7285 7285 testl(tmp5, 0xff00); // check if Unicode char
7286 7286 jccb(Assembler::notZero, L_copy_1_char_exit);
7287 7287 movb(Address(dst, len, Address::times_1, 0), tmp5);
7288 7288 addptr(len, 1);
7289 7289 jccb(Assembler::less, L_copy_1_char);
7290 7290
7291 7291 bind(L_copy_1_char_exit);
7292 7292 addptr(result, len); // len is negative count of not processed elements
7293 7293 bind(L_done);
7294 7294 }
7295 7295
7296 7296 #ifdef _LP64
7297 7297 /**
7298 7298 * Helper for multiply_to_len().
          *
          * Emits a 128-bit accumulate: dest_hi:dest_lo += src1 + src2.
          * Each 64-bit addq propagates its carry-out into dest_hi via adcq,
          * so the two adds must stay in exactly this order.
7299 7299 */
7300 7300 void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
7301 7301 addq(dest_lo, src1);
7302 7302 adcq(dest_hi, 0);
7303 7303 addq(dest_lo, src2);
7304 7304 adcq(dest_hi, 0);
7305 7305 }
7306 7306
7307 7307 /**
7308 7308 * Multiply 64 bit by 64 bit first loop.
          *
          * The operands are arrays of 32-bit big-endian words; pairs of words are
          * loaded as 64-bit values and byte-swapped with rorq(.,32) before use.
          * product must be rax because mulq produces rdx:rax.
          * L_one_x / L_one_y handle an odd leading word (single 32-bit load).
7309 7309 */
7310 7310 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
7311 7311 Register y, Register y_idx, Register z,
7312 7312 Register carry, Register product,
7313 7313 Register idx, Register kdx) {
7314 7314 //
7315 7315 // jlong carry, x[], y[], z[];
7316 7316 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7317 7317 // huge_128 product = y[idx] * x[xstart] + carry;
7318 7318 // z[kdx] = (jlong)product;
7319 7319 // carry = (jlong)(product >>> 64);
7320 7320 // }
7321 7321 // z[xstart] = carry;
7322 7322 //
7323 7323
7324 7324 Label L_first_loop, L_first_loop_exit;
7325 7325 Label L_one_x, L_one_y, L_multiply;
7326 7326
7327 7327 decrementl(xstart);
7328 7328 jcc(Assembler::negative, L_one_x);
7329 7329
7330 7330 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7331 7331 rorq(x_xstart, 32); // convert big-endian to little-endian
7332 7332
7333 7333 bind(L_first_loop);
7334 7334 decrementl(idx);
7335 7335 jcc(Assembler::negative, L_first_loop_exit);
7336 7336 decrementl(idx);
7337 7337 jcc(Assembler::negative, L_one_y);
7338 7338 movq(y_idx, Address(y, idx, Address::times_4, 0));
7339 7339 rorq(y_idx, 32); // convert big-endian to little-endian
7340 7340 bind(L_multiply);
7341 7341 movq(product, x_xstart);
7342 7342 mulq(y_idx); // product(rax) * y_idx -> rdx:rax
7343 7343 addq(product, carry);
7344 7344 adcq(rdx, 0);
7345 7345 subl(kdx, 2);
          // store the 64-bit product as two 32-bit big-endian words of z
7346 7346 movl(Address(z, kdx, Address::times_4, 4), product);
7347 7347 shrq(product, 32);
7348 7348 movl(Address(z, kdx, Address::times_4, 0), product);
7349 7349 movq(carry, rdx);
7350 7350 jmp(L_first_loop);
7351 7351
          // single remaining y word: zero-extended 32-bit load
7352 7352 bind(L_one_y);
7353 7353 movl(y_idx, Address(y, 0));
7354 7354 jmp(L_multiply);
7355 7355
          // single remaining x word: zero-extended 32-bit load
7356 7356 bind(L_one_x);
7357 7357 movl(x_xstart, Address(x, 0));
7358 7358 jmp(L_first_loop);
7359 7359
7360 7360 bind(L_first_loop_exit);
7361 7361 }
7362 7362
7363 7363 /**
7364 7364 * Multiply 64 bit by 64 bit and add 128 bit.
          *
          * Emits one step of the third loop: y[idx]*x_xstart + z[idx] + carry,
          * storing the low 64 bits back into z (as two big-endian 32-bit words)
          * and leaving the high 64 bits in rdx for the caller to pick up.
          * product must be rax (mulq); yz_idx is used as a scratch load register.
7365 7365 */
7366 7366 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
7367 7367 Register yz_idx, Register idx,
7368 7368 Register carry, Register product, int offset) {
7369 7369 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
7370 7370 // z[kdx] = (jlong)product;
7371 7371
7372 7372 movq(yz_idx, Address(y, idx, Address::times_4, offset));
7373 7373 rorq(yz_idx, 32); // convert big-endian to little-endian
7374 7374 movq(product, x_xstart);
7375 7375 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7376 7376 movq(yz_idx, Address(z, idx, Address::times_4, offset));
7377 7377 rorq(yz_idx, 32); // convert big-endian to little-endian
7378 7378
7379 7379 add2_with_carry(rdx, product, carry, yz_idx);
7380 7380
7381 7381 movl(Address(z, idx, Address::times_4, offset+4), product);
7382 7382 shrq(product, 32);
7383 7383 movl(Address(z, idx, Address::times_4, offset), product);
7384 7384
7385 7385 }
7386 7386
7387 7387 /**
7388 7388 * Multiply 128 bit by 128 bit. Unrolled inner loop.
          *
          * Processes four 32-bit words of y/z per iteration via two calls to
          * multiply_add_128_x_128, then handles the 0-3 leftover words.
          * The high half of each partial product is returned in rdx.
7389 7389 */
7390 7390 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
7391 7391 Register yz_idx, Register idx, Register jdx,
7392 7392 Register carry, Register product,
7393 7393 Register carry2) {
7394 7394 // jlong carry, x[], y[], z[];
7395 7395 // int kdx = ystart+1;
7396 7396 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7397 7397 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
7398 7398 // z[kdx+idx+1] = (jlong)product;
7399 7399 // jlong carry2 = (jlong)(product >>> 64);
7400 7400 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
7401 7401 // z[kdx+idx] = (jlong)product;
7402 7402 // carry = (jlong)(product >>> 64);
7403 7403 // }
7404 7404 // idx += 2;
7405 7405 // if (idx > 0) {
7406 7406 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
7407 7407 // z[kdx+idx] = (jlong)product;
7408 7408 // carry = (jlong)(product >>> 64);
7409 7409 // }
7410 7410 //
7411 7411
7412 7412 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7413 7413
          // jdx = idx / 4 (number of 4-word iterations)
7414 7414 movl(jdx, idx);
7415 7415 andl(jdx, 0xFFFFFFFC);
7416 7416 shrl(jdx, 2);
7417 7417
7418 7418 bind(L_third_loop);
7419 7419 subl(jdx, 1);
7420 7420 jcc(Assembler::negative, L_third_loop_exit);
7421 7421 subl(idx, 4);
7422 7422
7423 7423 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
7424 7424 movq(carry2, rdx);
7425 7425
7426 7426 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
7427 7427 movq(carry, rdx);
7428 7428 jmp(L_third_loop);
7429 7429
7430 7430 bind (L_third_loop_exit);
7431 7431
          // handle the 0-3 remaining 32-bit words
7432 7432 andl (idx, 0x3);
7433 7433 jcc(Assembler::zero, L_post_third_loop_done);
7434 7434
7435 7435 Label L_check_1;
7436 7436 subl(idx, 2);
7437 7437 jcc(Assembler::negative, L_check_1);
7438 7438
7439 7439 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
7440 7440 movq(carry, rdx);
7441 7441
7442 7442 bind (L_check_1);
7443 7443 addl (idx, 0x2);
7444 7444 andl (idx, 0x1);
7445 7445 subl(idx, 1);
7446 7446 jcc(Assembler::negative, L_post_third_loop_done);
7447 7447
          // final single 32-bit word: 32x64-bit multiply-accumulate
7448 7448 movl(yz_idx, Address(y, idx, Address::times_4, 0));
7449 7449 movq(product, x_xstart);
7450 7450 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
7451 7451 movl(yz_idx, Address(z, idx, Address::times_4, 0));
7452 7452
7453 7453 add2_with_carry(rdx, product, yz_idx, carry);
7454 7454
7455 7455 movl(Address(z, idx, Address::times_4, 0), product);
7456 7456 shrq(product, 32);
7457 7457
          // carry = high 32 bits of rdx:product pair
7458 7458 shlq(rdx, 32);
7459 7459 orq(product, rdx);
7460 7460 movq(carry, product);
7461 7461
7462 7462 bind(L_post_third_loop_done);
7463 7463 }
7464 7464
7465 7465 /**
7466 7466 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
7467 7467 *
          * The 64-bit multiplier is implicitly rdx (mulxq). On CPUs with ADX the
          * two accumulations run on independent carry chains: adcxq uses only CF
          * and adoxq uses only OF, so the chains interleave without clobbering
          * each other; otherwise add2_with_carry is used.
7468 7468 */
7469 7469 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
7470 7470 Register carry, Register carry2,
7471 7471 Register idx, Register jdx,
7472 7472 Register yz_idx1, Register yz_idx2,
7473 7473 Register tmp, Register tmp3, Register tmp4) {
7474 7474 assert(UseBMI2Instructions, "should be used only when BMI2 is available");
7475 7475
7476 7476 // jlong carry, x[], y[], z[];
7477 7477 // int kdx = ystart+1;
7478 7478 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
7479 7479 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
7480 7480 // jlong carry2 = (jlong)(tmp3 >>> 64);
7481 7481 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
7482 7482 // carry = (jlong)(tmp4 >>> 64);
7483 7483 // z[kdx+idx+1] = (jlong)tmp3;
7484 7484 // z[kdx+idx] = (jlong)tmp4;
7485 7485 // }
7486 7486 // idx += 2;
7487 7487 // if (idx > 0) {
7488 7488 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
7489 7489 // z[kdx+idx] = (jlong)yz_idx1;
7490 7490 // carry = (jlong)(yz_idx1 >>> 64);
7491 7491 // }
7492 7492 //
7493 7493
7494 7494 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
7495 7495
          // jdx = idx / 4 (number of 4-word iterations)
7496 7496 movl(jdx, idx);
7497 7497 andl(jdx, 0xFFFFFFFC);
7498 7498 shrl(jdx, 2);
7499 7499
7500 7500 bind(L_third_loop);
7501 7501 subl(jdx, 1);
7502 7502 jcc(Assembler::negative, L_third_loop_exit);
7503 7503 subl(idx, 4);
7504 7504
7505 7505 movq(yz_idx1, Address(y, idx, Address::times_4, 8));
7506 7506 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
7507 7507 movq(yz_idx2, Address(y, idx, Address::times_4, 0));
7508 7508 rorxq(yz_idx2, yz_idx2, 32);
7509 7509
7510 7510 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7511 7511 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
7512 7512
7513 7513 movq(yz_idx1, Address(z, idx, Address::times_4, 8));
7514 7514 rorxq(yz_idx1, yz_idx1, 32);
7515 7515 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7516 7516 rorxq(yz_idx2, yz_idx2, 32);
7517 7517
7518 7518 if (VM_Version::supports_adx()) {
          // CF chain (adcxq) and OF chain (adoxq) accumulate independently
7519 7519 adcxq(tmp3, carry);
7520 7520 adoxq(tmp3, yz_idx1);
7521 7521
7522 7522 adcxq(tmp4, tmp);
7523 7523 adoxq(tmp4, yz_idx2);
7524 7524
7525 7525 movl(carry, 0); // does not affect flags
7526 7526 adcxq(carry2, carry);
7527 7527 adoxq(carry2, carry);
7528 7528 } else {
7529 7529 add2_with_carry(tmp4, tmp3, carry, yz_idx1);
7530 7530 add2_with_carry(carry2, tmp4, tmp, yz_idx2);
7531 7531 }
7532 7532 movq(carry, carry2);
7533 7533
          // store both 64-bit results back as big-endian 32-bit word pairs
7534 7534 movl(Address(z, idx, Address::times_4, 12), tmp3);
7535 7535 shrq(tmp3, 32);
7536 7536 movl(Address(z, idx, Address::times_4, 8), tmp3);
7537 7537
7538 7538 movl(Address(z, idx, Address::times_4, 4), tmp4);
7539 7539 shrq(tmp4, 32);
7540 7540 movl(Address(z, idx, Address::times_4, 0), tmp4);
7541 7541
7542 7542 jmp(L_third_loop);
7543 7543
7544 7544 bind (L_third_loop_exit);
7545 7545
          // handle the 0-3 remaining 32-bit words
7546 7546 andl (idx, 0x3);
7547 7547 jcc(Assembler::zero, L_post_third_loop_done);
7548 7548
7549 7549 Label L_check_1;
7550 7550 subl(idx, 2);
7551 7551 jcc(Assembler::negative, L_check_1);
7552 7552
7553 7553 movq(yz_idx1, Address(y, idx, Address::times_4, 0));
7554 7554 rorxq(yz_idx1, yz_idx1, 32);
7555 7555 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
7556 7556 movq(yz_idx2, Address(z, idx, Address::times_4, 0));
7557 7557 rorxq(yz_idx2, yz_idx2, 32);
7558 7558
7559 7559 add2_with_carry(tmp4, tmp3, carry, yz_idx2);
7560 7560
7561 7561 movl(Address(z, idx, Address::times_4, 4), tmp3);
7562 7562 shrq(tmp3, 32);
7563 7563 movl(Address(z, idx, Address::times_4, 0), tmp3);
7564 7564 movq(carry, tmp4);
7565 7565
7566 7566 bind (L_check_1);
7567 7567 addl (idx, 0x2);
7568 7568 andl (idx, 0x1);
7569 7569 subl(idx, 1);
7570 7570 jcc(Assembler::negative, L_post_third_loop_done);
          // final single 32-bit word
7571 7571 movl(tmp4, Address(y, idx, Address::times_4, 0));
7572 7572 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
7573 7573 movl(tmp4, Address(z, idx, Address::times_4, 0));
7574 7574
7575 7575 add2_with_carry(carry2, tmp3, tmp4, carry);
7576 7576
7577 7577 movl(Address(z, idx, Address::times_4, 0), tmp3);
7578 7578 shrq(tmp3, 32);
7579 7579
          // carry = high 32 bits of carry2:tmp3 pair
7580 7580 shlq(carry2, 32);
7581 7581 orq(tmp3, carry2);
7582 7582 movq(carry, tmp3);
7583 7583
7584 7584 bind(L_post_third_loop_done);
7585 7585 }
7586 7586
7587 7587 /**
7588 7588 * Code for BigInteger::multiplyToLen() intrinsic.
7589 7589 *
          * Register assignment as set up by the stub generator:
7590 7590 * rdi: x
7591 7591 * rax: xlen
7592 7592 * rsi: y
7593 7593 * rcx: ylen
7594 7594 * r8: z
7595 7595 * r11: zlen
7596 7596 * r12: tmp1
7597 7597 * r13: tmp2
7598 7598 * r14: tmp3
7599 7599 * r15: tmp4
7600 7600 * rbx: tmp5
7601 7601 *
7602 7602 */
7603 7603 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
7604 7604 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
7605 7605 ShortBranchVerifier sbv(this);
7606 7606 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
7607 7607
          // save callee-saved temps and the length registers we are about to reuse
7608 7608 push(tmp1);
7609 7609 push(tmp2);
7610 7610 push(tmp3);
7611 7611 push(tmp4);
7612 7612 push(tmp5);
7613 7613
7614 7614 push(xlen);
7615 7615 push(zlen);
7616 7616
7617 7617 const Register idx = tmp1;
7618 7618 const Register kdx = tmp2;
7619 7619 const Register xstart = tmp3;
7620 7620
7621 7621 const Register y_idx = tmp4;
7622 7622 const Register carry = tmp5;
7623 7623 const Register product = xlen;
7624 7624 const Register x_xstart = zlen; // reuse register
7625 7625
7626 7626 // First Loop.
7627 7627 //
7628 7628 // final static long LONG_MASK = 0xffffffffL;
7629 7629 // int xstart = xlen - 1;
7630 7630 // int ystart = ylen - 1;
7631 7631 // long carry = 0;
7632 7632 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
7633 7633 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
7634 7634 // z[kdx] = (int)product;
7635 7635 // carry = product >>> 32;
7636 7636 // }
7637 7637 // z[xstart] = (int)carry;
7638 7638 //
7639 7639
7640 7640 movl(idx, ylen); // idx = ylen;
7641 7641 movl(kdx, zlen); // kdx = xlen+ylen;
7642 7642 xorq(carry, carry); // carry = 0;
7643 7643
7644 7644 Label L_done;
7645 7645
7646 7646 movl(xstart, xlen);
7647 7647 decrementl(xstart);
7648 7648 jcc(Assembler::negative, L_done);
7649 7649
7650 7650 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
7651 7651
          // store the leftover carry (one or two 32-bit words) at z[kdx]
7652 7652 Label L_second_loop;
7653 7653 testl(kdx, kdx);
7654 7654 jcc(Assembler::zero, L_second_loop);
7655 7655
7656 7656 Label L_carry;
7657 7657 subl(kdx, 1);
7658 7658 jcc(Assembler::zero, L_carry);
7659 7659
7660 7660 movl(Address(z, kdx, Address::times_4, 0), carry);
7661 7661 shrq(carry, 32);
7662 7662 subl(kdx, 1);
7663 7663
7664 7664 bind(L_carry);
7665 7665 movl(Address(z, kdx, Address::times_4, 0), carry);
7666 7666
7667 7667 // Second and third (nested) loops.
7668 7668 //
7669 7669 // for (int i = xstart-1; i >= 0; i--) { // Second loop
7670 7670 // carry = 0;
7671 7671 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
7672 7672 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
7673 7673 // (z[k] & LONG_MASK) + carry;
7674 7674 // z[k] = (int)product;
7675 7675 // carry = product >>> 32;
7676 7676 // }
7677 7677 // z[i] = (int)carry;
7678 7678 // }
7679 7679 //
7680 7680 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
7681 7681
7682 7682 const Register jdx = tmp1;
7683 7683
7684 7684 bind(L_second_loop);
7685 7685 xorl(carry, carry); // carry = 0;
7686 7686 movl(jdx, ylen); // j = ystart+1
7687 7687
7688 7688 subl(xstart, 1); // i = xstart-1;
7689 7689 jcc(Assembler::negative, L_done);
7690 7690
7691 7691 push (z);
7692 7692
7693 7693 Label L_last_x;
7694 7694 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
7695 7695 subl(xstart, 1); // i = xstart-1;
7696 7696 jcc(Assembler::negative, L_last_x);
7697 7697
          // load the next 64-bit multiplier; mulxq takes it implicitly in rdx
7698 7698 if (UseBMI2Instructions) {
7699 7699 movq(rdx, Address(x, xstart, Address::times_4, 0));
7700 7700 rorxq(rdx, rdx, 32); // convert big-endian to little-endian
7701 7701 } else {
7702 7702 movq(x_xstart, Address(x, xstart, Address::times_4, 0));
7703 7703 rorq(x_xstart, 32); // convert big-endian to little-endian
7704 7704 }
7705 7705
7706 7706 Label L_third_loop_prologue;
7707 7707 bind(L_third_loop_prologue);
7708 7708
7709 7709 push (x);
7710 7710 push (xstart);
7711 7711 push (ylen);
7712 7712
7713 7713
7714 7714 if (UseBMI2Instructions) {
7715 7715 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
7716 7716 } else { // !UseBMI2Instructions
7717 7717 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
7718 7718 }
7719 7719
          // note: the pushed xstart is restored into xlen (product reused xlen)
7720 7720 pop(ylen);
7721 7721 pop(xlen);
7722 7722 pop(x);
7723 7723 pop(z);
7724 7724
7725 7725 movl(tmp3, xlen);
7726 7726 addl(tmp3, 1);
7727 7727 movl(Address(z, tmp3, Address::times_4, 0), carry);
7728 7728 subl(tmp3, 1);
7729 7729 jccb(Assembler::negative, L_done);
7730 7730
7731 7731 shrq(carry, 32);
7732 7732 movl(Address(z, tmp3, Address::times_4, 0), carry);
7733 7733 jmp(L_second_loop);
7734 7734
7735 7735 // Next infrequent code is moved outside loops.
7736 7736 bind(L_last_x);
7737 7737 if (UseBMI2Instructions) {
7738 7738 movl(rdx, Address(x, 0));
7739 7739 } else {
7740 7740 movl(x_xstart, Address(x, 0));
7741 7741 }
7742 7742 jmp(L_third_loop_prologue);
7743 7743
7744 7744 bind(L_done);
7745 7745
7746 7746 pop(zlen);
7747 7747 pop(xlen);
7748 7748
7749 7749 pop(tmp5);
7750 7750 pop(tmp4);
7751 7751 pop(tmp3);
7752 7752 pop(tmp2);
7753 7753 pop(tmp1);
7754 7754 }
7755 7755 #endif
7756 7756
7757 7757 /**
7758 7758 * Emits code to update CRC-32 with a byte value according to constants in table
7759 7759 *
7760 7760 * @param [in,out]crc Register containing the crc.
7761 7761 * @param [in]val Register containing the byte to fold into the CRC.
7762 7762 * @param [in]table Register containing the table of crc constants.
7763 7763 *
          * Note: val is clobbered (xor'ed and masked to form the table index).
          *
7764 7764 * uint32_t crc;
7765 7765 * val = crc_table[(val ^ crc) & 0xFF];
7766 7766 * crc = val ^ (crc >> 8);
7767 7767 *
7768 7768 */
7769 7769 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
7770 7770 xorl(val, crc);
7771 7771 andl(val, 0xFF);
7772 7772 shrl(crc, 8); // unsigned shift
7773 7773 xorl(crc, Address(table, val, Address::times_4, 0));
7774 7774 }
7775 7775
7776 7776 /**
7777 7777 * Fold 128-bit data chunk
          *
          * xcrc = clmul_hi(xK, xcrc) ^ clmul_lo(xK, xcrc) ^ mem[buf+offset].
          * The AVX path uses non-destructive 3-operand forms; the SSE path
          * needs xtmp to preserve xcrc across the in-place multiplies.
7778 7778 */
7779 7779 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) {
7780 7780 if (UseAVX > 0) {
7781 7781 vpclmulhdq(xtmp, xK, xcrc); // [123:64]
7782 7782 vpclmulldq(xcrc, xK, xcrc); // [63:0]
7783 7783 vpxor(xcrc, xcrc, Address(buf, offset), false /* vector256 */);
7784 7784 pxor(xcrc, xtmp);
7785 7785 } else {
7786 7786 movdqa(xtmp, xcrc);
7787 7787 pclmulhdq(xtmp, xK); // [123:64]
7788 7788 pclmulldq(xcrc, xK); // [63:0]
7789 7789 pxor(xcrc, xtmp);
7790 7790 movdqu(xtmp, Address(buf, offset));
7791 7791 pxor(xcrc, xtmp);
7792 7792 }
7793 7793 }
7794 7794
          // Same 128-bit CRC fold as above, but the data chunk comes from
          // register xbuf instead of memory: xcrc = clmul(xK, xcrc) ^ xbuf.
7795 7795 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) {
7796 7796 if (UseAVX > 0) {
7797 7797 vpclmulhdq(xtmp, xK, xcrc);
7798 7798 vpclmulldq(xcrc, xK, xcrc);
7799 7799 pxor(xcrc, xbuf);
7800 7800 pxor(xcrc, xtmp);
7801 7801 } else {
7802 7802 movdqa(xtmp, xcrc);
7803 7803 pclmulhdq(xtmp, xK);
7804 7804 pclmulldq(xcrc, xK);
7805 7805 pxor(xcrc, xbuf);
7806 7806 pxor(xcrc, xtmp);
7807 7807 }
7808 7808 }
7809 7809
7810 7810 /**
7811 7811 * 8-bit folds to compute 32-bit CRC
7812 7812 *
7813 7813 * uint64_t xcrc;
7814 7814 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8);
          *
          * tmp and xtmp are scratch; xcrc is shifted right one byte and xor'ed
          * with the table entry selected by its former low byte.
7815 7815 */
7816 7816 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) {
7817 7817 movdl(tmp, xcrc);
7818 7818 andl(tmp, 0xFF);
7819 7819 movdl(xtmp, Address(table, tmp, Address::times_4, 0));
7820 7820 psrldq(xcrc, 1); // unsigned shift one byte
7821 7821 pxor(xcrc, xtmp);
7822 7822 }
7823 7823
7824 7824 /**
          * General-register variant of the 8-bit CRC fold; tmp is scratch.
          *
7825 7825 * uint32_t crc;
7826 7826 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8);
7827 7827 */
7828 7828 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
7829 7829 movl(tmp, crc);
7830 7830 andl(tmp, 0xFF);
7831 7831 shrl(crc, 8);
7832 7832 xorl(crc, Address(table, tmp, Address::times_4, 0));
7833 7833 }
7834 7834
7835 7835 /**
          * Emits the CRC-32 kernel: crc is bit-inverted on entry and again on
          * exit (notl), per the CRC-32 convention. Bulk data is folded 512/128
          * bits at a time with carry-less multiplies; leading bytes up to
          * 16-byte alignment and trailing bytes go through the byte table.
          * Clobbers rax and xmm0-xmm5.
          *
7836 7836 * @param crc register containing existing CRC (32-bit)
7837 7837 * @param buf register pointing to input byte buffer (byte*)
7838 7838 * @param len register containing number of bytes
7839 7839 * @param table register that will contain address of CRC table
7840 7840 * @param tmp scratch register
7841 7841 */
7842 7842 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) {
7843 7843 assert_different_registers(crc, buf, len, table, tmp, rax);
7844 7844
7845 7845 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
7846 7846 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
7847 7847
7848 7848 lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
7849 7849 notl(crc); // ~crc
7850 7850 cmpl(len, 16);
7851 7851 jcc(Assembler::less, L_tail);
7852 7852
7853 7853 // Align buffer to 16 bytes
7854 7854 movl(tmp, buf);
7855 7855 andl(tmp, 0xF);
7856 7856 jccb(Assembler::zero, L_aligned);
7857 7857 subl(tmp, 16);
7858 7858 addl(len, tmp);
7859 7859
7860 7860 align(4);
7861 7861 BIND(L_align_loop);
7862 7862 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7863 7863 update_byte_crc32(crc, rax, table);
7864 7864 increment(buf);
7865 7865 incrementl(tmp);
7866 7866 jccb(Assembler::less, L_align_loop);
7867 7867
7868 7868 BIND(L_aligned);
7869 7869 movl(tmp, len); // save
7870 7870 shrl(len, 4);
7871 7871 jcc(Assembler::zero, L_tail_restore);
7872 7872
7873 7873 // Fold crc into first bytes of vector
7874 7874 movdqa(xmm1, Address(buf, 0));
7875 7875 movdl(rax, xmm1);
7876 7876 xorl(crc, rax);
7877 7877 pinsrd(xmm1, crc, 0);
7878 7878 addptr(buf, 16);
7879 7879 subl(len, 4); // len > 0
7880 7880 jcc(Assembler::less, L_fold_tail);
7881 7881
7882 7882 movdqa(xmm2, Address(buf, 0));
7883 7883 movdqa(xmm3, Address(buf, 16));
7884 7884 movdqa(xmm4, Address(buf, 32));
7885 7885 addptr(buf, 48);
7886 7886 subl(len, 3);
7887 7887 jcc(Assembler::lessEqual, L_fold_512b);
7888 7888
7889 7889 // Fold total 512 bits of polynomial on each iteration,
7890 7890 // 128 bits per each of 4 parallel streams.
7891 7891 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32));
7892 7892
7893 7893 align(32);
7894 7894 BIND(L_fold_512b_loop);
7895 7895 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7896 7896 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16);
7897 7897 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32);
7898 7898 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48);
7899 7899 addptr(buf, 64);
7900 7900 subl(len, 4);
7901 7901 jcc(Assembler::greater, L_fold_512b_loop);
7902 7902
7903 7903 // Fold 512 bits to 128 bits.
7904 7904 BIND(L_fold_512b);
7905 7905 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7906 7906 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2);
7907 7907 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3);
7908 7908 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4);
7909 7909
7910 7910 // Fold the rest of 128 bits data chunks
7911 7911 BIND(L_fold_tail);
7912 7912 addl(len, 3);
7913 7913 jccb(Assembler::lessEqual, L_fold_128b);
7914 7914 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16));
7915 7915
7916 7916 BIND(L_fold_tail_loop);
7917 7917 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0);
7918 7918 addptr(buf, 16);
7919 7919 decrementl(len);
7920 7920 jccb(Assembler::greater, L_fold_tail_loop);
7921 7921
7922 7922 // Fold 128 bits in xmm1 down into 32 bits in crc register.
7923 7923 BIND(L_fold_128b);
7924 7924 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
7925 7925 if (UseAVX > 0) {
7926 7926 vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
7927 7927 vpand(xmm3, xmm0, xmm2, false /* vector256 */);
7928 7928 vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
7929 7929 } else {
7930 7930 movdqa(xmm2, xmm0);
7931 7931 pclmulqdq(xmm2, xmm1, 0x1);
7932 7932 movdqa(xmm3, xmm0);
7933 7933 pand(xmm3, xmm2);
7934 7934 pclmulqdq(xmm0, xmm3, 0x1);
7935 7935 }
7936 7936 psrldq(xmm1, 8);
7937 7937 psrldq(xmm2, 4);
7938 7938 pxor(xmm0, xmm1);
7939 7939 pxor(xmm0, xmm2);
7940 7940
7941 7941 // 8 8-bit folds to compute 32-bit CRC.
7942 7942 for (int j = 0; j < 4; j++) {
7943 7943 fold_8bit_crc32(xmm0, table, xmm1, rax);
7944 7944 }
7945 7945 movdl(crc, xmm0); // mov 32 bits to general register
7946 7946 for (int j = 0; j < 4; j++) {
7947 7947 fold_8bit_crc32(crc, table, rax);
7948 7948 }
7949 7949
7950 7950 BIND(L_tail_restore);
7951 7951 movl(len, tmp); // restore
7952 7952 BIND(L_tail);
7953 7953 andl(len, 0xf);
7954 7954 jccb(Assembler::zero, L_exit);
7955 7955
7956 7956 // Fold the rest of bytes
7957 7957 align(4);
7958 7958 BIND(L_tail_loop);
7959 7959 movsbl(rax, Address(buf, 0)); // load byte with sign extension
7960 7960 update_byte_crc32(crc, rax, table);
7961 7961 increment(buf);
7962 7962 decrementl(len);
7963 7963 jccb(Assembler::greater, L_tail_loop);
7964 7964
7965 7965 BIND(L_exit);
7966 7966 notl(crc); // ~crc (undo the inversion done on entry)
7967 7967 }
7968 7968
7969 7969 #undef BIND
7970 7970 #undef BLOCK_COMMENT
7971 7971
7972 7972
7973 7973 Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
7974 7974 switch (cond) {
7975 7975 // Map each condition to the one testing the opposite outcome, grouped by flag family (some are synonyms)
7976 7976 case Assembler::overflow: return Assembler::noOverflow;
7977 7977 case Assembler::noOverflow: return Assembler::overflow;
7978 7978 case Assembler::zero: return Assembler::notZero;
7979 7979 case Assembler::notZero: return Assembler::zero;
7980 7980 case Assembler::negative: return Assembler::positive;
7981 7981 case Assembler::positive: return Assembler::negative;
7982 7982 case Assembler::parity: return Assembler::noParity;
7983 7983 case Assembler::noParity: return Assembler::parity;
7984 7984 case Assembler::less: return Assembler::greaterEqual;
7985 7985 case Assembler::greaterEqual: return Assembler::less;
7986 7986 case Assembler::lessEqual: return Assembler::greater;
7987 7987 case Assembler::greater: return Assembler::lessEqual;
7988 7988 case Assembler::below: return Assembler::aboveEqual;
7989 7989 case Assembler::aboveEqual: return Assembler::below;
7990 7990 case Assembler::belowEqual: return Assembler::above;
7991 7991 case Assembler::above: return Assembler::belowEqual;
7992 7992 }
7993 7993 ShouldNotReachHere(); return Assembler::overflow;
7994 7994 }
7995 7995
          // Emits a compare of the byte flag at flag_addr against value and a
          // conditional jump past the guarded region when they are equal; the
          // destructor binds the jump target at the end of the region.
7996 7996 SkipIfEqual::SkipIfEqual(
7997 7997 MacroAssembler* masm, const bool* flag_addr, bool value) {
7998 7998 _masm = masm;
7999 7999 _masm->cmp8(ExternalAddress((address)flag_addr), value);
8000 8000 _masm->jcc(Assembler::equal, _label);
8001 8001 }
8002 8002
          // Binds the skip label emitted by the constructor, closing the
          // conditionally-skipped code region.
8003 8003 SkipIfEqual::~SkipIfEqual() {
8004 8004 _masm->bind(_label);
8005 8005 }
↓ open down ↓ |
6223 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX