hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp

rev 611 : Merge

@@ -1,10 +1,7 @@
-#ifdef USE_PRAGMA_IDENT_SRC
-#pragma ident "@(#)stubGenerator_x86_64.cpp     1.49 07/10/05 19:12:48 JVM"
-#endif
 /*
- * Copyright 2003-2007 Sun Microsystems, Inc.  All Rights Reserved.
+ * Copyright 2003-2008 Sun Microsystems, Inc.  All Rights Reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 only, as
  * published by the Free Software Foundation.

@@ -31,10 +28,12 @@
 // Declaration and definition of StubGenerator (no .hpp file).
 // For a more detailed description of the stub routine structure
 // see the comment in stubRoutines.hpp
 
 #define __ _masm->
+#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
+#define a__ ((Assembler*)_masm)->
 
 #ifdef PRODUCT
 #define BLOCK_COMMENT(str) /* nothing */
 #else
 #define BLOCK_COMMENT(str) __ block_comment(str)

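The new TIMES_OOP macro carries the theme of this change: with UseCompressedOops a heap reference is stored as a 32-bit offset from the heap base (kept in r12, see reinit_heapbase() below), so oop-array elements are scaled by times_4 rather than times_8. A minimal C++ sketch of the size distinction (illustrative only, not VM code):

    // Illustrative sketch: why oop-array indexing scales by 4 under
    // compressed oops and by 8 otherwise. UseCompressedOops stands in
    // for the real VM flag (assumption: fixed at startup).
    #include <cstddef>
    #include <cstdint>

    static bool UseCompressedOops = true;

    inline size_t heap_oop_size() {
      return UseCompressedOops ? sizeof(uint32_t)   // narrow oop: times_4
                               : sizeof(uintptr_t); // full pointer: times_8
    }
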
@@ -210,59 +209,60 @@
     const Address r12_save(rbp, r12_off * wordSize);
     const Address rbx_save(rbp, rbx_off * wordSize);
 
     // stub code
     __ enter();
-    __ subq(rsp, -rsp_after_call_off * wordSize);
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
 
     // save register parameters
 #ifndef _WIN64
-    __ movq(parameters,   c_rarg5); // parameters
-    __ movq(entry_point,  c_rarg4); // entry_point
+    __ movptr(parameters,   c_rarg5); // parameters
+    __ movptr(entry_point,  c_rarg4); // entry_point
 #endif
 
-    __ movq(method,       c_rarg3); // method
+    __ movptr(method,       c_rarg3); // method
     __ movl(result_type,  c_rarg2); // result type
-    __ movq(result,       c_rarg1); // result
-    __ movq(call_wrapper, c_rarg0); // call wrapper
+    __ movptr(result,       c_rarg1); // result
+    __ movptr(call_wrapper, c_rarg0); // call wrapper
 
     // save regs belonging to calling function
-    __ movq(rbx_save, rbx);
-    __ movq(r12_save, r12);
-    __ movq(r13_save, r13);
-    __ movq(r14_save, r14);
-    __ movq(r15_save, r15);
+    __ movptr(rbx_save, rbx);
+    __ movptr(r12_save, r12);
+    __ movptr(r13_save, r13);
+    __ movptr(r14_save, r14);
+    __ movptr(r15_save, r15);
 
 #ifdef _WIN64
     const Address rdi_save(rbp, rdi_off * wordSize);
     const Address rsi_save(rbp, rsi_off * wordSize);
 
-    __ movq(rsi_save, rsi);
-    __ movq(rdi_save, rdi);
+    __ movptr(rsi_save, rsi);
+    __ movptr(rdi_save, rdi);
 #else
     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
     {
       Label skip_ldmx;
       __ stmxcsr(mxcsr_save);
       __ movl(rax, mxcsr_save);
       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
-      ExternalAddress mxcsr_std(StubRoutines::amd64::mxcsr_std());
+      ExternalAddress mxcsr_std(StubRoutines::x86::mxcsr_std());
       __ cmp32(rax, mxcsr_std);
       __ jcc(Assembler::equal, skip_ldmx);
       __ ldmxcsr(mxcsr_std);
       __ bind(skip_ldmx);
     }
 #endif
 
     // Load up thread register
-    __ movq(r15_thread, thread);
+    __ movptr(r15_thread, thread);
+    __ reinit_heapbase();
 
 #ifdef ASSERT
     // make sure we have no pending exceptions
     { 
       Label L;
-      __ cmpq(Address(r15_thread, Thread::pending_exception_offset()), (int)NULL_WORD);
+      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
       __ jcc(Assembler::equal, L);
       __ stop("StubRoutines::call_stub: entered with pending exception");
       __ bind(L);
     }
 #endif
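Most of the churn in this and the following hunks is mechanical: movq/cmpq/addq and friends become movptr/cmpptr/addptr, pointer-width-neutral MacroAssembler wrappers that emit q-suffixed instructions on LP64 and l-suffixed ones on 32-bit, so the stub source no longer hard-codes operand width. The (int32_t)NULL_WORD cast likewise makes explicit that cmp against memory takes a sign-extended 32-bit immediate. A hedged sketch of the dispatch idea (the real MacroAssembler methods are member functions and considerably richer):

    // Sketch only, not the actual MacroAssembler API: one pointer-sized
    // move selected at build time.
    #ifdef _LP64
    #define MOVPTR(dst, src) movq(dst, src)   // 64-bit build: 8-byte move
    #else
    #define MOVPTR(dst, src) movl(dst, src)   // 32-bit build: 4-byte move
    #endif
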

@@ -273,38 +273,38 @@
     __ movl(c_rarg3, parameter_size);
     __ testl(c_rarg3, c_rarg3);
     __ jcc(Assembler::zero, parameters_done);
 
     Label loop;
-    __ movq(c_rarg2, parameters);     // parameter pointer
+    __ movptr(c_rarg2, parameters);       // parameter pointer
     __ movl(c_rarg1, c_rarg3);        // parameter counter is in c_rarg1
     __ BIND(loop);
     if (TaggedStackInterpreter) {
-      __ movq(rax, Address(c_rarg2, 0)); // get tag
-      __ addq(c_rarg2, wordSize);     // advance to next tag
-      __ pushq(rax);                  // pass tag
+      __ movl(rax, Address(c_rarg2, 0)); // get tag
+      __ addptr(c_rarg2, wordSize);      // advance to next tag
+      __ push(rax);                      // pass tag
     }
-    __ movq(rax, Address(c_rarg2, 0));  // get parameter
-    __ addq(c_rarg2, wordSize);       // advance to next parameter
+    __ movptr(rax, Address(c_rarg2, 0));// get parameter
+    __ addptr(c_rarg2, wordSize);       // advance to next parameter
     __ decrementl(c_rarg1);           // decrement counter
-    __ pushq(rax);                    // pass parameter
+    __ push(rax);                       // pass parameter
     __ jcc(Assembler::notZero, loop);
 
     // call Java function
     __ BIND(parameters_done);
-    __ movq(rbx, method);             // get methodOop
-    __ movq(c_rarg1, entry_point);    // get entry_point
-    __ movq(r13, rsp);                // set sender sp
+    __ movptr(rbx, method);             // get methodOop
+    __ movptr(c_rarg1, entry_point);    // get entry_point
+    __ mov(r13, rsp);                   // set sender sp
     BLOCK_COMMENT("call Java function");
     __ call(c_rarg1);
 
     BLOCK_COMMENT("call_stub_return_address:");
     return_address = __ pc();
 
     // store result depending on type (everything that is not
     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
-    __ movq(c_rarg0, result);
+    __ movptr(c_rarg0, result);
     Label is_long, is_float, is_double, exit;
     __ movl(c_rarg1, result_type);
     __ cmpl(c_rarg1, T_OBJECT);
     __ jcc(Assembler::equal, is_long);
     __ cmpl(c_rarg1, T_LONG);

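The loop above walks the parameter array and pushes each slot onto the native stack before the Java call; with TaggedStackInterpreter every value is preceded by a tag word, pushed first. An assumed-equivalent C++ sketch (push() is a hypothetical stand-in for the emitted stack pushes):

    // Sketch of the parameter-copy loop; not the stub itself.
    #include <cstdint>

    void push_parameters(const uintptr_t* p, int count,
                         bool tagged, void (*push)(uintptr_t)) {
      while (count-- > 0) {
        if (tagged) push(*p++);   // tag word first
        push(*p++);               // then the parameter value
      }
    }
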
@@ -318,47 +318,47 @@
     __ movl(Address(c_rarg0, 0), rax);
 
     __ BIND(exit);
 
     // pop parameters
-    __ leaq(rsp, rsp_after_call);
+    __ lea(rsp, rsp_after_call);
 
 #ifdef ASSERT
     // verify that threads correspond
     { 
       Label L, S;
-      __ cmpq(r15_thread, thread);
+      __ cmpptr(r15_thread, thread);
       __ jcc(Assembler::notEqual, S);
       __ get_thread(rbx);
-      __ cmpq(r15_thread, rbx);
+      __ cmpptr(r15_thread, rbx);
       __ jcc(Assembler::equal, L);
       __ bind(S);
       __ jcc(Assembler::equal, L);
       __ stop("StubRoutines::call_stub: threads must correspond");
       __ bind(L);
     }
 #endif
 
     // restore regs belonging to calling function
-    __ movq(r15, r15_save);
-    __ movq(r14, r14_save);
-    __ movq(r13, r13_save);
-    __ movq(r12, r12_save);
-    __ movq(rbx, rbx_save);
+    __ movptr(r15, r15_save);
+    __ movptr(r14, r14_save);
+    __ movptr(r13, r13_save);
+    __ movptr(r12, r12_save);
+    __ movptr(rbx, rbx_save);
 
 #ifdef _WIN64
-    __ movq(rdi, rdi_save);
-    __ movq(rsi, rsi_save);
+    __ movptr(rdi, rdi_save);
+    __ movptr(rsi, rsi_save);
 #else
     __ ldmxcsr(mxcsr_save);
 #endif
 
     // restore rsp
-    __ addq(rsp, -rsp_after_call_off * wordSize);
+    __ addptr(rsp, -rsp_after_call_off * wordSize);
 
     // return
-    __ popq(rbp);
+    __ pop(rbp);
     __ ret(0);
 
     // handle return types different from T_INT
     __ BIND(is_long);
     __ movq(Address(c_rarg0, 0), rax);

@@ -397,27 +397,27 @@
 
 #ifdef ASSERT
     // verify that threads correspond
     { 
       Label L, S;
-      __ cmpq(r15_thread, thread);
+      __ cmpptr(r15_thread, thread);
       __ jcc(Assembler::notEqual, S);
       __ get_thread(rbx);
-      __ cmpq(r15_thread, rbx);
+      __ cmpptr(r15_thread, rbx);
       __ jcc(Assembler::equal, L);
       __ bind(S);
       __ stop("StubRoutines::catch_exception: threads must correspond");
       __ bind(L);
     }
 #endif
 
     // set pending exception
     __ verify_oop(rax);
 
-    __ movq(Address(r15_thread, Thread::pending_exception_offset()), rax);
+    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
     __ lea(rscratch1, ExternalAddress((address)__FILE__));
-    __ movq(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
+    __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 
     // complete return to VM
     assert(StubRoutines::_call_stub_return_address != NULL,
            "_call_stub_return_address must have been generated before");

@@ -452,35 +452,35 @@
 
 #ifdef ASSERT
     // make sure this code is only executed if there is a pending exception
     { 
       Label L;
-      __ cmpq(Address(r15_thread, Thread::pending_exception_offset()), (int) NULL);
+      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
       __ jcc(Assembler::notEqual, L);
       __ stop("StubRoutines::forward exception: no pending exception (1)");
       __ bind(L);
     }
 #endif
 
     // compute exception handler into rbx
-    __ movq(c_rarg0, Address(rsp, 0)); 
+    __ movptr(c_rarg0, Address(rsp, 0));
     BLOCK_COMMENT("call exception_handler_for_return_address");
     __ call_VM_leaf(CAST_FROM_FN_PTR(address, 
                          SharedRuntime::exception_handler_for_return_address),
                     c_rarg0);
-    __ movq(rbx, rax);
+    __ mov(rbx, rax);
 
     // setup rax & rdx, remove return address & clear pending exception
-    __ popq(rdx);
-    __ movq(rax, Address(r15_thread, Thread::pending_exception_offset()));
+    __ pop(rdx);
+    __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int)NULL_WORD);
 
 #ifdef ASSERT
     // make sure exception is set
     { 
       Label L;
-      __ testq(rax, rax);
+      __ testptr(rax, rax);
       __ jcc(Assembler::notEqual, L);
       __ stop("StubRoutines::forward exception: no pending exception (2)");
       __ bind(L);
     }
 #endif

@@ -524,12 +524,12 @@
   //    *dest <- ex, return (orig *dest)
   address generate_atomic_xchg_ptr() {
     StubCodeMark mark(this, "StubRoutines", "atomic_xchg_ptr");
     address start = __ pc();
 
-    __ movq(rax, c_rarg0); // Copy to eax; we need a return value anyhow
-    __ xchgq(rax, Address(c_rarg1, 0)); // automatic LOCK
+    __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
+    __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
     __ ret(0);
 
     return start;
   }
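As the "automatic LOCK" comment notes, xchg with a memory operand is implicitly locked on x86, so no explicit LOCK prefix is needed. A plausible C++ model of the stub's contract (*dest <- ex, return the previous *dest), using the GCC/Clang builtin:

    // Assumed-equivalent model of atomic_xchg_ptr; the stub does this
    // in two instructions (mov + xchg).
    #include <cstdint>

    intptr_t atomic_xchg_ptr(intptr_t ex, volatile intptr_t* dest) {
      return __atomic_exchange_n(dest, ex, __ATOMIC_SEQ_CST);
    }
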
 

@@ -618,14 +618,14 @@
   //    return *dest;
   address generate_atomic_add_ptr() {
     StubCodeMark mark(this, "StubRoutines", "atomic_add_ptr");
     address start = __ pc();
 
-    __ movq(rax, c_rarg0); // Copy to eax; we need a return value anyhow
+    __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
    if ( os::is_MP() ) __ lock();
-    __ xaddl(Address(c_rarg1, 0), c_rarg0); 
-    __ addl(rax, c_rarg0);
+    __ xaddptr(Address(c_rarg1, 0), c_rarg0);
+    __ addptr(rax, c_rarg0);
     __ ret(0);
 
     return start;
   }
 

@@ -654,13 +654,13 @@
     const Address old_fp(rbp, 0);
     const Address older_fp(rax, 0);
     address start = __ pc();
 
     __ enter();    
-    __ movq(rax, old_fp); // callers fp
-    __ movq(rax, older_fp); // the frame for ps()
-    __ popq(rbp);
+    __ movptr(rax, old_fp); // callers fp
+    __ movptr(rax, older_fp); // the frame for ps()
+    __ pop(rbp);
     __ ret(0);
 
     return start;
   }
   

@@ -677,25 +677,25 @@
 
     const Address mxcsr_save(rsp, 0);
 
     if (CheckJNICalls) {
       Label ok_ret;
-      __ pushq(rax);
-      __ subq(rsp, wordSize);      // allocate a temp location
+      __ push(rax);
+      __ subptr(rsp, wordSize);      // allocate a temp location
       __ stmxcsr(mxcsr_save);
       __ movl(rax, mxcsr_save);
       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
-      __ cmpl(rax, *(int *)(StubRoutines::amd64::mxcsr_std()));
+      __ cmpl(rax, *(int *)(StubRoutines::x86::mxcsr_std()));
       __ jcc(Assembler::equal, ok_ret);
    
       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 
-      __ ldmxcsr(ExternalAddress(StubRoutines::amd64::mxcsr_std()));
+      __ ldmxcsr(ExternalAddress(StubRoutines::x86::mxcsr_std()));
 
       __ bind(ok_ret);
-      __ addq(rsp, wordSize);
-      __ popq(rax);
+      __ addptr(rsp, wordSize);
+      __ pop(rax);
     }
 
     __ ret(0);
 
     return start;

@@ -707,14 +707,14 @@
 
     address start = __ pc();
 
     Label L;
 
-    __ pushq(rax);
-    __ pushq(c_rarg3);
-    __ pushq(c_rarg2);
-    __ pushq(c_rarg1);
+    __ push(rax);
+    __ push(c_rarg3);
+    __ push(c_rarg2);
+    __ push(c_rarg1);
 
     __ movl(rax, 0x7f800000);
     __ xorl(c_rarg3, c_rarg3);
     __ movl(c_rarg2, inout);
     __ movl(c_rarg1, c_rarg2);

@@ -725,16 +725,16 @@
     __ movl(c_rarg3, 0x80000000);
     __ movl(rax, 0x7fffffff);
     __ cmovl(Assembler::positive, c_rarg3, rax);
 
     __ bind(L);
-    __ movq(inout, c_rarg3);
+    __ movptr(inout, c_rarg3);
 
-    __ popq(c_rarg1);
-    __ popq(c_rarg2);
-    __ popq(c_rarg3);
-    __ popq(rax);
+    __ pop(c_rarg1);
+    __ pop(c_rarg2);
+    __ pop(c_rarg3);
+    __ pop(rax);
 
     __ ret(0);
 
     return start;
   }
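This fixup and the three that follow (f2l, d2i, d2l) exist because SSE's cvttss2si/cvttsd2si return the "integer indefinite" value (0x80000000 / 0x8000000000000000) for NaN and out-of-range inputs, while Java requires NaN -> 0 and saturation to the type's min/max. A sketch of the f2i semantics being implemented (my reading of the stub, not a transcription):

    // Java f2i semantics (sketch). 2147483648.0f is exactly 2^31.
    #include <cstdint>

    int32_t java_f2i(float f) {
      if (f != f) return 0;                       // NaN -> 0
      if (f >= 2147483648.0f)  return INT32_MAX;  // saturate high
      if (f < -2147483648.0f)  return INT32_MIN;  // saturate low
      return (int32_t)f;                          // in range: truncate
    }
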

@@ -744,14 +744,14 @@
     Address inout(rsp, 5 * wordSize); // return address + 4 saves
     address start = __ pc();
 
     Label L;
 
-    __ pushq(rax);
-    __ pushq(c_rarg3);
-    __ pushq(c_rarg2);
-    __ pushq(c_rarg1);
+    __ push(rax);
+    __ push(c_rarg3);
+    __ push(c_rarg2);
+    __ push(c_rarg1);
 
     __ movl(rax, 0x7f800000);
     __ xorl(c_rarg3, c_rarg3);
     __ movl(c_rarg2, inout);
     __ movl(c_rarg1, c_rarg2);

@@ -759,19 +759,19 @@
     __ cmpl(rax, c_rarg1); // NaN? -> 0
     __ jcc(Assembler::negative, L);
     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
     __ mov64(c_rarg3, 0x8000000000000000);
     __ mov64(rax, 0x7fffffffffffffff);
-    __ cmovq(Assembler::positive, c_rarg3, rax);
+    __ cmov(Assembler::positive, c_rarg3, rax);
 
     __ bind(L);
-    __ movq(inout, c_rarg3);
+    __ movptr(inout, c_rarg3);
 
-    __ popq(c_rarg1);
-    __ popq(c_rarg2);
-    __ popq(c_rarg3);
-    __ popq(rax);
+    __ pop(c_rarg1);
+    __ pop(c_rarg2);
+    __ pop(c_rarg3);
+    __ pop(rax);
 
     __ ret(0);
 
     return start;
   }

@@ -782,43 +782,43 @@
 
     address start = __ pc();
 
     Label L;
 
-    __ pushq(rax);
-    __ pushq(c_rarg3);
-    __ pushq(c_rarg2);
-    __ pushq(c_rarg1);
-    __ pushq(c_rarg0);
+    __ push(rax);
+    __ push(c_rarg3);
+    __ push(c_rarg2);
+    __ push(c_rarg1);
+    __ push(c_rarg0);
 
     __ movl(rax, 0x7ff00000);
     __ movq(c_rarg2, inout);
     __ movl(c_rarg3, c_rarg2);
-    __ movq(c_rarg1, c_rarg2);
-    __ movq(c_rarg0, c_rarg2);
+    __ mov(c_rarg1, c_rarg2);
+    __ mov(c_rarg0, c_rarg2);
     __ negl(c_rarg3);
-    __ shrq(c_rarg1, 0x20);
+    __ shrptr(c_rarg1, 0x20);
     __ orl(c_rarg3, c_rarg2);
     __ andl(c_rarg1, 0x7fffffff);
     __ xorl(c_rarg2, c_rarg2);
     __ shrl(c_rarg3, 0x1f);
     __ orl(c_rarg1, c_rarg3);
     __ cmpl(rax, c_rarg1);
     __ jcc(Assembler::negative, L); // NaN -> 0
-    __ testq(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
+    __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
     __ movl(c_rarg2, 0x80000000);
     __ movl(rax, 0x7fffffff);
-    __ cmovl(Assembler::positive, c_rarg2, rax);
+    __ cmov(Assembler::positive, c_rarg2, rax);
     
     __ bind(L);
-    __ movq(inout, c_rarg2);
+    __ movptr(inout, c_rarg2);
 
-    __ popq(c_rarg0);
-    __ popq(c_rarg1);
-    __ popq(c_rarg2);
-    __ popq(c_rarg3);
-    __ popq(rax);
+    __ pop(c_rarg0);
+    __ pop(c_rarg1);
+    __ pop(c_rarg2);
+    __ pop(c_rarg3);
+    __ pop(rax);
 
     __ ret(0);
 
     return start;
   }

@@ -829,23 +829,23 @@
 
     address start = __ pc();
 
     Label L;
 
-    __ pushq(rax);
-    __ pushq(c_rarg3);
-    __ pushq(c_rarg2);
-    __ pushq(c_rarg1);
-    __ pushq(c_rarg0);
+    __ push(rax);
+    __ push(c_rarg3);
+    __ push(c_rarg2);
+    __ push(c_rarg1);
+    __ push(c_rarg0);
 
     __ movl(rax, 0x7ff00000);
     __ movq(c_rarg2, inout);
     __ movl(c_rarg3, c_rarg2);
-    __ movq(c_rarg1, c_rarg2);
-    __ movq(c_rarg0, c_rarg2);
+    __ mov(c_rarg1, c_rarg2);
+    __ mov(c_rarg0, c_rarg2);
     __ negl(c_rarg3);
-    __ shrq(c_rarg1, 0x20);
+    __ shrptr(c_rarg1, 0x20);
     __ orl(c_rarg3, c_rarg2);
     __ andl(c_rarg1, 0x7fffffff);
     __ xorl(c_rarg2, c_rarg2);
     __ shrl(c_rarg3, 0x1f);
     __ orl(c_rarg1, c_rarg3);

@@ -857,15 +857,15 @@
     __ cmovq(Assembler::positive, c_rarg2, rax);
     
     __ bind(L);
     __ movq(inout, c_rarg2);
 
-    __ popq(c_rarg0);
-    __ popq(c_rarg1);
-    __ popq(c_rarg2);
-    __ popq(c_rarg3);
-    __ popq(rax);
+    __ pop(c_rarg0);
+    __ pop(c_rarg1);
+    __ pop(c_rarg2);
+    __ pop(c_rarg3);
+    __ pop(rax);
 
     __ ret(0);
 
     return start;
   }

@@ -888,21 +888,21 @@
   // SIGBUS/OBJERR.)
   address generate_handler_for_unsafe_access() {
     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
     address start = __ pc();
 
-    __ pushq(0);                      // hole for return address-to-be
-    __ pushaq();                      // push registers
+    __ push(0);                       // hole for return address-to-be
+    __ pusha();                       // push registers
     Address next_pc(rsp, RegisterImpl::number_of_registers * BytesPerWord);
 
-    __ subq(rsp, frame::arg_reg_save_area_bytes);
+    __ subptr(rsp, frame::arg_reg_save_area_bytes);
     BLOCK_COMMENT("call handle_unsafe_access");
     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, handle_unsafe_access)));
-    __ addq(rsp, frame::arg_reg_save_area_bytes);
+    __ addptr(rsp, frame::arg_reg_save_area_bytes);
 
-    __ movq(next_pc, rax);            // stuff next address 
-    __ popaq();
+    __ movptr(next_pc, rax);          // stuff next address
+    __ popa();
     __ ret(0);                        // jump to next address
 
     return start;
   }
 

@@ -912,101 +912,123 @@
   //    all args on stack!
   // 
   // Stack after saving c_rarg3:
   //    [tos + 0]: saved c_rarg3
   //    [tos + 1]: saved c_rarg2
-  //    [tos + 2]: saved flags
-  //    [tos + 3]: return address
-  //  * [tos + 4]: error message (char*)
-  //  * [tos + 5]: object to verify (oop)
-  //  * [tos + 6]: saved rax - saved by caller and bashed
+  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
+  //    [tos + 3]: saved flags
+  //    [tos + 4]: return address
+  //  * [tos + 5]: error message (char*)
+  //  * [tos + 6]: object to verify (oop)
+  //  * [tos + 7]: saved rax - saved by caller and bashed
   //  * = popped on exit
   address generate_verify_oop() {
     StubCodeMark mark(this, "StubRoutines", "verify_oop");
     address start = __ pc();
     
     Label exit, error;
 
-    __ pushfq();
+    __ pushf();
     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 
+    __ push(r12);
+
     // save c_rarg2 and c_rarg3
-    __ pushq(c_rarg2);
-    __ pushq(c_rarg3);
+    __ push(c_rarg2);
+    __ push(c_rarg3);
+
+    enum {
+           // After previous pushes.
+           oop_to_verify = 6 * wordSize,
+           saved_rax     = 7 * wordSize,
+
+           // Before the call to MacroAssembler::debug(), see below.
+           return_addr   = 16 * wordSize,
+           error_msg     = 17 * wordSize
+    };
 
     // get object
-    __ movq(rax, Address(rsp, 5 * wordSize));
+    __ movptr(rax, Address(rsp, oop_to_verify));
 
     // make sure object is 'reasonable'
-    __ testq(rax, rax);
+    __ testptr(rax, rax);
     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
     // Check if the oop is in the right area of memory
-    __ movq(c_rarg2, rax);
+    __ movptr(c_rarg2, rax);
     __ movptr(c_rarg3, (int64_t) Universe::verify_oop_mask());
-    __ andq(c_rarg2, c_rarg3);
+    __ andptr(c_rarg2, c_rarg3);
     __ movptr(c_rarg3, (int64_t) Universe::verify_oop_bits());
-    __ cmpq(c_rarg2, c_rarg3);
+    __ cmpptr(c_rarg2, c_rarg3);
     __ jcc(Assembler::notZero, error);
 
+    // set r12 to heapbase for load_klass()
+    __ reinit_heapbase();
+
     // make sure klass is 'reasonable'
-    __ movq(rax, Address(rax, oopDesc::klass_offset_in_bytes())); // get klass
-    __ testq(rax, rax);
+    __ load_klass(rax, rax);  // get klass
+    __ testptr(rax, rax);
     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
     // Check if the klass is in the right area of memory
-    __ movq(c_rarg2, rax);
+    __ mov(c_rarg2, rax);
     __ movptr(c_rarg3, (int64_t) Universe::verify_klass_mask());
-    __ andq(c_rarg2, c_rarg3);
+    __ andptr(c_rarg2, c_rarg3);
     __ movptr(c_rarg3, (int64_t) Universe::verify_klass_bits());
-    __ cmpq(c_rarg2, c_rarg3);
+    __ cmpptr(c_rarg2, c_rarg3);
     __ jcc(Assembler::notZero, error);
 
     // make sure klass' klass is 'reasonable'
-    __ movq(rax, Address(rax, oopDesc::klass_offset_in_bytes()));
-    __ testq(rax, rax);
+    __ load_klass(rax, rax);
+    __ testptr(rax, rax);
     __ jcc(Assembler::zero, error); // if klass' klass is NULL it is broken
     // Check if the klass' klass is in the right area of memory
     __ movptr(c_rarg3, (int64_t) Universe::verify_klass_mask());
-    __ andq(rax, c_rarg3);
+    __ andptr(rax, c_rarg3);
     __ movptr(c_rarg3, (int64_t) Universe::verify_klass_bits());
-    __ cmpq(rax, c_rarg3);
+    __ cmpptr(rax, c_rarg3);
     __ jcc(Assembler::notZero, error);
 
     // return if everything seems ok
     __ bind(exit);
-    __ movq(rax, Address(rsp, 6 * wordSize));    // get saved rax back
-    __ popq(c_rarg3);                              // restore c_rarg3
-    __ popq(c_rarg2);                              // restore c_rarg2
-    __ popfq();                                  // restore flags
+    __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
+    __ pop(c_rarg3);                             // restore c_rarg3
+    __ pop(c_rarg2);                             // restore c_rarg2
+    __ pop(r12);                                 // restore r12
+    __ popf();                                   // restore flags
     __ ret(3 * wordSize);                        // pop caller saved stuff
 
     // handle errors
     __ bind(error);
-    __ movq(rax, Address(rsp, 6 * wordSize));    // get saved rax back
-    __ popq(c_rarg3);                              // get saved c_rarg3 back
-    __ popq(c_rarg2);                              // get saved c_rarg2 back
-    __ popfq();                                  // get saved flags off stack --
+    __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
+    __ pop(c_rarg3);                             // get saved c_rarg3 back
+    __ pop(c_rarg2);                             // get saved c_rarg2 back
+    __ pop(r12);                                 // get saved r12 back
+    __ popf();                                   // get saved flags off stack --
                                                  // will be ignored
 
-    __ pushaq();                                 // push registers
+    __ pusha();                                  // push registers
                                                 // (rip is
                                                 // already pushed)
-    // debug(char* msg, int64_t regs[])
+    // debug(char* msg, int64_t pc, int64_t regs[])
     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
     // pushed all the registers, so now the stack looks like:
     //     [tos +  0] 16 saved registers
     //     [tos + 16] return address
-    //     [tos + 17] error message (char*)
+    //   * [tos + 17] error message (char*)
+    //   * [tos + 18] object to verify (oop)
+    //   * [tos + 19] saved rax - saved by caller and bashed
+    //   * = popped on exit
 
-    __ movq(c_rarg0, Address(rsp, 17 * wordSize)); // pass address of error message
-    __ movq(c_rarg1, rsp);                         // pass address of regs on stack
-    __ movq(r12, rsp);                           // remember rsp
-    __ subq(rsp, frame::arg_reg_save_area_bytes);// windows
-    __ andq(rsp, -16);                           // align stack as required by ABI
+    __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
+    __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
+    __ movq(c_rarg2, rsp);                          // pass address of regs on stack
+    __ mov(r12, rsp);                               // remember rsp
+    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
+    __ andptr(rsp, -16);                            // align stack as required by ABI
     BLOCK_COMMENT("call MacroAssembler::debug");
-    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug)));
-    __ movq(rsp, r12);                           // restore rsp
-    __ popaq();                                  // pop registers
+    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
+    __ mov(rsp, r12);                               // restore rsp
+    __ popa();                                      // pop registers (includes r12)
     __ ret(3 * wordSize);                        // pop caller saved stuff
 
     return start;
   }
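The "reasonable" checks in generate_verify_oop reduce to masked range tests: the oop, its klass, and the klass' klass must each match a Universe-supplied mask/bits pattern, with NULL allowed for the object itself. Under that assumption the test is simply:

    // Sketch of the masked plausibility test used above.
    #include <cstdint>

    bool looks_reasonable(uintptr_t p, uintptr_t mask, uintptr_t bits,
                          bool null_ok) {
      if (p == 0) return null_ok;     // NULL oop is OK; NULL klass is not
      return (p & mask) == bits;      // must land in the expected area
    }
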
 

@@ -1036,11 +1058,11 @@
 #ifdef ASSERT
     Label L;
     assert_different_registers(Rtmp, Rint);
     __ movslq(Rtmp, Rint);
     __ cmpq(Rtmp, Rint);
-    __ jccb(Assembler::equal, L);
+    __ jcc(Assembler::equal, L);
     __ stop("high 32-bits of int value are not 0");
     __ bind(L);
 #endif
   }
 

@@ -1065,20 +1087,20 @@
     const Register from     = c_rarg0;
     const Register to       = c_rarg1;
     const Register count    = c_rarg2;
     const Register end_from = rax;
 
-    __ cmpq(to, from);
-    __ leaq(end_from, Address(from, count, sf, 0));
+    __ cmpptr(to, from);
+    __ lea(end_from, Address(from, count, sf, 0));
     if (NOLp == NULL) {
       ExternalAddress no_overlap(no_overlap_target);
       __ jump_cc(Assembler::belowEqual, no_overlap);
-      __ cmpq(to, end_from);
+      __ cmpptr(to, end_from);
       __ jump_cc(Assembler::aboveEqual, no_overlap);
     } else {
       __ jcc(Assembler::belowEqual, (*NOLp));
-      __ cmpq(to, end_from);
+      __ cmpptr(to, end_from);
       __ jcc(Assembler::aboveEqual, (*NOLp));
     }
   }
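array_overlap_test branches to the disjoint (forward) entry when to <= from or to >= from + count*elem; the conjoint backward copy is only taken when the destination begins strictly inside the source range. Equivalent predicate, as I read the two compares:

    // When is the backward (conjoint) copy required? (sketch)
    #include <cstddef>
    #include <cstdint>

    bool need_backward_copy(uintptr_t from, uintptr_t to,
                            size_t count, size_t elem_size) {
      uintptr_t end_from = from + count * elem_size;
      return to > from && to < end_from;   // dest starts inside source
    }
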
 
   // Shuffle first three arg regs on Windows into Linux/Solaris locations.

@@ -1098,30 +1120,30 @@
     assert(nargs == 3 || nargs == 4, "else fix");
 #ifdef _WIN64
     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
            "unexpected argument registers"); 
     if (nargs >= 4)
-      __ movq(rax, r9);  // r9 is also saved_rdi
-    __ movq(saved_rdi, rdi);
-    __ movq(saved_rsi, rsi);
-    __ movq(rdi, rcx); // c_rarg0
-    __ movq(rsi, rdx); // c_rarg1
-    __ movq(rdx, r8);  // c_rarg2
+      __ mov(rax, r9);  // r9 is also saved_rdi
+    __ movptr(saved_rdi, rdi);
+    __ movptr(saved_rsi, rsi);
+    __ mov(rdi, rcx); // c_rarg0
+    __ mov(rsi, rdx); // c_rarg1
+    __ mov(rdx, r8);  // c_rarg2
     if (nargs >= 4)
-      __ movq(rcx, rax); // c_rarg3 (via rax)
+      __ mov(rcx, rax); // c_rarg3 (via rax)
 #else
     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
            "unexpected argument registers"); 
 #endif
   }
 
   void restore_arg_regs() {
     const Register saved_rdi = r9;
     const Register saved_rsi = r10;
 #ifdef _WIN64
-    __ movq(rdi, saved_rdi);
-    __ movq(rsi, saved_rsi);
+    __ movptr(rdi, saved_rdi);
+    __ movptr(rsi, saved_rsi);
 #endif
   }
 
   // Generate code for an array write pre barrier
   //

@@ -1129,34 +1151,41 @@
   //     count    -  element count
   //
   //     Destroy no registers!
   //
   void  gen_write_ref_array_pre_barrier(Register addr, Register count) {
-#if 0 // G1 - only
-    assert_different_registers(addr, c_rarg1);
-    assert_different_registers(count, c_rarg0);
     BarrierSet* bs = Universe::heap()->barrier_set();
     switch (bs->kind()) {
       case BarrierSet::G1SATBCT:
       case BarrierSet::G1SATBCTLogging:
         {
-          __ pushaq();                      // push registers
-          __ movq(c_rarg0, addr);
-          __ movq(c_rarg1, count);
-          __ call(RuntimeAddress(BarrierSet::static_write_ref_array_pre));
-          __ popaq();
+          __ pusha();                      // push registers
+          if (count == c_rarg0) {
+            if (addr == c_rarg1) {
+              // exactly backwards!!
+              __ xchgptr(c_rarg1, c_rarg0);
+            } else {
+              __ movptr(c_rarg1, count);
+              __ movptr(c_rarg0, addr);
+            }
+
+          } else {
+            __ movptr(c_rarg0, addr);
+            __ movptr(c_rarg1, count);
+          }
+          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre)));
+          __ popa();
         }
         break;
       case BarrierSet::CardTableModRef:
       case BarrierSet::CardTableExtension:
       case BarrierSet::ModRef: 
         break;
-      default      : 
+      default:
         ShouldNotReachHere();
         
     }
-#endif // 0 G1 - only
   }
 
   //
   // Generate code for an array write post barrier
   //

@@ -1169,51 +1198,62 @@
   //  The ending address is inclusive.
   void  gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
     assert_different_registers(start, end, scratch);
     BarrierSet* bs = Universe::heap()->barrier_set();
     switch (bs->kind()) {
-#if 0 // G1 - only
       case BarrierSet::G1SATBCT:
       case BarrierSet::G1SATBCTLogging:
 
         {
-          __ pushaq();                      // push registers (overkill)
+          __ pusha();                      // push registers (overkill)
           // must compute element count unless barrier set interface is changed (other platforms supply count)
           assert_different_registers(start, end, scratch);
-          __ leaq(scratch, Address(end, wordSize));
-          __ subq(scratch, start);
-          __ shrq(scratch, LogBytesPerWord);
-          __ movq(c_rarg0, start);
-          __ movq(c_rarg1, scratch);
-          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
-          __ popaq();
+          __ lea(scratch, Address(end, wordSize));
+          __ subptr(scratch, start);
+          __ shrptr(scratch, LogBytesPerWord);
+          __ mov(c_rarg0, start);
+          __ mov(c_rarg1, scratch);
+          __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post)));
+          __ popa();
         }
         break;
-#endif // 0 G1 - only
       case BarrierSet::CardTableModRef:
       case BarrierSet::CardTableExtension:
         {
           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 
           Label L_loop;
 
-           __ shrq(start, CardTableModRefBS::card_shift);
-           __ shrq(end, CardTableModRefBS::card_shift);
-           __ subq(end, start); // number of bytes to copy
+           __ shrptr(start, CardTableModRefBS::card_shift);
+           __ shrptr(end, CardTableModRefBS::card_shift);
+           __ subptr(end, start); // number of bytes to copy
+
+          intptr_t disp = (intptr_t) ct->byte_map_base;
+          if (__ is_simm32(disp)) {
+            Address cardtable(noreg, noreg, Address::no_scale, disp);
+            __ lea(scratch, cardtable);
+          } else {
+            ExternalAddress cardtable((address)disp);
+            __ lea(scratch, cardtable);
+          }
 
           const Register count = end; // 'end' register contains bytes count now 
-          __ lea(scratch, ExternalAddress((address)ct->byte_map_base));
-          __ addq(start, scratch);
+          __ addptr(start, scratch);
         __ BIND(L_loop);
           __ movb(Address(start, count, Address::times_1), 0);
-          __ decrementq(count);
+          __ decrement(count);
           __ jcc(Assembler::greaterEqual, L_loop);
         }
+        break;
+      default:
+        ShouldNotReachHere();
+
       }
    }
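The CardTableModRef arm dirties every card spanned by [start, end] (end inclusive); in HotSpot a dirty card is the byte value 0, which is why the loop stores 0. A simplified model, assuming 2^card_shift-byte cards and the biased byte_map_base the code loads into scratch:

    // Simplified model of the card-marking post barrier.
    #include <cstdint>
    #include <cstring>

    void write_ref_array_post(uint8_t* byte_map_base, int card_shift,
                              uintptr_t start, uintptr_t end_inclusive) {
      uintptr_t first = start         >> card_shift;
      uintptr_t last  = end_inclusive >> card_shift;
      memset(byte_map_base + first, 0 /* dirty */, last - first + 1);
    }
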
 
+
   // Copy big chunks forward
   //
   // Inputs:
   //   end_from     - source arrays end address
   //   end_to       - destination array end address

@@ -1227,22 +1267,30 @@
                              Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
     __ align(16);
   __ BIND(L_loop);
+    if (UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+      __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+      __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
+      __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
+
+    } else {
     __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
     __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
     __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
     __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
     __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
     __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
     __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
     __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+    }
   __ BIND(L_copy_32_bytes);
-    __ addq(qword_count, 4);
+    __ addptr(qword_count, 4);
     __ jcc(Assembler::lessEqual, L_loop);
-    __ subq(qword_count, 4);
+    __ subptr(qword_count, 4);
     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
   }
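With UseUnalignedLoadStores, each 32-byte chunk moves as two unaligned 16-byte SSE2 transfers instead of four 8-byte integer moves. Intrinsics sketch of one chunk (illustrative; the stub indexes off qword_count):

    // Two movdqu load/store pairs per 32-byte chunk.
    #include <emmintrin.h>

    void copy_32_bytes(const char* from, char* to) {
      __m128i lo = _mm_loadu_si128((const __m128i*)(from +  0));
      __m128i hi = _mm_loadu_si128((const __m128i*)(from + 16));
      _mm_storeu_si128((__m128i*)(to +  0), lo);
      _mm_storeu_si128((__m128i*)(to + 16), hi);
    }
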
 
 
   // Copy big chunks backward

@@ -1260,22 +1308,30 @@
                               Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
     DEBUG_ONLY(__ stop("enter at entry label, not here"));
     Label L_loop;
     __ align(16);
   __ BIND(L_loop);
+    if (UseUnalignedLoadStores) {
+      __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
+      __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
+      __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
+      __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
+
+    } else {
     __ movq(to, Address(from, qword_count, Address::times_8, 24));
     __ movq(Address(dest, qword_count, Address::times_8, 24), to);
     __ movq(to, Address(from, qword_count, Address::times_8, 16));
     __ movq(Address(dest, qword_count, Address::times_8, 16), to);
     __ movq(to, Address(from, qword_count, Address::times_8,  8));
     __ movq(Address(dest, qword_count, Address::times_8,  8), to);
     __ movq(to, Address(from, qword_count, Address::times_8,  0));
     __ movq(Address(dest, qword_count, Address::times_8,  0), to);
+    }
   __ BIND(L_copy_32_bytes);
-    __ subq(qword_count, 4);
+    __ subptr(qword_count, 4);
     __ jcc(Assembler::greaterEqual, L_loop);
-    __ addq(qword_count, 4);
+    __ addptr(qword_count, 4);
     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
   }
 
 
   // Arguments:

@@ -1323,57 +1379,57 @@
 
     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                       // r9 and r10 may be used to save non-volatile registers
 
     // 'from', 'to' and 'count' are now valid
-    __ movq(byte_count, count);
-    __ shrq(count, 3); // count => qword_count
+    __ movptr(byte_count, count);
+    __ shrptr(count, 3); // count => qword_count
 
     // Copy from low to high addresses.  Use 'to' as scratch.
-    __ leaq(end_from, Address(from, qword_count, Address::times_8, -8));
-    __ leaq(end_to,   Address(to,   qword_count, Address::times_8, -8));
-    __ negq(qword_count); // make the count negative
+    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
+    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
+    __ negptr(qword_count); // make the count negative
     __ jmp(L_copy_32_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
-    __ incrementq(qword_count);
+    __ increment(qword_count);
     __ jcc(Assembler::notZero, L_copy_8_bytes);
 
     // Check for and copy trailing dword
   __ BIND(L_copy_4_bytes);
-    __ testq(byte_count, 4);
+    __ testl(byte_count, 4);
     __ jccb(Assembler::zero, L_copy_2_bytes);
     __ movl(rax, Address(end_from, 8));
     __ movl(Address(end_to, 8), rax);
 
-    __ addq(end_from, 4);
-    __ addq(end_to, 4);
+    __ addptr(end_from, 4);
+    __ addptr(end_to, 4);
 
     // Check for and copy trailing word
   __ BIND(L_copy_2_bytes);
-    __ testq(byte_count, 2);
+    __ testl(byte_count, 2);
     __ jccb(Assembler::zero, L_copy_byte);
     __ movw(rax, Address(end_from, 8));
     __ movw(Address(end_to, 8), rax);
 
-    __ addq(end_from, 2);
-    __ addq(end_to, 2);
+    __ addptr(end_from, 2);
+    __ addptr(end_to, 2);
 
     // Check for and copy trailing byte
   __ BIND(L_copy_byte);
-    __ testq(byte_count, 1);
+    __ testl(byte_count, 1);
     __ jccb(Assembler::zero, L_exit);
     __ movb(rax, Address(end_from, 8));
     __ movb(Address(end_to, 8), rax);
 
   __ BIND(L_exit);
     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
    // Copy in 32-byte chunks
     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);

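After the qword loop, the leftover 0-7 bytes are handled by testing single bits of byte_count: bit 2 selects a trailing dword, bit 1 a word, bit 0 a byte; that is also why testq becomes testl, since only the low bits matter. Assumed-equivalent C++:

    // Tail handling after the 8-byte copy loop (sketch).
    #include <cstddef>
    #include <cstring>

    void copy_tail(const char* from, char* to, size_t byte_count) {
      size_t done = byte_count & ~(size_t)7;  // bytes already moved as qwords
      if (byte_count & 4) { memcpy(to + done, from + done, 4); done += 4; }
      if (byte_count & 2) { memcpy(to + done, from + done, 2); done += 2; }
      if (byte_count & 1) { memcpy(to + done, from + done, 1); }
    }
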
@@ -1419,56 +1475,56 @@
     array_overlap_test(disjoint_byte_copy_entry, Address::times_1);
     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                       // r9 and r10 may be used to save non-volatile registers
 
     // 'from', 'to' and 'count' are now valid
-    __ movq(byte_count, count);
-    __ shrq(count, 3);   // count => qword_count
+    __ movptr(byte_count, count);
+    __ shrptr(count, 3);   // count => qword_count
 
     // Copy from high to low addresses.
 
     // Check for and copy trailing byte
-    __ testq(byte_count, 1);
+    __ testl(byte_count, 1);
     __ jcc(Assembler::zero, L_copy_2_bytes);
     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
-    __ decrementq(byte_count); // Adjust for possible trailing word 
+    __ decrement(byte_count); // Adjust for possible trailing word
 
     // Check for and copy trailing word
   __ BIND(L_copy_2_bytes);
-    __ testq(byte_count, 2);
+    __ testl(byte_count, 2);
     __ jcc(Assembler::zero, L_copy_4_bytes);
     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
 
     // Check for and copy trailing dword
   __ BIND(L_copy_4_bytes);
-    __ testq(byte_count, 4);
+    __ testl(byte_count, 4);
     __ jcc(Assembler::zero, L_copy_32_bytes);
     __ movl(rax, Address(from, qword_count, Address::times_8));
     __ movl(Address(to, qword_count, Address::times_8), rax);
     __ jmp(L_copy_32_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
-    __ decrementq(qword_count);
+    __ decrement(qword_count);
     __ jcc(Assembler::notZero, L_copy_8_bytes);
 
     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
    // Copy in 32-byte chunks
     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
 
     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr);
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
     return start;
   }

@@ -1517,50 +1573,50 @@
 
     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                       // r9 and r10 may be used to save non-volatile registers
 
     // 'from', 'to' and 'count' are now valid
-    __ movq(word_count, count);
-    __ shrq(count, 2); // count => qword_count
+    __ movptr(word_count, count);
+    __ shrptr(count, 2); // count => qword_count
 
     // Copy from low to high addresses.  Use 'to' as scratch.
-    __ leaq(end_from, Address(from, qword_count, Address::times_8, -8));
-    __ leaq(end_to,   Address(to,   qword_count, Address::times_8, -8));
-    __ negq(qword_count);
+    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
+    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
+    __ negptr(qword_count);
     __ jmp(L_copy_32_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
-    __ incrementq(qword_count);
+    __ increment(qword_count);
     __ jcc(Assembler::notZero, L_copy_8_bytes);
 
     // Original 'dest' is trashed, so we can't use it as a
     // base register for a possible trailing word copy
 
     // Check for and copy trailing dword
   __ BIND(L_copy_4_bytes);
-    __ testq(word_count, 2);
+    __ testl(word_count, 2);
     __ jccb(Assembler::zero, L_copy_2_bytes);
     __ movl(rax, Address(end_from, 8));
     __ movl(Address(end_to, 8), rax);
 
-    __ addq(end_from, 4);
-    __ addq(end_to, 4);
+    __ addptr(end_from, 4);
+    __ addptr(end_to, 4);
 
     // Check for and copy trailing word
   __ BIND(L_copy_2_bytes);
-    __ testq(word_count, 1);
+    __ testl(word_count, 1);
     __ jccb(Assembler::zero, L_exit);
     __ movw(rax, Address(end_from, 8));
     __ movw(Address(end_to, 8), rax);
 
   __ BIND(L_exit);
     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr);
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
    // Copy in 32-byte chunks
     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);

@@ -1606,57 +1662,58 @@
     array_overlap_test(disjoint_short_copy_entry, Address::times_2);
     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                       // r9 and r10 may be used to save non-volatile registers
 
     // 'from', 'to' and 'count' are now valid
-    __ movq(word_count, count);
-    __ shrq(count, 2); // count => qword_count
+    __ movptr(word_count, count);
+    __ shrptr(count, 2); // count => qword_count
 
     // Copy from high to low addresses.  Use 'to' as scratch.
 
     // Check for and copy trailing word
-    __ testq(word_count, 1);
+    __ testl(word_count, 1);
     __ jccb(Assembler::zero, L_copy_4_bytes);
     __ movw(rax, Address(from, word_count, Address::times_2, -2));
     __ movw(Address(to, word_count, Address::times_2, -2), rax);
 
     // Check for and copy trailing dword
   __ BIND(L_copy_4_bytes);
-    __ testq(word_count, 2);
+    __ testl(word_count, 2);
     __ jcc(Assembler::zero, L_copy_32_bytes);
     __ movl(rax, Address(from, qword_count, Address::times_8));
     __ movl(Address(to, qword_count, Address::times_8), rax);
     __ jmp(L_copy_32_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
-    __ decrementq(qword_count);
+    __ decrement(qword_count);
     __ jcc(Assembler::notZero, L_copy_8_bytes);
 
     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr);
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
    // Copy in 32-byte chunks
     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
 
     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr);
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
     return start;
   }
 
   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   //             ignored
+  //   is_oop  - true => oop array, so generate store check code
   //   name    - stub name string
   //
   // Inputs:
   //   c_rarg0   - source array address
   //   c_rarg1   - destination array address

@@ -1666,13 +1723,13 @@
   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
   //
   // Side Effects:
   //   disjoint_int_copy_entry is set to the no-overlap entry point
-  //   used by generate_conjoint_int_copy().
+  //   used by generate_conjoint_int_oop_copy().
   //
-  address generate_disjoint_int_copy(bool aligned, const char *name) {
+  address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
     Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;

@@ -1681,51 +1738,66 @@
     const Register count       = rdx;  // elements count
     const Register dword_count = rcx;
     const Register qword_count = count;
     const Register end_from    = from; // source array end address
     const Register end_to      = to;   // destination array end address
+    const Register saved_to    = r11;  // saved destination array address
     // End pointers are inclusive, and if count is not zero they point
     // to the last unit copied:  end_to[0] := end_from[0]
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 
-    disjoint_int_copy_entry = __ pc();
+    (is_oop ? disjoint_oop_copy_entry : disjoint_int_copy_entry) = __ pc();
+
+    if (is_oop) {
+      // no registers are destroyed by this call
+      gen_write_ref_array_pre_barrier(/* dest */ c_rarg1, /* count */ c_rarg2);
+    }
+
     BLOCK_COMMENT("Entry:");
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 
     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                       // r9 and r10 may be used to save non-volatile registers
 
+    if (is_oop) {
+      __ movq(saved_to, to);
+    }
+
     // 'from', 'to' and 'count' are now valid
-    __ movq(dword_count, count);
-    __ shrq(count, 1); // count => qword_count
+    __ movptr(dword_count, count);
+    __ shrptr(count, 1); // count => qword_count
 
     // Copy from low to high addresses.  Use 'to' as scratch.
-    __ leaq(end_from, Address(from, qword_count, Address::times_8, -8));
-    __ leaq(end_to,   Address(to,   qword_count, Address::times_8, -8));
-    __ negq(qword_count);
+    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
+    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
+    __ negptr(qword_count);
     __ jmp(L_copy_32_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
-    __ incrementq(qword_count);
+    __ increment(qword_count);
     __ jcc(Assembler::notZero, L_copy_8_bytes);
 
     // Check for and copy trailing dword
   __ BIND(L_copy_4_bytes);
-    __ testq(dword_count, 1); // Only byte test since the value is 0 or 1
+    __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
     __ jccb(Assembler::zero, L_exit);
     __ movl(rax, Address(end_from, 8));
     __ movl(Address(end_to, 8), rax);
 
   __ BIND(L_exit);
+    if (is_oop) {
+      __ leaq(end_to, Address(saved_to, dword_count, Address::times_4, -4));
+      gen_write_ref_array_post_barrier(saved_to, end_to, rax);
+    }
     inc_counter_np(SharedRuntime::_jint_array_copy_ctr);
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
    // Copy 32-byte chunks
     copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);

@@ -1735,10 +1807,11 @@
   }
 
   // Arguments:
   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
   //             ignored
+  //   is_oop  - true => oop array, so generate store check code
   //   name    - stub name string
   //
   // Inputs:
   //   c_rarg0   - source array address
   //   c_rarg1   - destination array address

@@ -1746,65 +1819,81 @@
   //
   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
   //
-  address generate_conjoint_int_copy(bool aligned, const char *name) {
+  address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, const char *name) {
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ pc();
 
-    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes;
+    Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
     const Register from        = rdi;  // source array address
     const Register to          = rsi;  // destination array address
     const Register count       = rdx;  // elements count
     const Register dword_count = rcx;
     const Register qword_count = count;
 
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 
-    int_copy_entry = __ pc();
+    if (is_oop) {
+      // no registers are destroyed by this call
+      gen_write_ref_array_pre_barrier(/* dest */ c_rarg1, /* count */ c_rarg2);
+    }
+
+    (is_oop ? oop_copy_entry : int_copy_entry) = __ pc();
     BLOCK_COMMENT("Entry:");
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 
-    array_overlap_test(disjoint_int_copy_entry, Address::times_4);
+    array_overlap_test(is_oop ? disjoint_oop_copy_entry : disjoint_int_copy_entry,
+                       Address::times_4);
     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
                       // r9 and r10 may be used to save non-volatile registers
 
+    assert_clean_int(count, rax); // Make sure 'count' is clean int.
     // 'from', 'to' and 'count' are now valid
-    __ movq(dword_count, count);
-    __ shrq(count, 1); // count => qword_count
+    __ movptr(dword_count, count);
+    __ shrptr(count, 1); // count => qword_count
 
     // Copy from high to low addresses.  Use 'to' as scratch.
 
     // Check for and copy trailing dword
-    __ testq(dword_count, 1);
+    __ testl(dword_count, 1);
     __ jcc(Assembler::zero, L_copy_32_bytes);
     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
     __ jmp(L_copy_32_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
-    __ decrementq(qword_count);
+    __ decrement(qword_count);
     __ jcc(Assembler::notZero, L_copy_8_bytes);
 
     inc_counter_np(SharedRuntime::_jint_array_copy_ctr);
+    if (is_oop) {
+      __ jmp(L_exit);
+    }
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
    // Copy in 32-byte chunks
     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
 
     inc_counter_np(SharedRuntime::_jint_array_copy_ctr);
+    __ bind(L_exit);
+    if (is_oop) {
+      Register end_to = rdx;
+      __ leaq(end_to, Address(to, dword_count, Address::times_4, -4));
+      gen_write_ref_array_post_barrier(to, end_to, rax);
+    }
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
     return start;
   }

@@ -1857,28 +1946,28 @@
                       // r9 and r10 may be used to save non-volatile registers
 
     // 'from', 'to' and 'qword_count' are now valid
 
     // Copy from low to high addresses.  Use 'to' as scratch.
-    __ leaq(end_from, Address(from, qword_count, Address::times_8, -8));
-    __ leaq(end_to,   Address(to, qword_count, Address::times_8, -8));
-    __ negq(qword_count);
+    __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
+    __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
+    __ negptr(qword_count);
     __ jmp(L_copy_32_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
-    __ incrementq(qword_count);
+    __ increment(qword_count);
     __ jcc(Assembler::notZero, L_copy_8_bytes);
 
     if (is_oop) {
       __ jmp(L_exit);
     } else {
       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
       restore_arg_regs();
-      __ xorq(rax, rax); // return 0
+      __ xorptr(rax, rax); // return 0
       __ leave(); // required for proper stackwalking of RuntimeStub frame
       __ ret(0);
     }
 
     // Copy 64-byte chunks

@@ -1890,11 +1979,11 @@
       inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
     } else {
       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
     }
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
     return start;
   }

@@ -1924,15 +2013,18 @@
     __ enter(); // required for proper stackwalking of RuntimeStub frame
     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
 
     address disjoint_copy_entry = NULL;
     if (is_oop) {
+      assert(!UseCompressedOops, "shouldn't be called for compressed oops");
       disjoint_copy_entry = disjoint_oop_copy_entry;
       oop_copy_entry  = __ pc();
+      array_overlap_test(disjoint_oop_copy_entry, Address::times_8);
     } else {
       disjoint_copy_entry = disjoint_long_copy_entry;
       long_copy_entry = __ pc();
+      array_overlap_test(disjoint_long_copy_entry, Address::times_8);
     }
     BLOCK_COMMENT("Entry:");
     // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
 
     array_overlap_test(disjoint_copy_entry, Address::times_8);

@@ -1941,49 +2033,47 @@
 
     // 'from', 'to' and 'qword_count' are now valid
 
     if (is_oop) {
       // Save to and count for store barrier
-      __ movq(saved_count, qword_count);
+      __ movptr(saved_count, qword_count);
       // No registers are destroyed by this call
       gen_write_ref_array_pre_barrier(to, saved_count);
     }
 
-    // Copy from high to low addresses.  Use rcx as scratch.
-
     __ jmp(L_copy_32_bytes);
 
     // Copy trailing qwords
   __ BIND(L_copy_8_bytes);
     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
-    __ decrementq(qword_count);
+    __ decrement(qword_count);
     __ jcc(Assembler::notZero, L_copy_8_bytes);
 
     if (is_oop) {
       __ jmp(L_exit);
     } else {
       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
       restore_arg_regs();
-      __ xorq(rax, rax); // return 0
+      __ xorptr(rax, rax); // return 0
       __ leave(); // required for proper stackwalking of RuntimeStub frame
       __ ret(0);
     }
 
    // Copy in 32-byte chunks
     copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
 
     if (is_oop) {
     __ BIND(L_exit);
-      __ leaq(rcx, Address(to, saved_count, Address::times_8, -8));
+      __ lea(rcx, Address(to, saved_count, Address::times_8, -8));
       gen_write_ref_array_post_barrier(to, rcx, rax);
       inc_counter_np(SharedRuntime::_oop_array_copy_ctr);
     } else {
       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr);
     }
     restore_arg_regs();
-    __ xorq(rax, rax); // return 0
+    __ xorptr(rax, rax); // return 0
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
     return start;
   }
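
(Aside: the L_exit path computes an inclusive end pointer and calls gen_write_ref_array_post_barrier over the stored range. Under the default card-table barrier that boils down to the sketch below; the 512-byte card size and dirty value 0 are assumptions about the usual CardTableModRefBS setup, not taken from this file:)

    #include <cstdint>

    // Hedged sketch of the post barrier over [start, end_inclusive]; assumes
    // byte_map_base is pre-biased so that base[addr >> 9] is the card for addr.
    static void write_ref_array_post_barrier(const void* start, const void* end_inclusive,
                                             volatile uint8_t* byte_map_base) {
      const int card_shift = 9;                   // 512-byte cards
      for (uintptr_t card = (uintptr_t)start >> card_shift;
           card <= (uintptr_t)end_inclusive >> card_shift; card++) {
        byte_map_base[card] = 0;                  // dirty: card may now hold old->young refs
      }
    }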

@@ -2008,16 +2098,16 @@
                      Klass::secondary_super_cache_offset_in_bytes());
     Address secondary_supers_addr(sub_klass, ss_offset);
     Address super_cache_addr(     sub_klass, sc_offset);
 
     // if the pointers are equal, we are done (e.g., String[] elements)
-    __ cmpq(super_klass, sub_klass);
+    __ cmpptr(super_klass, sub_klass);
     __ jcc(Assembler::equal, L_success);
 
     // check the supertype display:
     Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
-    __ cmpq(super_klass, super_check_addr); // test the super type
+    __ cmpptr(super_klass, super_check_addr); // test the super type
     __ jcc(Assembler::equal, L_success);
 
     // if it was a primary super, we can just fail immediately
     __ cmpl(super_check_offset, sc_offset);
     __ jcc(Assembler::notEqual, L_miss);

@@ -2026,35 +2116,42 @@
     // The repne_scan instruction uses fixed registers, which we must spill.
     // (We need a couple more temps in any case.)
     // This code is rarely used, so simplicity is a virtue here.
     inc_counter_np(SharedRuntime::_partial_subtype_ctr);
     {
-      __ pushq(rax);
-      __ pushq(rcx);
-      __ pushq(rdi);
+      __ push(rax);
+      __ push(rcx);
+      __ push(rdi);
       assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);
 
-      __ movq(rdi, secondary_supers_addr);
+      __ movptr(rdi, secondary_supers_addr);
       // Load the array length.
       __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes())); 
       // Skip to start of data.
-      __ addq(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
+      __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
       // Scan rcx words at [rdi] for occurrence of rax
       // Set NZ/Z based on last compare
-      __ movq(rax, super_klass);
+      __ movptr(rax, super_klass);
-      __ repne_scan();
+      if (UseCompressedOops) {
+        // Compare against the compressed form.  No need to uncompress it,
+        // because the original rax is restored by the pop below.
+        __ encode_heap_oop(rax);
+        __ repne_scanl();
+      } else {
+        __ repne_scan();
+      }
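
(Aside: encode_heap_oop produces the 32-bit form that compressed-oop heaps store, which is what the scan must match against. The arithmetic is along the lines below, assuming the common base-plus-3-bit-shift scheme for 8-byte object alignment; the exact shift is a VM configuration detail:)

    #include <cstdint>

    // Hedged sketch of narrow-oop encode/decode.
    static uint32_t encode_heap_oop_sketch(uintptr_t oop, uintptr_t heap_base) {
      return (uint32_t)((oop - heap_base) >> 3);    // what the scan compares against
    }
    static uintptr_t decode_heap_oop_sketch(uint32_t narrow, uintptr_t heap_base) {
      return heap_base + ((uintptr_t)narrow << 3);  // inverse, used by load_heap_oop
    }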
 
       // Unspill the temp. registers:
-      __ popq(rdi);
-      __ popq(rcx);
-      __ popq(rax);
+      __ pop(rdi);
+      __ pop(rcx);
+      __ pop(rax);
 
       __ jcc(Assembler::notEqual, L_miss);
     }
 
     // Success.  Cache the super we found and proceed in triumph.
-    __ movq(super_cache_addr, super_klass); // note: rax is dead
+    __ movptr(super_cache_addr, super_klass); // note: rax is dead
     __ jmp(L_success);
 
     // Fall through on failure!
     __ BIND(L_miss);
   }
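
(Aside: end-to-end, generate_type_check implements the three-step subtype test sketched below in C++. The struct is illustrative only; the real offsets come from Klass::secondary_super_cache_offset_in_bytes() and friends:)

    #include <cstddef>

    // Illustrative-only shape; just enough to mirror the stub's three steps.
    struct KlassSketch {
      int           secondary_count;
      KlassSketch** secondary_supers;        // the array repne_scan walks
      KlassSketch*  secondary_super_cache;
    };

    static bool type_check_sketch(KlassSketch* sub, KlassSketch* super,
                                  size_t super_check_offset, size_t sc_offset) {
      if (sub == super) return true;                          // cmpptr; je L_success
      KlassSketch** display = (KlassSketch**)((char*)sub + super_check_offset);
      if (*display == super) return true;                     // supertype display hit
      if (super_check_offset != sc_offset) return false;      // primary super miss is final
      for (int i = 0; i < sub->secondary_count; i++) {        // the repne scan
        if (sub->secondary_supers[i] == super) {
          sub->secondary_super_cache = super;                 // movptr(super_cache_addr, ...)
          return true;
        }
      }
      return false;                                           // L_miss
    }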

@@ -2116,11 +2213,11 @@
 
 #ifdef ASSERT
     // caller guarantees that the arrays really are different
     // otherwise, we would have to make conjoint checks
     { Label L;
-      array_overlap_test(L, Address::times_8);
+      array_overlap_test(L, TIMES_OOP);
       __ stop("checkcast_copy within a single array");
       __ bind(L);
     }
 #endif //ASSERT
 

@@ -2130,20 +2227,20 @@
       saved_r14_offset,
       saved_rbp_offset,
       saved_rip_offset,
       saved_rarg0_offset
     };
-    __ subq(rsp, saved_rbp_offset * wordSize);
-    __ movq(Address(rsp, saved_r13_offset * wordSize), r13);
-    __ movq(Address(rsp, saved_r14_offset * wordSize), r14);
+    __ subptr(rsp, saved_rbp_offset * wordSize);
+    __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
+    __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
                        // ckoff => rcx, ckval => r8
                        // r9 and r10 may be used to save non-volatile registers
 #ifdef _WIN64
     // last argument (#4) is on stack on Win64
     const int ckval_offset = saved_rarg0_offset + 4;
-    __ movq(ckval, Address(rsp, ckval_offset * wordSize));
+    __ movptr(ckval, Address(rsp, ckval_offset * wordSize));
 #endif
 
     // check that int operands are properly extended to size_t
     assert_clean_int(length, rax);
     assert_clean_int(ckoff, rax);

@@ -2161,75 +2258,74 @@
       __ bind(L);
     }
 #endif //ASSERT
 
     // Loop-invariant addresses.  They are exclusive end pointers.
-    Address end_from_addr(from, length, Address::times_8, 0);
-    Address   end_to_addr(to,   length, Address::times_8, 0);
+    Address end_from_addr(from, length, TIMES_OOP, 0);
+    Address   end_to_addr(to,   length, TIMES_OOP, 0);
     // Loop-variant addresses.  They assume post-incremented count < 0.
-    Address from_element_addr(end_from, count, Address::times_8, 0);
-    Address   to_element_addr(end_to,   count, Address::times_8, 0);
-    Address oop_klass_addr(rax_oop, oopDesc::klass_offset_in_bytes());
+    Address from_element_addr(end_from, count, TIMES_OOP, 0);
+    Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
 
     gen_write_ref_array_pre_barrier(to, count);
 
     // Copy from low to high addresses, indexed from the end of each array.
-    __ leaq(end_from, end_from_addr);
-    __ leaq(end_to,   end_to_addr);
-    __ movq(r14_length, length);        // save a copy of the length
+    __ lea(end_from, end_from_addr);
+    __ lea(end_to,   end_to_addr);
+    __ movptr(r14_length, length);        // save a copy of the length
     assert(length == count, "");        // else fix next line:
-    __ negq(count);                     // negate and test the length
+    __ negptr(count);                     // negate and test the length
     __ jcc(Assembler::notZero, L_load_element);
 
     // Empty array:  Nothing to do.
-    __ xorq(rax, rax);                  // return 0 on (trivial) success
+    __ xorptr(rax, rax);                  // return 0 on (trivial) success
     __ jmp(L_done);
 
     // ======== begin loop ========
     // (Loop is rotated; its entry is L_load_element.)
     // Loop control:
     //   for (count = -count; count != 0; count++)
     // Base pointers src, dst are biased by elem_size*(count-1), to the last element.
     __ align(16);
     
     __ BIND(L_store_element);
-    __ movq(to_element_addr, rax_oop);  // store the oop
-    __ incrementq(count);               // increment the count toward zero
+    __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
+    __ increment(count);               // increment the count toward zero
     __ jcc(Assembler::zero, L_do_card_marks);
 
     // ======== loop entry is here ========
     __ BIND(L_load_element);
-    __ movq(rax_oop, from_element_addr); // load the oop
-    __ testq(rax_oop, rax_oop);
+    __ load_heap_oop(rax_oop, from_element_addr); // load the oop
+    __ testptr(rax_oop, rax_oop);
     __ jcc(Assembler::zero, L_store_element);
 
-    __ movq(r11_klass, oop_klass_addr); // query the object klass
+    __ load_klass(r11_klass, rax_oop);  // query the object klass
     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
     // ======== end loop ========
 
     // It was a real error; we must depend on the caller to finish the job.
     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
     // Emit GC store barriers for the oops we have copied (r14 + rdx),
     // and report their number to the caller.
     assert_different_registers(rax, r14_length, count, to, end_to, rcx);
-    __ leaq(end_to, to_element_addr);
-    gen_write_ref_array_post_barrier(to, end_to, rcx);
-    __ movq(rax, r14_length);           // original oops
-    __ addq(rax, count);                // K = (original - remaining) oops
-    __ notq(rax);                       // report (-1^K) to caller
+    __ lea(end_to, to_element_addr);
+    gen_write_ref_array_post_barrier(to, end_to, rscratch1);
+    __ movptr(rax, r14_length);           // original oops
+    __ addptr(rax, count);                // K = (original - remaining) oops
+    __ notptr(rax);                       // report (-1^K) to caller
     __ jmp(L_done);
 
     // Come here on success only.
     __ BIND(L_do_card_marks);
-    __ addq(end_to, -wordSize);         // make an inclusive end pointer
-    gen_write_ref_array_post_barrier(to, end_to, rcx);
-    __ xorq(rax, rax);                  // return 0 on success
+    __ addptr(end_to, -wordSize);         // make an inclusive end pointer
+    gen_write_ref_array_post_barrier(to, end_to, rscratch1);
+    __ xorptr(rax, rax);                  // return 0 on success
 
     // Common exit point (success or failure).
     __ BIND(L_done);
-    __ movq(r13, Address(rsp, saved_r13_offset * wordSize));
-    __ movq(r14, Address(rsp, saved_r14_offset * wordSize));
+    __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
+    __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
     restore_arg_regs();
     __ leave(); // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
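
(Aside: the register comments above encode the stub's return contract: 0 for a complete copy, otherwise the bitwise complement of the number of elements copied before the type check failed. As a caller-level C++ sketch, with oop_t and can_store standing in for the real types and the generated subtype check:)

    #include <cstddef>
    #include <cstdint>

    typedef void* oop_t;   // placeholder, not a HotSpot name

    static intptr_t checkcast_copy_sketch(oop_t* from, oop_t* to, size_t count,
                                          bool (*can_store)(oop_t)) {
      for (size_t k = 0; k < count; k++) {
        oop_t elem = from[k];                    // load_heap_oop
        if (elem != NULL && !can_store(elem))    // NULLs store unchecked; others type-check
          return ~(intptr_t)k;                   // notptr(rax): ~K; note ~0 == -1 means none copied
        to[k] = elem;                            // store_heap_oop
      }
      return 0;                                  // the L_do_card_marks/L_done success path
    }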
 

@@ -2268,13 +2364,13 @@
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
     // bump this on entry, not on exit:
     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
 
-    __ movq(bits, from);
-    __ orq(bits, to);
-    __ orq(bits, size);
+    __ mov(bits, from);
+    __ orptr(bits, to);
+    __ orptr(bits, size);
 
     __ testb(bits, BytesPerLong-1);
     __ jccb(Assembler::zero, L_long_aligned);
 
     __ testb(bits, BytesPerInt-1);

@@ -2282,19 +2378,19 @@
 
     __ testb(bits, BytesPerShort-1);
     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
 
     __ BIND(L_short_aligned);
-    __ shrq(size, LogBytesPerShort); // size => short_count
+    __ shrptr(size, LogBytesPerShort); // size => short_count
     __ jump(RuntimeAddress(short_copy_entry));
 
     __ BIND(L_int_aligned);
-    __ shrq(size, LogBytesPerInt); // size => int_count
+    __ shrptr(size, LogBytesPerInt); // size => int_count
     __ jump(RuntimeAddress(int_copy_entry));
 
     __ BIND(L_long_aligned);
-    __ shrq(size, LogBytesPerLong); // size => qword_count
+    __ shrptr(size, LogBytesPerLong); // size => qword_count
     __ jump(RuntimeAddress(long_copy_entry));
 
     return start;
   }
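
(Aside: OR-ing from, to and size into bits lets a single testb per granularity answer "are all three n-byte aligned?". Equivalent C++, as a sketch:)

    #include <cstddef>
    #include <cstdint>

    // Sketch: the coarsest element width dividing src, dst, and the byte count.
    static int unsafe_copy_element_size(uintptr_t from, uintptr_t to, size_t size) {
      uintptr_t bits = from | to | size;   // orptr: one value carries every low bit
      if ((bits & 7) == 0) return 8;       // testb(bits, BytesPerLong-1)  -> long copy
      if ((bits & 3) == 0) return 4;       // testb(bits, BytesPerInt-1)   -> int copy
      if ((bits & 1) == 0) return 2;       // testb(bits, BytesPerShort-1) -> short copy
      return 1;                            // fall back to the byte copy
    }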
 

@@ -2398,20 +2494,20 @@
     // (7) src_pos + length must not exceed length of src.
     // (8) dst_pos + length must not exceed length of dst.
     // 
 
     //  if (src == NULL) return -1;
-    __ testq(src, src);         // src oop
+    __ testptr(src, src);         // src oop
     size_t j1off = __ offset();
     __ jccb(Assembler::zero, L_failed_0);
 
     //  if (src_pos < 0) return -1;
     __ testl(src_pos, src_pos); // src_pos (32-bits)
     __ jccb(Assembler::negative, L_failed_0);
 
     //  if (dst == NULL) return -1;
-    __ testq(dst, dst);         // dst oop
+    __ testptr(dst, dst);         // dst oop
     __ jccb(Assembler::zero, L_failed_0);
 
     //  if (dst_pos < 0) return -1;
     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
     size_t j4off = __ offset();

@@ -2426,29 +2522,29 @@
     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
 
     // registers used as temp
     const Register r11_length    = r11; // elements count to copy
     const Register r10_src_klass = r10; // array klass
+    const Register r9_dst_klass  = r9;  // dest array klass
 
     //  if (length < 0) return -1;
     __ movl(r11_length, C_RARG4);       // length (element count, 32-bit value)
     __ testl(r11_length, r11_length);
     __ jccb(Assembler::negative, L_failed_0);
 
-    Address src_klass_addr(src, oopDesc::klass_offset_in_bytes());
-    Address dst_klass_addr(dst, oopDesc::klass_offset_in_bytes());
-    __ movq(r10_src_klass, src_klass_addr);
+    __ load_klass(r10_src_klass, src);
 #ifdef ASSERT
     //  assert(src->klass() != NULL);
     BLOCK_COMMENT("assert klasses not null");
     { Label L1, L2;
-      __ testq(r10_src_klass, r10_src_klass);
+      __ testptr(r10_src_klass, r10_src_klass);
       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
       __ bind(L1);
       __ stop("broken null klass");
       __ bind(L2);
-      __ cmpq(dst_klass_addr, 0);
+      __ load_klass(r9_dst_klass, dst);
+      __ cmpq(r9_dst_klass, 0);
       __ jcc(Assembler::equal, L1);     // this would be broken also
       BLOCK_COMMENT("assert done");
     }
 #endif
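
(Aside: load_klass replaces the raw movq of the klass word because a compressed-oop header stores a narrow value that must be decoded before it can be used as a pointer, which is also why the cmpq against a klass Address above became a load into r9_dst_klass followed by a register compare. A hedged sketch, where klass_offset stands in for oopDesc::klass_offset_in_bytes():)

    #include <cstddef>
    #include <cstdint>

    static void* load_klass_sketch(const void* obj, size_t klass_offset,
                                   bool compressed, uintptr_t heap_base) {
      const char* p = (const char*)obj + klass_offset;
      if (compressed) {
        uint32_t narrow = *(const uint32_t*)p;                 // narrow klass word
        return (void*)(heap_base + ((uintptr_t)narrow << 3));  // decode before use
      }
      return *(void* const*)p;                                 // the old movq path
    }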
 

@@ -2471,11 +2567,12 @@
     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
     __ cmpl(rax_lh, objArray_lh);
     __ jcc(Assembler::equal, L_objArray);
 
     //  if (src->klass() != dst->klass()) return -1;
-    __ cmpq(r10_src_klass, dst_klass_addr);
+    __ load_klass(r9_dst_klass, dst);
+    __ cmpq(r10_src_klass, r9_dst_klass);
     __ jcc(Assembler::notEqual, L_failed);
 
     //  if (!src->is_Array()) return -1;
     __ cmpl(rax_lh, Klass::_lh_neutral_value);
     __ jcc(Assembler::greaterEqual, L_failed);

@@ -2502,13 +2599,13 @@
     const Register r10_offset = r10;    // array offset
     const Register rax_elsize = rax_lh; // element size
 
     __ movl(r10_offset, rax_lh);
     __ shrl(r10_offset, Klass::_lh_header_size_shift);
-    __ andq(r10_offset, Klass::_lh_header_size_mask);   // array_offset
-    __ addq(src, r10_offset);           // src array offset
-    __ addq(dst, r10_offset);           // dst array offset
+    __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
+    __ addptr(src, r10_offset);           // src array offset
+    __ addptr(dst, r10_offset);           // dst array offset
     BLOCK_COMMENT("choose copy loop based on element size");
     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
 
     // next registers should be set before the jump to corresponding stub
     const Register from     = c_rarg0;  // source array address

@@ -2519,29 +2616,29 @@
     // since they are the same as 'src', 'src_pos', 'dst'.
 
   __ BIND(L_copy_bytes);
     __ cmpl(rax_elsize, 0);
     __ jccb(Assembler::notEqual, L_copy_shorts);
-    __ leaq(from, Address(src, src_pos, Address::times_1, 0));// src_addr
-    __ leaq(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
-    __ movslq(count, r11_length); // length
+    __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
+    __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
+    __ movl2ptr(count, r11_length); // length
     __ jump(RuntimeAddress(byte_copy_entry));
 
   __ BIND(L_copy_shorts);
     __ cmpl(rax_elsize, LogBytesPerShort);
     __ jccb(Assembler::notEqual, L_copy_ints);
-    __ leaq(from, Address(src, src_pos, Address::times_2, 0));// src_addr
-    __ leaq(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
-    __ movslq(count, r11_length); // length
+    __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
+    __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
+    __ movl2ptr(count, r11_length); // length
     __ jump(RuntimeAddress(short_copy_entry));
 
   __ BIND(L_copy_ints);
     __ cmpl(rax_elsize, LogBytesPerInt);
     __ jccb(Assembler::notEqual, L_copy_longs);
-    __ leaq(from, Address(src, src_pos, Address::times_4, 0));// src_addr
-    __ leaq(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
-    __ movslq(count, r11_length); // length
+    __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
+    __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
+    __ movl2ptr(count, r11_length); // length
     __ jump(RuntimeAddress(int_copy_entry));
 
   __ BIND(L_copy_longs);
 #ifdef ASSERT
     { Label L;

@@ -2549,42 +2646,43 @@
       __ jcc(Assembler::equal, L);
       __ stop("must be long copy, but elsize is wrong");
       __ bind(L);
     }
 #endif
-    __ leaq(from, Address(src, src_pos, Address::times_8, 0));// src_addr
-    __ leaq(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
-    __ movslq(count, r11_length); // length
+    __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
+    __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
+    __ movl2ptr(count, r11_length); // length
     __ jump(RuntimeAddress(long_copy_entry));
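
(Aside: movl2ptr, used in each of the dispatch arms above, is a sign-extending move, movslq on this port; the 32-bit length was already checked non-negative, so the widening is value-preserving:)

    #include <cassert>
    #include <cstdint>

    // Sketch of the widening performed by the movl2ptr(count, r11_length) calls.
    static intptr_t widen_length(int32_t length) {
      assert(length >= 0 && "rejected earlier by the testl/negative check");
      return (intptr_t)length;  // movslq: sign-extends, a no-op for non-negative values
    }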
 
     // objArrayKlass
   __ BIND(L_objArray);
     // live at this point:  r10_src_klass, src[_pos], dst[_pos]
 
     Label L_plain_copy, L_checkcast_copy;
     //  test array classes for subtyping
-    __ cmpq(r10_src_klass, dst_klass_addr); // usual case is exact equality
+    __ load_klass(r9_dst_klass, dst);
+    __ cmpq(r10_src_klass, r9_dst_klass); // usual case is exact equality
     __ jcc(Assembler::notEqual, L_checkcast_copy);
 
     // Identically typed arrays can be copied without element-wise checks.
     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                            r10, L_failed);
 
-    __ leaq(from, Address(src, src_pos, Address::times_8,
+    __ lea(from, Address(src, src_pos, TIMES_OOP,
                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
-    __ leaq(to,   Address(dst, dst_pos, Address::times_8,
+    __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
-    __ movslq(count, r11_length); // length
+    __ movl2ptr(count, r11_length); // length
   __ BIND(L_plain_copy);
     __ jump(RuntimeAddress(oop_copy_entry));
 
   __ BIND(L_checkcast_copy);
     // live at this point:  r10_src_klass, !r11_length
     {
       // assert(r11_length == C_RARG4); // will reload from here
       Register r11_dst_klass = r11;
-      __ movq(r11_dst_klass, dst_klass_addr);
+      __ load_klass(r11_dst_klass, dst);
 
       // Before looking at dst.length, make sure dst is also an objArray.
       __ cmpl(Address(r11_dst_klass, lh_offset), objArray_lh);
       __ jcc(Assembler::notEqual, L_failed);
 

@@ -2594,17 +2692,17 @@
                              rax, L_failed);
 #else
       __ movl(r11_length, C_RARG4);     // reload
       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
                              rax, L_failed);
-      __ movl(r11_dst_klass, dst_klass_addr); // reload
+      __ load_klass(r11_dst_klass, dst); // reload
 #endif
 
       // Marshal the base address arguments now, freeing registers.
-      __ leaq(from, Address(src, src_pos, Address::times_8,
+      __ lea(from, Address(src, src_pos, TIMES_OOP,
                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
-      __ leaq(to,   Address(dst, dst_pos, Address::times_8,
+      __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
       __ movl(count, C_RARG4);          // length (reloaded)
       Register sco_temp = c_rarg3;      // this register is free now
       assert_different_registers(from, to, count, sco_temp,
                                  r11_dst_klass, r10_src_klass);

@@ -2618,23 +2716,23 @@
       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
 
       // Fetch destination element klass from the objArrayKlass header.
       int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
                        objArrayKlass::element_klass_offset_in_bytes());
-      __ movq(r11_dst_klass, Address(r11_dst_klass, ek_offset));
+      __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
       __ movl(sco_temp,      Address(r11_dst_klass, sco_offset));
       assert_clean_int(sco_temp, rax);
 
       // the checkcast_copy loop needs two extra arguments:
       assert(c_rarg3 == sco_temp, "#3 already in place");
-      __ movq(C_RARG4, r11_dst_klass);  // dst.klass.element_klass
+      __ movptr(C_RARG4, r11_dst_klass);  // dst.klass.element_klass
       __ jump(RuntimeAddress(checkcast_copy_entry));
     }
 
   __ BIND(L_failed);
-    __ xorq(rax, rax);
-    __ notq(rax); // return -1
+    __ xorptr(rax, rax);
+    __ notptr(rax); // return -1
     __ leave();   // required for proper stackwalking of RuntimeStub frame
     __ ret(0);
 
     return start;
   }
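
(Aside: the shrl/andptr/andl sequence earlier in this stub unpacks the Klass layout helper into an array header size and a log2 element size, which drive the element-size dispatch; any mismatch ends in the xorptr/notptr return of -1. In C++ terms, with placeholder names for the Klass::_lh_* constants:)

    // Sketch of the layout-helper decode; shift/mask parameters are placeholders.
    struct ArrayLayoutSketch { int header_bytes; int log2_elem_size; };

    static ArrayLayoutSketch decode_layout_helper(int lh,
                                                  int header_size_shift,
                                                  int header_size_mask,
                                                  int log2_element_size_mask) {
      ArrayLayoutSketch r;
      r.header_bytes   = (lh >> header_size_shift) & header_size_mask;  // array base offset
      r.log2_elem_size = lh & log2_element_size_mask;                   // 0..3 => byte..long copy
      return r;
    }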

@@ -2649,18 +2747,24 @@
     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
 
     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, "jshort_arraycopy");
 
-    StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
-    StubRoutines::_jint_arraycopy            = generate_conjoint_int_copy(false, "jint_arraycopy");
+    StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, "jint_disjoint_arraycopy");
+    StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, "jint_arraycopy");
 
     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, "jlong_disjoint_arraycopy");
     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, "jlong_arraycopy");
 
+
+    if (UseCompressedOops) {
+      StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, "oop_disjoint_arraycopy");
+      StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, "oop_arraycopy");
+    } else {
-    StubRoutines::_oop_disjoint_arraycopy    = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
-    StubRoutines::_oop_arraycopy             = generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
+      StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, "oop_disjoint_arraycopy");
+      StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, "oop_arraycopy");
+    }
 
     StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy");
     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy");
     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy");
 

@@ -2727,30 +2831,30 @@
     // This is an inlined and slightly modified version of call_VM
     // which has the ability to fetch the return PC out of
     // thread-local storage and also sets up last_Java_sp slightly
     // differently than the real call_VM
     if (restore_saved_exception_pc) {
-      __ movq(rax,
+      __ movptr(rax,
               Address(r15_thread,
                       in_bytes(JavaThread::saved_exception_pc_offset())));
-      __ pushq(rax);
+      __ push(rax);
     }
       
     __ enter(); // required for proper stackwalking of RuntimeStub frame
 
     assert(is_even(framesize/2), "sp not 16-byte aligned");
 
     // return address and rbp are already in place
-    __ subq(rsp, (framesize-4) << LogBytesPerInt); // prolog
+    __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
 
     int frame_complete = __ pc() - start;
 
     // Set up last_Java_sp and last_Java_fp
     __ set_last_Java_frame(rsp, rbp, NULL);
 
     // Call runtime
-    __ movq(c_rarg0, r15_thread);
+    __ movptr(c_rarg0, r15_thread);
     BLOCK_COMMENT("call runtime_entry");
     __ call(RuntimeAddress(runtime_entry));
 
     // Generate oop map
     OopMap* map = new OopMap(framesize, 0);

@@ -2762,12 +2866,12 @@
     __ leave(); // required for proper stackwalking of RuntimeStub frame
 
     // check for pending exceptions
 #ifdef ASSERT
     Label L;
-    __ cmpq(Address(r15_thread, Thread::pending_exception_offset()),
-            (int) NULL);
+    __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
+            (int32_t) NULL_WORD);
     __ jcc(Assembler::notEqual, L);
     __ should_not_reach_here();
     __ bind(L);
 #endif // ASSERT
     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

@@ -2786,11 +2890,11 @@
   // Initialization
   void generate_initial() {
     // Generates all stubs and initializes the entry points
 
     // This platform-specific stub is needed by generate_call_stub()
-    StubRoutines::amd64::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
+    StubRoutines::x86::_mxcsr_std        = generate_fp_mask("mxcsr_std",        0x0000000000001F80);
 
     // entry points that exist in all platforms. Note: This is code
     // that could be shared among different platforms - however the
     // benefit seems to be smaller than the disadvantage of having a
     // much more complicated generator structure. See also comment in

@@ -2815,13 +2919,13 @@
 
     StubRoutines::_handler_for_unsafe_access_entry =
       generate_handler_for_unsafe_access();
 
     // platform dependent
-    StubRoutines::amd64::_get_previous_fp_entry = generate_get_previous_fp();
+    StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
 
-    StubRoutines::amd64::_verify_mxcsr_entry    = generate_verify_mxcsr();
+    StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
   }
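
(Aside: 0x1F80 is the x86 power-on default for MXCSR: all six SSE exception types masked, round-to-nearest. The check generate_verify_mxcsr emits amounts to the sketch below; the mask parameter plays the role of MXCSR_MASK, keeping sticky status flags from causing false alarms:)

    #include <cstdint>

    static bool mxcsr_is_standard(uint32_t mxcsr, uint32_t control_mask) {
      const uint32_t mxcsr_std = 0x1F80;           // all SSE exceptions masked, round-to-nearest
      return (mxcsr & control_mask) == mxcsr_std;  // stmxcsr; andl; cmp32 in the generated code
    }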
 
   void generate_all() {
     // Generates all stubs and initializes the entry points
     

@@ -2869,19 +2973,19 @@
                                                 SharedRuntime::
                                                 throw_StackOverflowError),
                                false);
 
     // entry points that are platform specific  
-    StubRoutines::amd64::_f2i_fixup = generate_f2i_fixup();
-    StubRoutines::amd64::_f2l_fixup = generate_f2l_fixup();
-    StubRoutines::amd64::_d2i_fixup = generate_d2i_fixup();
-    StubRoutines::amd64::_d2l_fixup = generate_d2l_fixup();
-
-    StubRoutines::amd64::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
-    StubRoutines::amd64::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
-    StubRoutines::amd64::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
-    StubRoutines::amd64::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
+    StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
+    StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
+    StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
+    StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
+
+    StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
+    StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
+    StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
+    StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
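
(Aside: these mask constants serve the SSE abs/negate idiom: AND with a _sign_mask clears sign bits, XOR with a _sign_flip toggles them, one lane per 32 or 64 bits. For a single float lane, as a sketch:)

    #include <cstdint>
    #include <cstring>

    // Sketch of what andps with float_sign_mask computes, one 32-bit lane at a time.
    static float fabs_via_mask(float x) {
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof bits);
      bits &= 0x7FFFFFFFu;                 // one lane of float_sign_mask; XOR 0x80000000 would negate
      std::memcpy(&x, &bits, sizeof bits);
      return x;
    }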
 
     // support for verify_oop (must happen after universe_init)
     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
 
     // arraycopy stubs used by compilers