--- old/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp 2015-08-12 14:08:48.000000000 +0200 +++ new/src/cpu/aarch64/vm/c1_CodeStubs_aarch64.cpp 2015-08-12 14:08:48.000000000 +0200 @@ -347,6 +347,9 @@ __ b(_continuation); } +void C1ThreadLocalSafepoint::emit_code(LIR_Assembler* ce) { + ShouldNotReachHere(); +} ///////////////////////////////////////////////////////////////////////////// #if INCLUDE_ALL_GCS --- old/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp 2015-08-12 14:08:49.000000000 +0200 +++ new/src/cpu/aarch64/vm/c1_LIRAssembler_aarch64.cpp 2015-08-12 14:08:49.000000000 +0200 @@ -486,7 +486,7 @@ // This is the fast version of java.lang.String.compare; it has not // OSR-entry and therefore, we generate a slow version for OSR's -void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, CodeEmitInfo* info) { +void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { __ mov(r2, (address)__FUNCTION__); __ call_Unimplemented(); } @@ -538,7 +538,7 @@ __ bind(nope); } -void LIR_Assembler::return_op(LIR_Opr result) { +void LIR_Assembler::return_op(LIR_Opr result, C1ThreadLocalSafepoint *tls_stub) { assert(result->is_illegal() || !result->is_single_cpu() || result->as_register() == r0, "word returns are in r0,"); // Pop the stack before the safepoint code __ remove_frame(initial_frame_size_in_bytes()); @@ -547,7 +547,7 @@ __ ret(lr); } -int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) { +int LIR_Assembler::safepoint_poll(LIR_Opr tmp, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { address polling_page(os::get_polling_page()); guarantee(info != NULL, "Shouldn't be NULL"); assert(os::is_poll_address(polling_page), "should be"); --- old/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp 2015-08-12 14:08:51.000000000 +0200 +++ new/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp 2015-08-12 14:08:50.000000000 +0200 @@ -452,6 +452,9 @@ __ delayed()->nop(); } +void C1ThreadLocalSafepoint::emit_code(LIR_Assembler* ce) { + ShouldNotReachHere(); +} /////////////////////////////////////////////////////////////////////////////////// #if INCLUDE_ALL_GCS --- old/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp 2015-08-12 14:08:52.000000000 +0200 +++ new/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp 2015-08-12 14:08:52.000000000 +0200 @@ -235,7 +235,7 @@ // Optimized Library calls // This is the fast version of java.lang.String.compare; it has not // OSR-entry and therefore, we generate a slow version for OSR's -void LIR_Assembler::emit_string_compare(LIR_Opr left, LIR_Opr right, LIR_Opr dst, CodeEmitInfo* info) { +void LIR_Assembler::emit_string_compare(LIR_Opr left, LIR_Opr right, LIR_Opr dst, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { Register str0 = left->as_register(); Register str1 = right->as_register(); @@ -1564,7 +1564,7 @@ } -void LIR_Assembler::return_op(LIR_Opr result) { +void LIR_Assembler::return_op(LIR_Opr result, C1ThreadLocalSafepoint *tls_stub) { // the poll may need a register so just pick one that isn't the return register #if defined(TIERED) && !defined(_LP64) if (result->type_field() == LIR_OprDesc::long_type) { @@ -1588,7 +1588,7 @@ } -int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) { +int LIR_Assembler::safepoint_poll(LIR_Opr tmp, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { __ set((intptr_t)os::get_polling_page(), tmp->as_register()); if (info != NULL) { add_debug_info_for_branch(info); --- old/src/cpu/x86/vm/assembler_x86.cpp 2015-08-12 
14:08:53.000000000 +0200 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2015-08-12 14:08:53.000000000 +0200 @@ -3498,6 +3498,14 @@ emit_arith_b(0xF6, 0xC0, dst, imm8); } +void Assembler::testb(Address dst, int8_t imm8) { + InstructionMark im(this); + prefix(dst); + emit_int8((unsigned char)0xF6); + emit_operand(rax, dst, 1); + emit_int8((unsigned char)imm8); +} + void Assembler::testl(Register dst, int32_t imm32) { // not using emit_arith because test // doesn't support sign-extension of --- old/src/cpu/x86/vm/assembler_x86.hpp 2015-08-12 14:08:55.000000000 +0200 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2015-08-12 14:08:54.000000000 +0200 @@ -1818,6 +1818,7 @@ void subss(XMMRegister dst, XMMRegister src); void testb(Register dst, int imm8); + void testb(Address dst, int8_t imm8); void testl(Register dst, int32_t imm32); void testl(Register dst, Register src); --- old/src/cpu/x86/vm/c1_CodeStubs_x86.cpp 2015-08-12 14:08:56.000000000 +0200 +++ new/src/cpu/x86/vm/c1_CodeStubs_x86.cpp 2015-08-12 14:08:56.000000000 +0200 @@ -518,6 +518,33 @@ __ jmp(_continuation); } +void C1ThreadLocalSafepoint::emit_code(LIR_Assembler* ce) { +#ifdef _LP64 + __ bind(_entry); + InternalAddress pc_addr(safepoint_pc()); + __ lea(rscratch1, pc_addr); + __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rscratch1); + + address stub; + + if (is_return()) { + assert(SharedRuntime::polling_page_return_handler_blob() != NULL, + "polling page return stub not created yet"); + stub = SharedRuntime::polling_page_return_handler_blob()->entry_point(); + } else { + assert(SharedRuntime::polling_page_safepoint_handler_blob() != NULL, + "polling page safepoint stub not created yet"); + stub = SharedRuntime::polling_page_safepoint_handler_blob()->entry_point(); + } + + RuntimeAddress callback_addr(stub); + + __ jump(callback_addr); +#else + ShouldNotReachHere(); +#endif /* _LP64 */ +} + ///////////////////////////////////////////////////////////////////////////// #if INCLUDE_ALL_GCS --- old/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp 2015-08-12 14:08:57.000000000 +0200 +++ new/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp 2015-08-12 14:08:57.000000000 +0200 @@ -26,6 +26,7 @@ #include "asm/macroAssembler.hpp" #include "asm/macroAssembler.inline.hpp" #include "c1/c1_Compilation.hpp" +#include "c1/c1_CodeStubs.hpp" #include "c1/c1_LIRAssembler.hpp" #include "c1/c1_MacroAssembler.hpp" #include "c1/c1_Runtime1.hpp" @@ -513,7 +514,7 @@ // This is the fast version of java.lang.String.compare; it has not // OSR-entry and therefore, we generate a slow version for OSR's -void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, CodeEmitInfo* info) { +void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info) { __ movptr (rbx, rcx); // receiver is in rcx __ movptr (rax, arg1->as_register()); @@ -583,7 +584,7 @@ __ bind(noLoop); __ pop(rax); - return_op(LIR_OprFact::illegalOpr); + return_op(LIR_OprFact::illegalOpr, tls_stub); __ bind(haveResult); // leave instruction is going to discard the TOS value @@ -591,7 +592,7 @@ } -void LIR_Assembler::return_op(LIR_Opr result) { +void LIR_Assembler::return_op(LIR_Opr result, C1ThreadLocalSafepoint *code_stub) { assert(result->is_illegal() || !result->is_single_cpu() || result->as_register() == rax, "word returns are in rax,"); if (!result->is_illegal() && result->is_float_kind() && !result->is_xmm_register()) { assert(result->fpu() == 0, "result must already be on TOS"); @@ -604,33 +605,58 @@ // 
Note: we do not need to round double result; float result has the right precision // the poll sets the condition code, but no data registers - AddressLiteral polling_page(os::get_polling_page(), relocInfo::poll_return_type); - if (Assembler::is_polling_page_far()) { - __ lea(rscratch1, polling_page); - __ relocate(relocInfo::poll_return_type); - __ testl(rax, Address(rscratch1, 0)); + if (!ThreadLocalSafepoints) { + AddressLiteral polling_page(os::get_polling_page(), relocInfo::poll_return_type); + + if (Assembler::is_polling_page_far()) { + __ lea(rscratch1, polling_page); + __ relocate(relocInfo::poll_return_type); + __ testl(rax, Address(rscratch1, 0)); + } else { + __ testl(rax, polling_page); + } } else { - __ testl(rax, polling_page); +#ifdef _LP64 + code_stub->set_safepoint_pc(__ pc()); + __ relocate(relocInfo::poll_return_type); + __ testb(Address(r15_thread, Thread::yieldpoint_offset()), 2); + __ jcc(Assembler::equal, *code_stub->entry()); +#else + ShouldNotReachHere(); +#endif } __ ret(0); } -int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) { +int LIR_Assembler::safepoint_poll(LIR_Opr tmp, C1ThreadLocalSafepoint *code_stub, CodeEmitInfo* info) { AddressLiteral polling_page(os::get_polling_page(), relocInfo::poll_type); guarantee(info != NULL, "Shouldn't be NULL"); int offset = __ offset(); - if (Assembler::is_polling_page_far()) { - __ lea(rscratch1, polling_page); - offset = __ offset(); - add_debug_info_for_branch(info); - __ relocate(relocInfo::poll_type); - __ testl(rax, Address(rscratch1, 0)); + if (!ThreadLocalSafepoints) { + if (Assembler::is_polling_page_far()) { + __ lea(rscratch1, polling_page); + offset = __ offset(); + add_debug_info_for_branch(info); + __ relocate(relocInfo::poll_type); + __ testl(rax, Address(rscratch1, 0)); + } else { + add_debug_info_for_branch(info); + __ testl(rax, polling_page); + } } else { +#ifdef _LP64 add_debug_info_for_branch(info); - __ testl(rax, polling_page); + code_stub->set_safepoint_pc(__ pc()); + __ relocate(relocInfo::poll_type); + __ testb(Address(r15_thread, Thread::yieldpoint_offset()), 1); + __ jcc(Assembler::equal, *code_stub->entry()); +#else + ShouldNotReachHere(); +#endif } + return offset; } --- old/src/cpu/x86/vm/c1_Runtime1_x86.cpp 2015-08-12 14:08:58.000000000 +0200 +++ new/src/cpu/x86/vm/c1_Runtime1_x86.cpp 2015-08-12 14:08:58.000000000 +0200 @@ -1774,12 +1774,17 @@ NOT_LP64(__ get_thread(thread);) - __ cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val()); - __ jcc(Assembler::equal, done); - - __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); - __ cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val()); - __ jcc(Assembler::equal, done); + if (G1ElideMembar) { + __ cmpb(Address(card_addr, 0), (int)CardTableModRefBS::clean_card_val()); + __ jcc(Assembler::notEqual, done); + } else { + __ cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val()); + __ jcc(Assembler::equal, done); + + __ membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); + __ cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val()); + __ jcc(Assembler::equal, done); + } // storing region crossing non-NULL, card is clean. // dirty card and log. 
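For orientation, the C1 changes above reduce the safepoint fast path to a single thread-local test: with ThreadLocalSafepoints enabled, return_op() and safepoint_poll() no longer read the shared polling page but test a flag byte in the current JavaThread (reached via r15) and branch to the out-of-line C1ThreadLocalSafepoint stub, which records the polling pc and jumps to the matching SharedRuntime handler blob. The standalone C++ sketch below shows only that shape; ToyThread, yieldpoint_flags, PollKind and safepoint_slow_path are invented names, and the patch's actual bit values and branch sense (the emitted jcc(Assembler::equal, ...)) are its own convention rather than something this sketch asserts.

    // Illustrative sketch only -- not HotSpot code.
    #include <cstdint>

    struct ToyThread {                      // hypothetical stand-in for JavaThread
      volatile uint8_t yieldpoint_flags;    // stand-in for the field at Thread::yieldpoint_offset()
    };

    enum PollKind : uint8_t { POLL_IN_LOOP = 1, POLL_AT_RETURN = 2 };  // mirrors the 1/2 immediates

    void safepoint_slow_path(ToyThread*) {
      // Stand-in for the C1ThreadLocalSafepoint stub: the real stub stores the polling pc
      // into saved_exception_pc and tail-calls the polling-page (return) handler blob.
    }

    inline void poll(ToyThread* self, PollKind kind) {
      // Compiled code emits roughly: testb [r15 + yieldpoint_offset], kind ; jcc <stub>
      if (self->yieldpoint_flags & kind) {  // "flag set" means armed in this sketch
        safepoint_slow_path(self);
      }
    }

    int main() {
      ToyThread t = { 0 };
      t.yieldpoint_flags = POLL_IN_LOOP | POLL_AT_RETURN;  // roughly what arming a thread amounts to
      poll(&t, POLL_AT_RETURN);                            // armed, so the slow path is taken
      return 0;
    }

Because the poll reads thread-local state, arming can be done per thread (see the SafepointSynchronize::begin() change near the end of this patch), and the stub's is_return flag selects the return handler blob rather than the ordinary safepoint handler blob.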
--- old/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-08-12 14:08:59.000000000 +0200 +++ new/src/cpu/x86/vm/macroAssembler_x86.cpp 2015-08-12 14:08:59.000000000 +0200 @@ -4223,13 +4223,17 @@ movptr(cardtable, (intptr_t)ct->byte_map_base); addptr(card_addr, cardtable); - cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val()); - jcc(Assembler::equal, done); - - membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); - cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val()); - jcc(Assembler::equal, done); + if (G1ElideMembar) { + cmpb(Address(card_addr, 0), (int)CardTableModRefBS::clean_card_val()); + jcc(Assembler::notEqual, done); + } else { + cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val()); + jcc(Assembler::equal, done); + membar(Assembler::Membar_mask_bits(Assembler::StoreLoad)); + cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val()); + jcc(Assembler::equal, done); + } // storing a region crossing, non-NULL oop, card is clean. // dirty card and log. --- old/src/cpu/x86/vm/nativeInst_x86.hpp 2015-08-12 14:09:01.000000000 +0200 +++ new/src/cpu/x86/vm/nativeInst_x86.hpp 2015-08-12 14:09:01.000000000 +0200 @@ -526,7 +526,11 @@ inline bool NativeInstruction::is_cond_jump() { return (int_at(0) & 0xF0FF) == 0x800F /* long jump */ || (ubyte_at(0) & 0xF0) == 0x70; /* short jump */ } inline bool NativeInstruction::is_safepoint_poll() { + // TODO: Fix up parsing of safepoint poll code. Skipping now as it doesn't seem to be used for much other than asserts. #ifdef AMD64 + if (ThreadLocalSafepoints) { + return true; + } if (Assembler::is_polling_page_far()) { // two cases, depending on the choice of the base register in the address. if (((ubyte_at(0) & NativeTstRegMem::instruction_rex_prefix_mask) == NativeTstRegMem::instruction_rex_prefix && --- old/src/cpu/x86/vm/relocInfo_x86.cpp 2015-08-12 14:09:02.000000000 +0200 +++ new/src/cpu/x86/vm/relocInfo_x86.cpp 2015-08-12 14:09:02.000000000 +0200 @@ -180,7 +180,7 @@ void poll_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) { #ifdef _LP64 - if (!Assembler::is_polling_page_far()) { + if (!ThreadLocalSafepoints && !Assembler::is_polling_page_far()) { typedef Assembler::WhichOperand WhichOperand; WhichOperand which = (WhichOperand) format(); // This format is imm but it is really disp32 @@ -202,7 +202,7 @@ void poll_return_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) { #ifdef _LP64 - if (!Assembler::is_polling_page_far()) { + if (!ThreadLocalSafepoints && !Assembler::is_polling_page_far()) { typedef Assembler::WhichOperand WhichOperand; WhichOperand which = (WhichOperand) format(); // This format is imm but it is really disp32 --- old/src/cpu/x86/vm/x86_64.ad 2015-08-12 14:09:03.000000000 +0200 +++ new/src/cpu/x86/vm/x86_64.ad 2015-08-12 14:09:03.000000000 +0200 @@ -938,7 +938,11 @@ st->print_cr("popq rbp"); if (do_polling() && C->is_method_compilation()) { st->print("\t"); - if (Assembler::is_polling_page_far()) { + if (ThreadLocalSafepoints) { + st->print_cr("testb $1, [r15]\t" + "# Safepoint: poll for GC\n\t"); + st->print_cr("je #slow_safepoint_runtime"); + } else if (Assembler::is_polling_page_far()) { st->print_cr("movq rscratch1, #polling_page_address\n\t" "testl rax, [rscratch1]\t" "# Safepoint: poll for GC"); @@ -987,7 +991,14 @@ if (do_polling() && C->is_method_compilation()) { MacroAssembler _masm(&cbuf); AddressLiteral polling_page(os::get_polling_page(), relocInfo::poll_return_type); - if 
(Assembler::is_polling_page_far()) { + if (ThreadLocalSafepoints) { + InternalAddress safepoint_pc(__ pc()); + Label dummy_label; + Label &code_stub = &cbuf == C->code_buffer() ? C->tls_table()->add_safepoint(safepoint_pc, true) : dummy_label; + __ relocate(relocInfo::poll_return_type); + __ testb(Address(r15_thread, Thread::yieldpoint_offset()), 2); + __ jcc(Assembler::equal, code_stub); + } else if (Assembler::is_polling_page_far()) { + __ lea(rscratch1, polling_page); + __ relocate(relocInfo::poll_return_type); + __ testl(rax, Address(rscratch1, 0)); @@ -1005,7 +1016,7 @@ int MachEpilogNode::reloc() const { - return 2; // a large enough number + return 3; // a large enough number } const Pipeline* MachEpilogNode::pipeline() const @@ -11552,7 +11563,7 @@ // Safepoint Instructions instruct safePoint_poll(rFlagsReg cr) %{ - predicate(!Assembler::is_polling_page_far()); + predicate(!Assembler::is_polling_page_far() || ThreadLocalSafepoints); match(SafePoint); effect(KILL cr); @@ -11560,15 +11571,25 @@ "# Safepoint: poll for GC" %} ins_cost(125); ins_encode %{ - AddressLiteral addr(os::get_polling_page(), relocInfo::poll_type); - __ testl(rax, addr); + if (ThreadLocalSafepoints) { + Compile* C = ra_->C; + InternalAddress safepoint_pc(__ pc()); + Label dummy_label; + Label &code_stub = &cbuf == C->code_buffer() ? C->tls_table()->add_safepoint(safepoint_pc, false) : dummy_label; + __ relocate(relocInfo::poll_type); + __ testb(Address(r15_thread, Thread::yieldpoint_offset()), 1); + __ jcc(Assembler::equal, code_stub); + } else { + AddressLiteral addr(os::get_polling_page(), relocInfo::poll_type); + __ testl(rax, addr); + } %} ins_pipe(ialu_reg_mem); %} instruct safePoint_poll_far(rFlagsReg cr, rRegP poll) %{ - predicate(Assembler::is_polling_page_far()); + predicate(Assembler::is_polling_page_far() && !ThreadLocalSafepoints); match(SafePoint poll); effect(KILL cr, USE poll); --- old/src/os/aix/vm/osThread_aix.cpp 2015-08-12 14:09:05.000000000 +0200 +++ new/src/os/aix/vm/osThread_aix.cpp 2015-08-12 14:09:05.000000000 +0200 @@ -53,3 +53,7 @@ void OSThread::pd_destroy() { delete _startThread_lock; } + +bool OSThread::is_online() { + return true; +} --- old/src/os/bsd/vm/osThread_bsd.cpp 2015-08-12 14:09:06.000000000 +0200 +++ new/src/os/bsd/vm/osThread_bsd.cpp 2015-08-12 14:09:06.000000000 +0200 @@ -51,3 +51,28 @@ void OSThread::pd_destroy() { delete _startThread_lock; } + +bool OSThread::is_online() { +#ifdef __APPLE__ + mach_msg_type_number_t thread_info_count = THREAD_BASIC_INFO_COUNT; + thread_basic_info_data_t thread_info_data; + + kern_return_t kr = thread_info( + _thread_id, + THREAD_BASIC_INFO, + reinterpret_cast<thread_info_t>(&thread_info_data), + &thread_info_count); + if (kr != KERN_SUCCESS) { + return false; + } + + if (thread_info_data.run_state != TH_STATE_RUNNING) return false; + + const bool swapped = (thread_info_data.flags & TH_FLAGS_SWAPPED); + const bool idle = (thread_info_data.flags & TH_FLAGS_IDLE); + + return !swapped && !idle; +#else + return true; +#endif +} --- old/src/os/linux/vm/osThread_linux.cpp 2015-08-12 14:09:07.000000000 +0200 +++ new/src/os/linux/vm/osThread_linux.cpp 2015-08-12 14:09:07.000000000 +0200 @@ -47,3 +47,7 @@ void OSThread::pd_destroy() { delete _startThread_lock; } + +bool OSThread::is_online() { + return true; +} --- old/src/os/solaris/vm/osThread_solaris.cpp 2015-08-12 14:09:09.000000000 +0200 +++ new/src/os/solaris/vm/osThread_solaris.cpp 2015-08-12 14:09:08.000000000 +0200 @@ -51,3 +51,8 @@ void OSThread::SR_handler(Thread* thread, ucontext_t* uc) {
os::Solaris::SR_handler(thread, uc); } + +bool OSThread::is_online() { + // TODO: Solaris can do better: find out if a thread is ONPROC. + return true; +} --- old/src/os/windows/vm/osThread_windows.cpp 2015-08-12 14:09:10.000000000 +0200 +++ new/src/os/windows/vm/osThread_windows.cpp 2015-08-12 14:09:10.000000000 +0200 @@ -41,3 +41,7 @@ // free_thread. Should follow pattern of Linux/Solaris code here. void OSThread::pd_destroy() { } + +bool OSThread::is_online() { + return true; +} --- old/src/share/vm/c1/c1_CodeStubs.hpp 2015-08-12 14:09:11.000000000 +0200 +++ new/src/share/vm/c1/c1_CodeStubs.hpp 2015-08-12 14:09:11.000000000 +0200 @@ -535,6 +535,29 @@ #endif // PRODUCT }; +class C1ThreadLocalSafepoint: public CodeStub { + private: + address _safepoint_pc; + bool _is_return; + + public: + C1ThreadLocalSafepoint(bool is_return) : _is_return(is_return) { } + + bool is_return() { return _is_return; } + + address safepoint_pc() { return _safepoint_pc; } + void set_safepoint_pc(address pc) { _safepoint_pc = pc; } + + virtual void emit_code(LIR_Assembler* e); + virtual void visit(LIR_OpVisitState* visitor) { + // don't pass in the code emit info since it's processed in the fast path + visitor->do_slow_case(); + } +#ifndef PRODUCT + virtual void print_name(outputStream* out) const { out->print("C1ThreadLocalSafepoint"); } +#endif // PRODUCT +}; + ////////////////////////////////////////////////////////////////////////////////////////// #if INCLUDE_ALL_GCS --- old/src/share/vm/c1/c1_LIR.cpp 2015-08-12 14:09:12.000000000 +0200 +++ new/src/share/vm/c1/c1_LIR.cpp 2015-08-12 14:09:12.000000000 +0200 @@ -518,7 +518,6 @@ case lir_ffree: // input always valid, result and info always invalid case lir_push: // input always valid, result and info always invalid case lir_pop: // input always valid, result and info always invalid - case lir_return: // input always valid, result and info always invalid case lir_leal: // input and result always valid, info always invalid case lir_neg: // input and result always valid, info always invalid case lir_monaddr: // input and result always valid, info always invalid @@ -537,15 +536,31 @@ break; } + case lir_return: // input always valid, result and info always invalid + { + assert(op->as_Op1Safepoint() != NULL, "must be"); + LIR_Op1Safepoint* op_ret = (LIR_Op1Safepoint*)op; + + if (op_ret->_info) do_info(op_ret->_info); + if (op_ret->_opr->is_valid()) do_input(op_ret->_opr); + if (op_ret->_result->is_valid()) do_output(op_ret->_result); + + if (op_ret->tls_stub() != NULL) do_stub(op_ret->tls_stub()); + + break; + } + case lir_safepoint: { - assert(op->as_Op1() != NULL, "must be"); - LIR_Op1* op1 = (LIR_Op1*)op; + assert(op->as_Op1Safepoint() != NULL, "must be"); + LIR_Op1Safepoint* op1 = (LIR_Op1Safepoint*)op; assert(op1->_info != NULL, ""); do_info(op1->_info); if (op1->_opr->is_valid()) do_temp(op1->_opr); // safepoints on SPARC need temporary register assert(op1->_result->is_illegal(), "safepoint does not produce value"); + if (op1->tls_stub() != NULL) do_stub(op1->tls_stub()); + break; } @@ -1527,6 +1542,14 @@ append(new LIR_OpCompareAndSwap(lir_cas_int, addr, cmp_value, new_value, t1, t2, result)); } +// LIR_Op1Safepoint +LIR_Op1Safepoint::LIR_Op1Safepoint(LIR_Code code, LIR_Opr opr, CodeEmitInfo* info) + : LIR_Op1(code, opr, info) + , _tls_stub(NULL) { + if (ThreadLocalSafepoints) { + _tls_stub = new C1ThreadLocalSafepoint(code == lir_return); + } +} #ifdef PRODUCT --- old/src/share/vm/c1/c1_LIR.hpp 2015-08-12 14:09:13.000000000 +0200 +++ 
new/src/share/vm/c1/c1_LIR.hpp 2015-08-12 14:09:13.000000000 +0200 @@ -36,6 +36,7 @@ class CodeStub; class CodeStubList; class ArrayCopyStub; +class C1ThreadLocalSafepoint; class LIR_Op; class ciType; class ValueType; @@ -873,6 +874,7 @@ class LIR_OpLabel; class LIR_Op1; class LIR_OpBranch; +class LIR_Op1Safepoint; class LIR_OpConvert; class LIR_OpAllocObj; class LIR_OpRoundFP; @@ -1142,6 +1144,7 @@ virtual LIR_OpAllocObj* as_OpAllocObj() { return NULL; } virtual LIR_OpRoundFP* as_OpRoundFP() { return NULL; } virtual LIR_OpBranch* as_OpBranch() { return NULL; } + virtual LIR_Op1Safepoint* as_Op1Safepoint() { return NULL; } virtual LIR_OpRTCall* as_OpRTCall() { return NULL; } virtual LIR_OpConvert* as_OpConvert() { return NULL; } virtual LIR_Op0* as_Op0() { return NULL; } @@ -1468,6 +1471,19 @@ virtual void print_instr(outputStream* out) const PRODUCT_RETURN; }; +class LIR_Op1Safepoint: public LIR_Op1 { + friend class LIR_OpVisitState; + + private: + C1ThreadLocalSafepoint* _tls_stub; + + public: + LIR_Op1Safepoint(LIR_Code code, LIR_Opr opr, CodeEmitInfo* info); + + C1ThreadLocalSafepoint* tls_stub() const { return _tls_stub; } + + virtual LIR_Op1Safepoint* as_Op1Safepoint() { return this; } +}; class ConversionStub; @@ -2140,9 +2156,9 @@ void metadata2reg (Metadata* o, LIR_Opr reg) { assert(reg->type() == T_METADATA, "bad reg"); append(new LIR_Op1(lir_move, LIR_OprFact::metadataConst(o), reg)); } void klass2reg_patch(Metadata* o, LIR_Opr reg, CodeEmitInfo* info); - void return_op(LIR_Opr result) { append(new LIR_Op1(lir_return, result)); } + void return_op(LIR_Opr result) { append(new LIR_Op1Safepoint(lir_return, result, NULL)); } - void safepoint(LIR_Opr tmp, CodeEmitInfo* info) { append(new LIR_Op1(lir_safepoint, tmp, info)); } + void safepoint(LIR_Opr tmp, CodeEmitInfo* info) { append(new LIR_Op1Safepoint(lir_safepoint, tmp, info)); } #ifdef PPC void convert(Bytecodes::Code code, LIR_Opr left, LIR_Opr dst, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_OpConvert(code, left, dst, NULL, tmp1, tmp2)); } --- old/src/share/vm/c1/c1_LIRAssembler.cpp 2015-08-12 14:09:15.000000000 +0200 +++ new/src/share/vm/c1/c1_LIRAssembler.cpp 2015-08-12 14:09:14.000000000 +0200 @@ -510,16 +510,24 @@ break; } - case lir_return: - return_op(op->in_opr()); + case lir_return: { + assert(op->as_Op1Safepoint() != NULL, "sanity"); + LIR_Op1Safepoint *ret_op = (LIR_Op1Safepoint*)op; + return_op(ret_op->in_opr(), ret_op->tls_stub()); + if (ret_op->tls_stub()) append_code_stub(ret_op->tls_stub()); break; + } - case lir_safepoint: + case lir_safepoint: { + assert(op->as_Op1Safepoint() != NULL, "sanity"); + LIR_Op1Safepoint *sp_op = (LIR_Op1Safepoint*)op; if (compilation()->debug_info_recorder()->last_pc_offset() == code_offset()) { _masm->nop(); } - safepoint_poll(op->in_opr(), op->info()); + safepoint_poll(op->in_opr(), sp_op->tls_stub(), op->info()); + if (sp_op->tls_stub()) append_code_stub(sp_op->tls_stub()); break; + } case lir_fxch: fxch(op->in_opr()->as_jint()); --- old/src/share/vm/c1/c1_LIRAssembler.hpp 2015-08-12 14:09:16.000000000 +0200 +++ new/src/share/vm/c1/c1_LIRAssembler.hpp 2015-08-12 14:09:16.000000000 +0200 @@ -161,12 +161,12 @@ // particular sparc uses this for delay slot filling. 
void peephole(LIR_List* list); - void emit_string_compare(LIR_Opr left, LIR_Opr right, LIR_Opr dst, CodeEmitInfo* info); + void emit_string_compare(LIR_Opr left, LIR_Opr right, LIR_Opr dst, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info); - void return_op(LIR_Opr result); + void return_op(LIR_Opr result, C1ThreadLocalSafepoint *code_stub); // returns offset of poll instruction - int safepoint_poll(LIR_Opr result, CodeEmitInfo* info); + int safepoint_poll(LIR_Opr result, C1ThreadLocalSafepoint *tls_stub, CodeEmitInfo* info); void const2reg (LIR_Opr src, LIR_Opr dest, LIR_PatchCode patch_code, CodeEmitInfo* info); void const2stack(LIR_Opr src, LIR_Opr dest); --- old/src/share/vm/c1/c1_LinearScan.cpp 2015-08-12 14:09:17.000000000 +0200 +++ new/src/share/vm/c1/c1_LinearScan.cpp 2015-08-12 14:09:17.000000000 +0200 @@ -6304,7 +6304,7 @@ if (pred_last_branch->block() == block && pred_last_branch->cond() == lir_cond_always && pred_last_branch->info() == NULL) { // replace the jump to a return with a direct return // Note: currently the edge between the blocks is not deleted - pred_instructions->at_put(pred_instructions->length() - 1, new LIR_Op1(lir_return, return_opr)); + pred_instructions->at_put(pred_instructions->length() - 1, new LIR_Op1Safepoint(lir_return, return_opr, NULL)); #ifdef ASSERT return_converted.set_bit(pred->block_id()); #endif --- old/src/share/vm/gc/g1/concurrentG1Refine.cpp 2015-08-12 14:09:18.000000000 +0200 +++ new/src/share/vm/gc/g1/concurrentG1Refine.cpp 2015-08-12 14:09:18.000000000 +0200 @@ -29,7 +29,7 @@ #include "gc/g1/g1HotCardCache.hpp" #include "runtime/java.hpp" -ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure) : +ConcurrentG1Refine::ConcurrentG1Refine(G1CollectedHeap* g1h) : _threads(NULL), _n_threads(0), _hot_card_cache(g1h) { @@ -61,7 +61,7 @@ ConcurrentG1RefineThread *next = NULL; for (uint i = _n_threads - 1; i != UINT_MAX; i--) { - ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, refine_closure, worker_id_offset, i); + ConcurrentG1RefineThread* t = new ConcurrentG1RefineThread(this, next, worker_id_offset, i); assert(t != NULL, "Conc refine should have been created"); if (t->osthread() == NULL) { vm_shutdown_during_initialization("Could not create ConcurrentG1RefineThread"); --- old/src/share/vm/gc/g1/concurrentG1Refine.hpp 2015-08-12 14:09:19.000000000 +0200 +++ new/src/share/vm/gc/g1/concurrentG1Refine.hpp 2015-08-12 14:09:19.000000000 +0200 @@ -72,7 +72,7 @@ void reset_threshold_step(); public: - ConcurrentG1Refine(G1CollectedHeap* g1h, CardTableEntryClosure* refine_closure); + ConcurrentG1Refine(G1CollectedHeap* g1h); ~ConcurrentG1Refine(); void init(G1RegionToSpaceMapper* card_counts_storage); --- old/src/share/vm/gc/g1/concurrentG1RefineThread.cpp 2015-08-12 14:09:21.000000000 +0200 +++ new/src/share/vm/gc/g1/concurrentG1RefineThread.cpp 2015-08-12 14:09:20.000000000 +0200 @@ -34,10 +34,8 @@ ConcurrentG1RefineThread:: ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread *next, - CardTableEntryClosure* refine_closure, uint worker_id_offset, uint worker_id) : ConcurrentGCThread(), - _refine_closure(refine_closure), _worker_id_offset(worker_id_offset), _worker_id(worker_id), _active(false), @@ -190,6 +188,7 @@ { SuspendibleThreadSetJoiner sts_join; + BufferedRefineCardTableEntryClosure cl; do { int curr_buffer_num = (int)dcqs.completed_buffers_num(); @@ -203,6 +202,7 @@ // If the number of the buffer has fallen below our threshold // we should deactivate. 
The predecessor will reactivate this // thread should the number of the buffers cross the threshold again. + cl.flush_buffer(); deactivate(); break; } @@ -211,7 +211,9 @@ if (_next != NULL && !_next->is_active() && curr_buffer_num > _next->_threshold) { _next->activate(); } - } while (dcqs.apply_closure_to_completed_buffer(_refine_closure, _worker_id + _worker_id_offset, cg1r()->green_zone())); + } while (dcqs.apply_closure_to_completed_buffer(&cl, _worker_id + _worker_id_offset, cg1r()->green_zone())); + + cl.flush_buffer(); // We can exit the loop above while being active if there was a yield request. if (is_active()) { @@ -251,4 +253,3 @@ gclog_or_tty->print_cr("G1-Refine-stop"); } } - --- old/src/share/vm/gc/g1/concurrentG1RefineThread.hpp 2015-08-12 14:09:22.000000000 +0200 +++ new/src/share/vm/gc/g1/concurrentG1RefineThread.hpp 2015-08-12 14:09:21.000000000 +0200 @@ -50,9 +50,6 @@ Monitor* _monitor; ConcurrentG1Refine* _cg1r; - // The closure applied to completed log buffers. - CardTableEntryClosure* _refine_closure; - int _thread_threshold_step; // This thread activation threshold int _threshold; @@ -72,7 +69,6 @@ virtual void run(); // Constructor ConcurrentG1RefineThread(ConcurrentG1Refine* cg1r, ConcurrentG1RefineThread* next, - CardTableEntryClosure* refine_closure, uint worker_id_offset, uint worker_id); void initialize(); --- old/src/share/vm/gc/g1/dirtyCardQueue.cpp 2015-08-12 14:09:23.000000000 +0200 +++ new/src/share/vm/gc/g1/dirtyCardQueue.cpp 2015-08-12 14:09:23.000000000 +0200 @@ -70,7 +70,6 @@ DirtyCardQueueSet::DirtyCardQueueSet(bool notify_when_complete) : PtrQueueSet(notify_when_complete), - _mut_process_closure(NULL), _shared_dirty_card_queue(this, true /*perm*/), _free_ids(NULL), _processed_buffers_mut(0), _processed_buffers_rs_thread(0) @@ -83,11 +82,11 @@ return (uint)os::processor_count(); } -void DirtyCardQueueSet::initialize(CardTableEntryClosure* cl, Monitor* cbl_mon, Mutex* fl_lock, +void DirtyCardQueueSet::initialize(bool should_do_processing, Monitor* cbl_mon, Mutex* fl_lock, int process_completed_threshold, int max_completed_queue, Mutex* lock, PtrQueueSet* fl_owner) { - _mut_process_closure = cl; + _should_do_processing = should_do_processing; PtrQueueSet::initialize(cbl_mon, fl_lock, process_completed_threshold, max_completed_queue, fl_owner); set_buffer_size(G1UpdateBufferSize); @@ -141,8 +140,10 @@ bool b = false; if (worker_i != UINT_MAX) { - b = DirtyCardQueue::apply_closure_to_buffer(_mut_process_closure, buf, 0, + BufferedRefineCardTableEntryClosure cl; + b = DirtyCardQueue::apply_closure_to_buffer(_should_do_processing ? &cl : NULL, buf, 0, _sz, true, worker_i); + cl.flush_buffer(); if (b) Atomic::inc(&_processed_buffers_mut); // If we had not claimed an id before entering the method --- old/src/share/vm/gc/g1/dirtyCardQueue.hpp 2015-08-12 14:09:24.000000000 +0200 +++ new/src/share/vm/gc/g1/dirtyCardQueue.hpp 2015-08-12 14:09:24.000000000 +0200 @@ -80,9 +80,6 @@ class DirtyCardQueueSet: public PtrQueueSet { - // The closure used in mut_process_buffer(). - CardTableEntryClosure* _mut_process_closure; - DirtyCardQueue _shared_dirty_card_queue; // Override. @@ -98,10 +95,12 @@ // Current buffer node used for parallel iteration. 
BufferNode* volatile _cur_par_buffer_node; + + bool _should_do_processing; public: DirtyCardQueueSet(bool notify_when_complete = true); - void initialize(CardTableEntryClosure* cl, Monitor* cbl_mon, Mutex* fl_lock, + void initialize(bool should_do_processing, Monitor* cbl_mon, Mutex* fl_lock, int process_completed_threshold, int max_completed_queue, Mutex* lock, PtrQueueSet* fl_owner = NULL); --- old/src/share/vm/gc/g1/g1CollectedHeap.cpp 2015-08-12 14:09:25.000000000 +0200 +++ new/src/share/vm/gc/g1/g1CollectedHeap.cpp 2015-08-12 14:09:25.000000000 +0200 @@ -65,6 +65,7 @@ #include "memory/iterator.hpp" #include "oops/oop.inline.hpp" #include "runtime/atomic.inline.hpp" +#include "runtime/globalSynchronizer.hpp" #include "runtime/orderAccess.inline.hpp" #include "runtime/vmThread.hpp" #include "utilities/globalDefinitions.hpp" @@ -89,29 +90,289 @@ // is done by clients of this interface.) // Local to this file. +bool RefineCardTableEntryClosure::do_card_ptr(jbyte* card_ptr, uint worker_i) { + bool oops_into_cset = G1CollectedHeap::heap()->g1_rem_set()->refine_card(card_ptr, worker_i, false); + // This path is executed by the concurrent refine or mutator threads, + // concurrently, and so we do not care if card_ptr contains references + // that point into the collection set. + assert(!oops_into_cset, "should be"); -class RefineCardTableEntryClosure: public CardTableEntryClosure { - bool _concurrent; -public: - RefineCardTableEntryClosure() : _concurrent(true) { } + // return false if caller should yield + return !(G1CollectedHeap::heap()->refine_cte_cl_concurrency() && SuspendibleThreadSet::should_yield()); +} - bool do_card_ptr(jbyte* card_ptr, uint worker_i) { - bool oops_into_cset = G1CollectedHeap::heap()->g1_rem_set()->refine_card(card_ptr, worker_i, false); - // This path is executed by the concurrent refine or mutator threads, - // concurrently, and so we do not care if card_ptr contains references - // that point into the collection set. - assert(!oops_into_cset, "should be"); +CardBuffer::CardBuffer() + : _next(NULL) { + int size = BufferedRefineCardTableEntryClosure::buffer_size(); + _card_buffer = NEW_C_HEAP_ARRAY(jbyte*, size, mtGC); + _mr_buffer = NEW_C_HEAP_ARRAY(MemRegion, size, mtGC); + _gs = new SynchronizerObj(); + _misses = 0; +} - if (_concurrent && SuspendibleThreadSet::should_yield()) { - // Caller will actually yield. 
- return false; +CardBuffer::~CardBuffer() { + FREE_C_HEAP_ARRAY(jbyte*, _card_buffer); + FREE_C_HEAP_ARRAY(MemRegion, _mr_buffer); + delete _gs; +} + +BufferedRefineCardTableEntryClosure::BufferedRefineCardTableEntryClosure() + : _index(0), _g1h(G1CollectedHeap::heap()), _head_buffer(NULL), _tail_buffer(NULL), + _current_buffer(NULL), _async_buffers(0) { +} + +BufferedRefineCardTableEntryClosure::~BufferedRefineCardTableEntryClosure() { + assert(_index == 0, "must flush refine card buffer"); + assert(_head_buffer == NULL && _tail_buffer == NULL, "must flush all async cards first"); + assert(_async_buffers == 0, "must flush all async cards first"); + if (_current_buffer) delete _current_buffer; +} + +bool BufferedRefineCardTableEntryClosure::do_card_ptr(jbyte *card_ptr, uint worker_i) { + _worker_i = worker_i; + if (_index == buffer_size()) soft_flush(); + if (_current_buffer == NULL) _current_buffer = new CardBuffer(); + _current_buffer->_card_buffer[_index++] = card_ptr; + + bool should_yield = _g1h->refine_cte_cl_concurrency() && SuspendibleThreadSet::should_yield(); + if (should_yield) flush_buffer(); + + // return false if caller should yield + return !should_yield; +} + +void BufferedRefineCardTableEntryClosure::soft_flush() { + general_flush(false); +} + +// Procedures used to sort and join G1 cards during refinement +static void quick_sort(jbyte **card_array, MemRegion *region_array, int left, int right); +static int partition(jbyte **card_array, MemRegion *region_array, int left, int right); +static int join_cards(jbyte **card_array, MemRegion *region_array, int length); + +static void quick_sort(jbyte **card_array, MemRegion *region_array, int left, int right) { + int middle; + if (left < right) + { + middle = partition(card_array, region_array, left, right); + quick_sort(card_array, region_array, left, middle); + quick_sort(card_array, region_array, middle + 1, right); + } +} + +static int partition(jbyte **card_array, MemRegion *region_array, int left, int right) { + jbyte *card = card_array[left]; + int i = left; + int j; + + for (j = left + 1; j < right; j++) + { + if (card_array[j] <= card) + { + i = i + 1; + swap(card_array[i], card_array[j]); + swap(region_array[i], region_array[j]); + } + } + + swap(card_array[i], card_array[left]); + swap(region_array[i], region_array[left]); + return i; +} + +static int join_cards(jbyte **card_array, MemRegion *region_array, int length) { + G1CollectedHeap *g1h = G1CollectedHeap::heap(); + jbyte *prev_card = NULL; + HeapRegion *prev_hr = NULL; + int insert_head = 0; + for (int i = 0; i < length; i++) { + jbyte *card = card_array[i]; + + if (*card == CardTableModRefBS::clean_card_val()) { + HeapRegion *hr = g1h->heap_region_containing_raw(region_array[i].start()); + if (card == prev_card + 1 && hr == prev_hr) { + MemRegion insert_region = region_array[insert_head - 1]; + region_array[insert_head - 1] = MemRegion(insert_region.start(), region_array[i].end()); + } else { + card_array[insert_head] = card; + region_array[insert_head] = region_array[i]; + insert_head++; + } + prev_hr = hr; + } + + prev_card = card; + } + + return insert_head; +} + +int BufferedRefineCardTableEntryClosure::buffer_size() { + return (int)G1UpdateBufferSize; +} + +void BufferedRefineCardTableEntryClosure::flush_buffer() { + general_flush(true); +} + +// Returns true if it needs post sync +bool BufferedRefineCardTableEntryClosure::pre_sync(CardBuffer *buffer, bool hard) { + // 1. Clean all cards in the batch. 
+ G1RemSet *g1rs = G1CollectedHeap::heap()->g1_rem_set(); + int needs_processing = 0; + + jbyte **const card_buffer = buffer->_card_buffer; + MemRegion *const mr_buffer = buffer->_mr_buffer; + const int length = buffer->_length; + + for (int i = 0; i < length; i++) { + if (g1rs->clean_card(card_buffer[i], _worker_i, mr_buffer[i])) { + card_buffer[needs_processing] = card_buffer[i]; + mr_buffer[needs_processing] = mr_buffer[i]; + needs_processing++; + } + } + buffer->_length = needs_processing; + + if (needs_processing == 0) { + if (hard) { + // If we are forced to finish scanning, we must serialize stores anyway. + OrderAccess::storeload(); + if (G1ElideMembar) { + buffer->_gs->start_synchronizing(); + } + } + return false; + } + + OrderAccess::storeload(); + if (G1ElideMembar) { + buffer->_gs->start_synchronizing(); + } + + // 2. Sort the cards + quick_sort(buffer->_card_buffer, buffer->_mr_buffer, 0, buffer->_length); + + return true; +} + +bool BufferedRefineCardTableEntryClosure::sync(CardBuffer *buffer, bool hard) { + if (!G1ElideMembar) return true; + + bool success = buffer->_gs->try_synchronize(); + if (hard) { + if (!success) { + buffer->_gs->maximize_urgency(); + buffer->_gs->synchronize(); } - // Otherwise, we finished successfully; return true. return true; + } else { + return success; } +} - void set_concurrent(bool b) { _concurrent = b; } -}; +void BufferedRefineCardTableEntryClosure::post_sync(CardBuffer *buffer) { + const int length = buffer->_length; + + const int card_batch_size = 16; + jbyte **current_card = buffer->_card_buffer; + MemRegion *current_region = buffer->_mr_buffer; + + const uintx interval = PrefetchScanIntervalInBytes * 2; + + G1RemSet *g1rs = G1CollectedHeap::heap()->g1_rem_set(); + + // 3. Batch 16 cards at a time + + for (int j = 0; j < length; j += card_batch_size) { + // 4. Join consecutive cards together and prefetch next card + int batch = MIN2((length - j), card_batch_size); + batch = join_cards(current_card, current_region, batch); + + jbyte dirty_card_val = CardTableModRefBS::dirty_card_val(); + jbyte *end_card; + HeapWord *end_prefetch; + + if (j + card_batch_size < length) { + end_prefetch = current_region[card_batch_size].start(); + end_card = current_card[card_batch_size]; + } else { + end_card = &dirty_card_val; + } + + MemRegion *region_end = current_region + batch; + jbyte** batch_card; + MemRegion* batch_region; + + for (batch_card = current_card, batch_region = current_region; batch_region != region_end; batch_card++) { + jbyte *card = *batch_card; + MemRegion mr = *batch_region; + MemRegion *next_region = batch_region + 1; + + if (next_region != region_end) { + MemRegion next_region_val = *next_region; + // Prefetch interval in batch + Prefetch::read(next_region_val.start(), next_region_val.byte_size()); + } else if (*end_card == CardTableModRefBS::clean_card_val()) { + // Prefetch broken interval to next batch + Prefetch::read(end_prefetch, interval); + } + + g1rs->refine_card_buffered(card, _worker_i, /*check_for_cset_refs*/ false, mr); + + batch_region = next_region; + } + + current_region += card_batch_size; + current_card += card_batch_size; + } +} + +void BufferedRefineCardTableEntryClosure::general_flush(bool hard) { + if (_index == 0) { + assert(hard, "invariant"); + if (_async_buffers == 0) return; + } + + // 1. 
Start asynchronous synchronization for the current buffer + if (_current_buffer == NULL) _current_buffer = new CardBuffer(); + _current_buffer->_length = _index; + if (pre_sync(_current_buffer, hard) || hard) { + // append async buffer + CardBuffer *tail = _tail_buffer; + if (tail != NULL) tail->_next = _current_buffer; + _tail_buffer = _current_buffer; + if (_head_buffer == NULL) _head_buffer = _current_buffer; + if (hard) sync(_current_buffer, hard); + _current_buffer = NULL; + _async_buffers++; + } + + _index = 0; + + // 2. Process old batches that have been cleaned but couldn't synchronize (async completion) + CardBuffer *current = _head_buffer; + bool check_sync = true; + while (current != NULL) { + if (hard || sync(current, hard)) { + post_sync(current); + CardBuffer *next = current->_next; + _head_buffer = next; + if (next == NULL) _tail_buffer = NULL; + delete current; + current = next; + _async_buffers--; + } else { + current->_misses++; + if (_async_buffers > 4 && current->_misses > 2 + || _async_buffers > 8 && current->_misses > 4 + || _async_buffers > 16 && current->_misses > 6) { + current->_gs->increase_urgency(); + } + break; + } + } +} class RedirtyLoggedCardTableEntryClosure : public CardTableEntryClosure { @@ -1919,7 +2180,7 @@ _bot_shared(NULL), _cg1r(NULL), _g1mm(NULL), - _refine_cte_cl(NULL), + _refine_cte_cl_concurrency(true), _secondary_free_list("Secondary Free List", new SecondaryFreeRegionListMtSafeChecker()), _old_set("Old Set", false /* humongous */, new OldRegionSetMtSafeChecker()), _humongous_set("Master Humongous Set", true /* humongous */, new HumongousRegionSetMtSafeChecker()), @@ -2032,9 +2293,7 @@ Universe::check_alignment(max_byte_size, HeapRegion::GrainBytes, "g1 heap"); Universe::check_alignment(max_byte_size, heap_alignment, "g1 heap"); - _refine_cte_cl = new RefineCardTableEntryClosure(); - - _cg1r = new ConcurrentG1Refine(this, _refine_cte_cl); + _cg1r = new ConcurrentG1Refine(this); // Reserve the maximum. @@ -2158,14 +2417,14 @@ G1SATBProcessCompletedThreshold, Shared_SATB_Q_lock); - JavaThread::dirty_card_queue_set().initialize(_refine_cte_cl, + JavaThread::dirty_card_queue_set().initialize(true, DirtyCardQ_CBL_mon, DirtyCardQ_FL_lock, concurrent_g1_refine()->yellow_zone(), concurrent_g1_refine()->red_zone(), Shared_DirtyCardQ_lock); - dirty_card_queue_set().initialize(NULL, // Should never be called by the Java code + dirty_card_queue_set().initialize(false, // Should never be called by the Java code DirtyCardQ_CBL_mon, DirtyCardQ_FL_lock, -1, // never trigger processing @@ -2175,7 +2434,7 @@ // Initialize the card queue set used to hold cards containing // references into the collection set. 
- _into_cset_dirty_card_queue_set.initialize(NULL, // Should never be called by the Java code + _into_cset_dirty_card_queue_set.initialize(false, // Should never be called by the Java code DirtyCardQ_CBL_mon, DirtyCardQ_FL_lock, -1, // never trigger processing @@ -6381,7 +6640,11 @@ } void G1CollectedHeap::set_refine_cte_cl_concurrency(bool concurrent) { - _refine_cte_cl->set_concurrent(concurrent); + _refine_cte_cl_concurrency = concurrent; +} + +bool G1CollectedHeap::refine_cte_cl_concurrency() { + return _refine_cte_cl_concurrency; } bool G1CollectedHeap::is_in_closed_subset(const void* p) const { --- old/src/share/vm/gc/g1/g1CollectedHeap.hpp 2015-08-12 14:09:27.000000000 +0200 +++ new/src/share/vm/gc/g1/g1CollectedHeap.hpp 2015-08-12 14:09:26.000000000 +0200 @@ -57,6 +57,7 @@ class OopsInHeapRegionClosure; class G1KlassScanClosure; class G1ParScanThreadState; +class GlobalSynchronizer; class ObjectClosure; class SpaceClosure; class CompactibleSpaceClosure; @@ -169,7 +170,51 @@ bool do_object_b(oop p); }; -class RefineCardTableEntryClosure; +class RefineCardTableEntryClosure: public CardTableEntryClosure { +public: + RefineCardTableEntryClosure() { } + bool do_card_ptr(jbyte* card_ptr, uint worker_i); +}; + +class CardBuffer : public CHeapObj<mtGC> { +public: + CardBuffer *_next; + GlobalSynchronizer *_gs; + jbyte **_card_buffer; + MemRegion *_mr_buffer; + int _length; + + int _misses; + + CardBuffer(); + virtual ~CardBuffer(); +}; + +class BufferedRefineCardTableEntryClosure: public CardTableEntryClosure { + CardBuffer *_head_buffer; + CardBuffer *_tail_buffer; + CardBuffer *_current_buffer; + + int _index; + int _async_buffers; + + uint _worker_i; + G1CollectedHeap *const _g1h; + + bool pre_sync(CardBuffer *buffer, bool hard); + bool sync(CardBuffer *buffer, bool hard); + void post_sync(CardBuffer *buffer); + + void general_flush(bool hard); + void soft_flush(); +public: + BufferedRefineCardTableEntryClosure(); + ~BufferedRefineCardTableEntryClosure(); + static int buffer_size(); + bool do_card_ptr(jbyte *card_ptr, uint worker_i); + void flush_buffer(); +}; + class G1RegionMappingChangedListener : public G1MappingChangedListener { private: @@ -831,8 +876,7 @@ // concurrently after the collection. DirtyCardQueueSet _dirty_card_queue_set; - // The closure used to refine a single card. - RefineCardTableEntryClosure* _refine_cte_cl; + bool _refine_cte_cl_concurrency; // A DirtyCardQueueSet that is used to hold cards that contain // references into the current collection set. This is used to @@ -1020,6 +1064,7 @@ public: + bool refine_cte_cl_concurrency(); void set_refine_cte_cl_concurrency(bool concurrent); RefToScanQueue *task_queue(uint i) const; --- old/src/share/vm/gc/g1/g1RemSet.cpp 2015-08-12 14:09:28.000000000 +0200 +++ new/src/share/vm/gc/g1/g1RemSet.cpp 2015-08-12 14:09:28.000000000 +0200 @@ -227,6 +227,186 @@ size_t cards_looked_up() { return _cards;} }; +bool G1RemSet::clean_card(jbyte* &card_ptr, + uint worker_i, + MemRegion &dirty_region) { + assert(_g1->is_in_exact(_ct_bs->addr_for(card_ptr)), + err_msg("Card at " PTR_FORMAT " index " SIZE_FORMAT " representing heap at " PTR_FORMAT " (%u) must be in committed heap", + p2i(card_ptr), + _ct_bs->index_for(_ct_bs->addr_for(card_ptr)), + p2i(_ct_bs->addr_for(card_ptr)), + _g1->addr_to_region(_ct_bs->addr_for(card_ptr)))); + + // If the card is no longer dirty, nothing to do. + if (*card_ptr != CardTableModRefBS::dirty_card_val()) { + // No need to return that this card contains refs that point + // into the collection set.
+ return false; + } + + // Construct the region representing the card. + HeapWord* start = _ct_bs->addr_for(card_ptr); + // And find the region containing it. + HeapRegion* r = _g1->heap_region_containing(start); + + // Why do we have to check here whether a card is on a young region, + // given that we dirty young regions and, as a result, the + // post-barrier is supposed to filter them out and never to enqueue + // them? When we allocate a new region as the "allocation region" we + // actually dirty its cards after we release the lock, since card + // dirtying while holding the lock was a performance bottleneck. So, + // as a result, it is possible for other threads to actually + // allocate objects in the region (after the acquire the lock) + // before all the cards on the region are dirtied. This is unlikely, + // and it doesn't happen often, but it can happen. So, the extra + // check below filters out those cards. + if (r->is_young()) { + return false; + } + + // While we are processing RSet buffers during the collection, we + // actually don't want to scan any cards on the collection set, + // since we don't want to update remembered sets with entries that + // point into the collection set, given that live objects from the + // collection set are about to move and such entries will be stale + // very soon. This change also deals with a reliability issue which + // involves scanning a card in the collection set and coming across + // an array that was being chunked and looking malformed. Note, + // however, that if evacuation fails, we have to scan any objects + // that were not moved and create any missing entries. + if (r->in_collection_set()) { + return false; + } + + // The result from the hot card cache insert call is either: + // * pointer to the current card + // (implying that the current card is not 'hot'), + // * null + // (meaning we had inserted the card ptr into the "hot" card cache, + // which had some headroom), + // * a pointer to a "hot" card that was evicted from the "hot" cache. + // + + G1HotCardCache* hot_card_cache = _cg1r->hot_card_cache(); + if (hot_card_cache->use_cache()) { + assert(!SafepointSynchronize::is_at_safepoint(), "sanity"); + + card_ptr = hot_card_cache->insert(card_ptr); + if (card_ptr == NULL) { + // There was no eviction. Nothing to do. + return false; + } + + start = _ct_bs->addr_for(card_ptr); + r = _g1->heap_region_containing(start); + + // Checking whether the region we got back from the cache + // is young here is inappropriate. The region could have been + // freed, reallocated and tagged as young while in the cache. + // Hence we could see its young type change at any time. + } + + // Don't use addr_for(card_ptr + 1) which can ask for + // a card beyond the heap. This is not safe without a perm + // gen at the upper end of the heap. + HeapWord* end = start + CardTableModRefBS::card_size_in_words; + dirty_region = MemRegion(start, end); + +#if CARD_REPEAT_HISTO + init_ct_freq_table(_g1->max_capacity()); + ct_freq_note_card(_ct_bs->index_for(start)); +#endif + + return r->clean_card(dirty_region, /*filter young*/ true, card_ptr); +} + +bool G1RemSet::refine_card_buffered(jbyte* card_ptr, + uint worker_i, + bool check_for_refs_into_cset, + MemRegion dirty_region) { + // And find the region containing it. 
+ HeapRegion* r = _g1->heap_region_containing(dirty_region.start()); + + G1ParPushHeapRSClosure* oops_in_heap_closure = NULL; + if (check_for_refs_into_cset) { + // ConcurrentG1RefineThreads have worker numbers larger than what + // _cset_rs_update_cl[] is set up to handle. But those threads should + // only be active outside of a collection which means that when they + // reach here they should have check_for_refs_into_cset == false. + assert((size_t)worker_i < n_workers(), "index of worker larger than _cset_rs_update_cl[].length"); + oops_in_heap_closure = _cset_rs_update_cl[worker_i]; + } + + G1UpdateRSOrPushRefOopClosure update_rs_oop_cl(_g1, + _g1->g1_rem_set(), + oops_in_heap_closure, + check_for_refs_into_cset, + worker_i); + update_rs_oop_cl.set_from(r); + + G1TriggerClosure trigger_cl; + FilterIntoCSClosure into_cs_cl(NULL, _g1, &trigger_cl); + G1InvokeIfNotTriggeredClosure invoke_cl(&trigger_cl, &into_cs_cl); + G1Mux2Closure mux(&invoke_cl, &update_rs_oop_cl); + + FilterOutOfRegionClosure filter_then_update_rs_oop_cl(r, + (check_for_refs_into_cset ? + (OopClosure*)&mux : + (OopClosure*)&update_rs_oop_cl)); + + // The region for the current card may be a young region. The + // current card may have been a card that was evicted from the + // card cache. When the card was inserted into the cache, we had + // determined that its region was non-young. While in the cache, + // the region may have been freed during a cleanup pause, reallocated + // and tagged as young. + // + // We wish to filter out cards for such a region but the current + // thread, if we're running concurrently, may "see" the young type + // change at any time (so an earlier "is_young" check may pass or + // fail arbitrarily). We tell the iteration code to perform this + // filtering when it has been determined that there has been an actual + // allocation in this region and making it safe to check the young type. + bool filter_young = true; + + HeapWord* stop_point = + r->process_oops_on_card(dirty_region, &filter_then_update_rs_oop_cl, card_ptr); + + // If stop_point is non-null, then we encountered an unallocated region + // (perhaps the unfilled portion of a TLAB.) For now, we'll dirty the + // card and re-enqueue: if we put off the card until a GC pause, then the + // unallocated portion will be filled in. Alternatively, we might try + // the full complexity of the technique used in "regular" precleaning. + if (stop_point != NULL) { + // The card might have gotten re-dirtied and re-enqueued while we + // worked. (In fact, it's pretty likely.) + card_ptr = G1CollectedHeap::heap()->g1_barrier_set()->byte_for(stop_point); + + if (*card_ptr != CardTableModRefBS::dirty_card_val()) { + *card_ptr = CardTableModRefBS::dirty_card_val(); + MutexLockerEx x(Shared_DirtyCardQ_lock, + Mutex::_no_safepoint_check_flag); + DirtyCardQueue* sdcq = + JavaThread::dirty_card_queue_set().shared_dirty_card_queue(); + sdcq->enqueue(card_ptr); + } + } else { + _conc_refine_cards++; + } + + // This gets set to true if the card being refined has + // references that point into the collection set. + bool has_refs_into_cset = trigger_cl.triggered(); + + // We should only be detecting that the card contains references + // that point into the collection set if the current thread is + // a GC worker thread. 
+ assert(!has_refs_into_cset || SafepointSynchronize::is_at_safepoint(), + "invalid result at non safepoint"); + + return has_refs_into_cset; +} + void G1RemSet::scanRS(G1ParPushHeapRSClosure* oc, CodeBlobClosure* code_root_cl, uint worker_i) { --- old/src/share/vm/gc/g1/g1RemSet.hpp 2015-08-12 14:09:29.000000000 +0200 +++ new/src/share/vm/gc/g1/g1RemSet.hpp 2015-08-12 14:09:29.000000000 +0200 @@ -106,6 +106,15 @@ void prepare_for_oops_into_collection_set_do(); void cleanup_after_oops_into_collection_set_do(); + bool clean_card(jbyte* &card_ptr, + uint worker_i, + MemRegion &dirty_region); + bool refine_card_buffered(jbyte* card_ptr, + uint worker_i, + bool check_for_refs_into_cset, + MemRegion dirty_region); + + void scanRS(G1ParPushHeapRSClosure* oc, CodeBlobClosure* code_root_cl, uint worker_i); --- old/src/share/vm/gc/g1/heapRegion.cpp 2015-08-12 14:09:30.000000000 +0200 +++ new/src/share/vm/gc/g1/heapRegion.cpp 2015-08-12 14:09:30.000000000 +0200 @@ -361,12 +361,10 @@ return NULL; } -HeapWord* -HeapRegion:: -oops_on_card_seq_iterate_careful(MemRegion mr, - FilterOutOfRegionClosure* cl, - bool filter_young, - jbyte* card_ptr) { + +bool HeapRegion::clean_card(MemRegion& mr, + bool filter_young, + jbyte* &card_ptr) { // Currently, we should only have to clean the card if filter_young // is true and vice versa. if (filter_young) { @@ -384,7 +382,7 @@ } else { mr = mr.intersection(used_region()); } - if (mr.is_empty()) return NULL; + if (mr.is_empty()) return false; // Otherwise, find the obj that extends onto mr.start(). // The intersection of the incoming mr (for the card) and the @@ -394,7 +392,7 @@ // is_young tag on the region before allocating. Thus we // safely know if this region is young. if (is_young() && filter_young) { - return NULL; + return false; } assert(!is_young(), "check value of filter_young"); @@ -404,17 +402,25 @@ // asked to (i.e., card_ptr != NULL). if (card_ptr != NULL) { *card_ptr = CardTableModRefBS::clean_card_val(); - // We must complete this write before we do any of the reads below. - OrderAccess::storeload(); } + return true; +} + +HeapWord* HeapRegion::process_oops_on_card(MemRegion mr, + FilterOutOfRegionClosure *cl, + jbyte *card_ptr) { + G1CollectedHeap* g1h = G1CollectedHeap::heap(); + G1SATBCardTableLoggingModRefBS* bs = g1h->g1_barrier_set(); // Cache the boundaries of the memory region in some const locals HeapWord* const start = mr.start(); HeapWord* const end = mr.end(); + HeapWord* cur; + // We used to use "block_start_careful" here. But we're actually happy // to update the BOT while we do this... 
- HeapWord* cur = block_start(start); + cur = block_start(start); assert(cur <= start, "Postcondition"); oop obj; @@ -464,6 +470,17 @@ return NULL; } +HeapWord* +HeapRegion:: +oops_on_card_seq_iterate_careful(MemRegion mr, + FilterOutOfRegionClosure* cl, + bool filter_young, + jbyte* card_ptr) { + if (!clean_card(mr, filter_young, card_ptr)) return NULL; + if (card_ptr != NULL) OrderAccess::storeload(); // serialize card cleaning + return process_oops_on_card(mr, cl, card_ptr); +} + // Code roots support void HeapRegion::add_strong_code_root(nmethod* nm) { @@ -1029,4 +1046,3 @@ set_saved_mark_word(NULL); reset_bot(); } - --- old/src/share/vm/gc/g1/heapRegion.hpp 2015-08-12 14:09:31.000000000 +0200 +++ new/src/share/vm/gc/g1/heapRegion.hpp 2015-08-12 14:09:31.000000000 +0200 @@ -715,6 +715,14 @@ HeapWord* object_iterate_mem_careful(MemRegion mr, ObjectClosure* cl); + bool clean_card(MemRegion& mr, + bool filter_young, + jbyte* &card_ptr); + + HeapWord* process_oops_on_card(MemRegion mr, + FilterOutOfRegionClosure *cl, + jbyte *card_ptr); + // filter_young: if true and the region is a young region then we // skip the iteration. // card_ptr: if not NULL, and we decide that the card is not young --- old/src/share/vm/gc/shared/cardTableModRefBS.hpp 2015-08-12 14:09:33.000000000 +0200 +++ new/src/share/vm/gc/shared/cardTableModRefBS.hpp 2015-08-12 14:09:32.000000000 +0200 @@ -43,6 +43,7 @@ class CardTableModRefBS: public ModRefBarrierSet { // Some classes get to look at some private stuff. friend class VMStructs; + friend class G1RemSet; protected: enum CardValues { --- old/src/share/vm/opto/compile.cpp 2015-08-12 14:09:34.000000000 +0200 +++ new/src/share/vm/opto/compile.cpp 2015-08-12 14:09:34.000000000 +0200 @@ -3388,6 +3388,7 @@ return false; } + //-----------------------------too_many_traps---------------------------------- // Report if there are too many traps at the current method and bci. // Return true if there was a trap, and/or PerMethodTrapLimit is exceeded. --- old/src/share/vm/opto/compile.hpp 2015-08-12 14:09:35.000000000 +0200 +++ new/src/share/vm/opto/compile.hpp 2015-08-12 14:09:35.000000000 +0200 @@ -34,6 +34,7 @@ #include "libadt/dict.hpp" #include "libadt/vectset.hpp" #include "memory/resourceArea.hpp" +#include "opto/safepointTable.hpp" #include "opto/idealGraphPrinter.hpp" #include "opto/phasetype.hpp" #include "opto/phase.hpp" @@ -276,6 +277,12 @@ bool can_be_reused() const { return _can_be_reused; } }; +private: + ThreadLocalSafepointTable _tls_table; + +public: + ThreadLocalSafepointTable *tls_table() { return &_tls_table; } + // Constant table. class ConstantTable { private: --- old/src/share/vm/opto/graphKit.cpp 2015-08-12 14:09:36.000000000 +0200 +++ new/src/share/vm/opto/graphKit.cpp 2015-08-12 14:09:36.000000000 +0200 @@ -4185,6 +4185,7 @@ Node* no_base = __ top(); float likely = PROB_LIKELY(0.999); float unlikely = PROB_UNLIKELY(0.999); + Node* clean_card = __ ConI((jint)CardTableModRefBS::clean_card_val()); Node* young_card = __ ConI((jint)G1SATBCardTableModRefBS::g1_young_card_val()); Node* dirty_card = __ ConI((jint)CardTableModRefBS::dirty_card_val()); Node* zeroX = __ ConX(0); @@ -4242,17 +4243,23 @@ // load the original value of the card Node* card_val = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw); - __ if_then(card_val, BoolTest::ne, young_card); { - sync_kit(ideal); - // Use Op_MemBarVolatile to achieve the effect of a StoreLoad barrier. 
- insert_mem_bar(Op_MemBarVolatile, oop_store); - __ sync_kit(this); - - Node* card_val_reload = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw); - __ if_then(card_val_reload, BoolTest::ne, dirty_card); { + if (G1ElideMembar) { + __ if_then(card_val, BoolTest::eq, clean_card); { g1_mark_card(ideal, card_adr, oop_store, alias_idx, index, index_adr, buffer, tf); } __ end_if(); - } __ end_if(); + } else { + __ if_then(card_val, BoolTest::ne, young_card); { + sync_kit(ideal); + // Use Op_MemBarVolatile to achieve the effect of a StoreLoad barrier. + insert_mem_bar(Op_MemBarVolatile, oop_store); + __ sync_kit(this); + + Node* card_val_reload = __ load(__ ctrl(), card_adr, TypeInt::INT, T_BYTE, Compile::AliasIdxRaw); + __ if_then(card_val_reload, BoolTest::ne, dirty_card); { + g1_mark_card(ideal, card_adr, oop_store, alias_idx, index, index_adr, buffer, tf); + } __ end_if(); + } __ end_if(); + } } __ end_if(); } __ end_if(); } else { --- old/src/share/vm/opto/output.cpp 2015-08-12 14:09:37.000000000 +0200 +++ new/src/share/vm/opto/output.cpp 2015-08-12 14:09:37.000000000 +0200 @@ -42,6 +42,7 @@ #include "opto/subnode.hpp" #include "opto/type.hpp" #include "runtime/handles.inline.hpp" +#include "runtime/sharedRuntime.hpp" #include "utilities/xmlstream.hpp" #ifndef PRODUCT @@ -1143,7 +1144,7 @@ // class HandlerImpl is platform-specific and defined in the *.ad files. int exception_handler_req = HandlerImpl::size_exception_handler() + MAX_stubs_size; // add marginal slop for handler int deopt_handler_req = HandlerImpl::size_deopt_handler() + MAX_stubs_size; // add marginal slop for handler - stub_req += MAX_stubs_size; // ensure per-stub margin + code_req += tls_table()->stub_size(); // ensure per-stub margin code_req += MAX_inst_size; // ensure per-instruction margin if (StressCodeBuffers) @@ -1634,6 +1635,9 @@ // Fill in exception table entries. FillExceptionTables(inct_cnt, call_returns, inct_starts, blk_labels); + // Fill in stubs for calling the runtime from safepoint polls. + tls_table()->emit(*cb, SharedRuntime::is_wide_vector(max_vector_size())); + // Only java methods have exception handlers and deopt handlers // class HandlerImpl is platform-specific and defined in the *.ad files. 
if (_method) { --- old/src/share/vm/runtime/arguments.cpp 2015-08-12 14:09:39.000000000 +0200 +++ new/src/share/vm/runtime/arguments.cpp 2015-08-12 14:09:39.000000000 +0200 @@ -36,6 +36,7 @@ #include "memory/allocation.inline.hpp" #include "memory/universe.inline.hpp" #include "oops/oop.inline.hpp" +#include "opto/safepointTable.hpp" #include "prims/jvmtiExport.hpp" #include "runtime/arguments.hpp" #include "runtime/arguments_ext.hpp" @@ -1680,6 +1681,31 @@ FLAG_SET_DEFAULT(GCTimeRatio, 9); } +#ifdef THREAD_LOCAL_SAFEPOINT_SUPPORT + if (!FLAG_IS_DEFAULT(G1ElideMembar) && G1ElideMembar) { + if (!FLAG_IS_DEFAULT(UseMembar) && !UseMembar) { + UseMembar = true; + jio_fprintf(defaultStream::error_stream(), + "When G1ElideMembar is set, UseMembar must also be set\n"); + } else { + UseMembar = true; + } + if (!FLAG_IS_DEFAULT(ThreadLocalSafepoints) && !ThreadLocalSafepoints) { + ThreadLocalSafepoints = true; + jio_fprintf(defaultStream::error_stream(), + "When G1ElideMembar is set, ThreadLocalSafepoints must also be set\n"); + } else { + ThreadLocalSafepoints = true; + } + } +#else + if (G1ElideMembar) { + jio_fprintf(defaultStream::error_stream(), + "G1ElideMembar is not supported on this platform\n"); + G1ElideMembar = false; + } +#endif + if (PrintGCDetails && Verbose) { tty->print_cr("MarkStackSize: %uk MarkStackSizeMax: %uk", (unsigned int) (MarkStackSize / K), (uint) (MarkStackSizeMax / K)); @@ -3955,6 +3981,15 @@ } #endif +#ifndef THREAD_LOCAL_SAFEPOINT_SUPPORT + if (ThreadLocalSafepoints) { + ThreadLocalSafepoints = false; + jio_fprintf(defaultStream::error_stream(), + "ThreadLocalSafepoints is not supported on this platform\n"); + + } +#endif + return JNI_OK; } --- old/src/share/vm/runtime/globals.hpp 2015-08-12 14:09:40.000000000 +0200 +++ new/src/share/vm/runtime/globals.hpp 2015-08-12 14:09:40.000000000 +0200 @@ -647,6 +647,12 @@ develop(bool, CleanChunkPoolAsync, falseInEmbedded, \ "Clean the chunk pool asynchronously") \ \ + product(bool, ThreadLocalSafepoints, false, \ + "Use thread-local safepoints instead of global polling") \ + \ + product(bool, G1ElideMembar, false, \ + "Elide G1 write barrier membar using a handshake") \ + \ experimental(bool, AlwaysSafeConstructors, false, \ "Force safe construction, as if all fields are final.") \ \ --- old/src/share/vm/runtime/osThread.hpp 2015-08-12 14:09:42.000000000 +0200 +++ new/src/share/vm/runtime/osThread.hpp 2015-08-12 14:09:42.000000000 +0200 @@ -125,6 +125,9 @@ // thread has a unique thread_id (BsdThreads or NPTL). It can be used // to access /proc. thread_id_t _thread_id; + + public: + bool is_online(); }; --- old/src/share/vm/runtime/safepoint.cpp 2015-08-12 14:09:43.000000000 +0200 +++ new/src/share/vm/runtime/safepoint.cpp 2015-08-12 14:09:43.000000000 +0200 @@ -182,7 +182,12 @@ // Make interpreter safepoint aware Interpreter::notice_safepoints(); - if (DeferPollingPageLoopCount < 0) { + if (ThreadLocalSafepoints) { + for (JavaThread *cur = Threads::first(); cur != NULL; cur = cur->next()) { + // Make sure the threads start polling whether it's time to yield. + cur->set_yieldpoint(true); + } + } else if (DeferPollingPageLoopCount < 0) { // Make polling safepoint aware guarantee (PageArmed == 0, "invariant") ; PageArmed = 1 ; @@ -288,7 +293,7 @@ // 9. On windows consider using the return value from SwitchThreadTo() // to drive subsequent spin/SwitchThreadTo()/Sleep(N) decisions.
- if (int(iterations) == DeferPollingPageLoopCount) { + if (!ThreadLocalSafepoints && int(iterations) == DeferPollingPageLoopCount) { guarantee (PageArmed == 0, "invariant") ; PageArmed = 1 ; os::make_polling_page_unreadable(); --- old/src/share/vm/runtime/sharedRuntime.cpp 2015-08-12 14:09:44.000000000 +0200 +++ new/src/share/vm/runtime/sharedRuntime.cpp 2015-08-12 14:09:44.000000000 +0200 @@ -94,14 +94,22 @@ _resolve_virtual_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_virtual_call_C), "resolve_virtual_call"); _resolve_static_call_blob = generate_resolve_blob(CAST_FROM_FN_PTR(address, SharedRuntime::resolve_static_call_C), "resolve_static_call"); + address runtime_exception_handler; + + if (ThreadLocalSafepoints) { + runtime_exception_handler = CAST_FROM_FN_PTR(address, SharedRuntime::thread_local_safepoint); + } else { + runtime_exception_handler = CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception); + } + #ifdef COMPILER2 // Vectors are generated only by C2. if (is_wide_vector(MaxVectorSize)) { - _polling_page_vectors_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_VECTOR_LOOP); + _polling_page_vectors_safepoint_handler_blob = generate_handler_blob(runtime_exception_handler, POLL_AT_VECTOR_LOOP); } #endif // COMPILER2 - _polling_page_safepoint_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_LOOP); - _polling_page_return_handler_blob = generate_handler_blob(CAST_FROM_FN_PTR(address, SafepointSynchronize::handle_polling_page_exception), POLL_AT_RETURN); + _polling_page_safepoint_handler_blob = generate_handler_blob(runtime_exception_handler, POLL_AT_LOOP); + _polling_page_return_handler_blob = generate_handler_blob(runtime_exception_handler, POLL_AT_RETURN); generate_deopt_blob(); @@ -110,6 +118,14 @@ #endif // COMPILER2 } +void SharedRuntime::thread_local_safepoint(JavaThread *thread) { + thread->set_yieldpoint(false); + thread->update_serialized_memory_version(); + if (SafepointSynchronize::is_synchronizing()) { + SafepointSynchronize::handle_polling_page_exception(thread); + } +} + #include // Implementation of SharedRuntime @@ -196,6 +212,7 @@ // G1 write-barrier pre: executed before a pointer store. JRT_LEAF(void, SharedRuntime::g1_wb_pre(oopDesc* orig, JavaThread *thread)) + thread->update_serialized_memory_version(); if (orig == NULL) { assert(false, "should be optimized out"); return; @@ -207,6 +224,7 @@ // G1 write-barrier post: executed after a pointer store. 
JRT_LEAF(void, SharedRuntime::g1_wb_post(void* card_addr, JavaThread* thread)) + thread->update_serialized_memory_version(); thread->dirty_card_queue().enqueue(card_addr); JRT_END @@ -520,13 +538,6 @@ // Look up the code blob CodeBlob *cb = CodeCache::find_blob(pc); - // Should be an nmethod - assert(cb && cb->is_nmethod(), "safepoint polling: pc must refer to an nmethod"); - - // Look up the relocation information - assert(((nmethod*)cb)->is_at_poll_or_poll_return(pc), - "safepoint polling: type must be poll"); - assert(((NativeInstruction*)pc)->is_safepoint_poll(), "Only polling locations are used for safepoint"); --- old/src/share/vm/runtime/sharedRuntime.hpp 2015-08-12 14:09:45.000000000 +0200 +++ new/src/share/vm/runtime/sharedRuntime.hpp 2015-08-12 14:09:45.000000000 +0200 @@ -175,6 +175,7 @@ // exception handling across interpreter/compiler boundaries static address raw_exception_handler_for_return_address(JavaThread* thread, address return_address); static address exception_handler_for_return_address(JavaThread* thread, address return_address); + static void thread_local_safepoint(JavaThread *thread); #if INCLUDE_ALL_GCS // G1 write barriers --- old/src/share/vm/runtime/thread.cpp 2015-08-12 14:09:46.000000000 +0200 +++ new/src/share/vm/runtime/thread.cpp 2015-08-12 14:09:46.000000000 +0200 @@ -58,6 +58,7 @@ #include "runtime/fprofiler.hpp" #include "runtime/frame.inline.hpp" #include "runtime/globals.hpp" +#include "runtime/globalSynchronizer.hpp" #include "runtime/init.hpp" #include "runtime/interfaceSupport.hpp" #include "runtime/java.hpp" @@ -92,6 +93,7 @@ #include "utilities/defaultStream.hpp" #include "utilities/dtrace.hpp" #include "utilities/events.hpp" +#include "utilities/hashtable.hpp" #include "utilities/macros.hpp" #include "utilities/preserveException.hpp" #if INCLUDE_ALL_GCS @@ -209,6 +211,8 @@ // This initial value ==> never claimed. _oops_do_parity = 0; + _java_threads_do_hp = NULL; + // the handle mark links itself to last_handle_mark new HandleMark(this); @@ -1394,6 +1398,10 @@ // Set the claimed par_id to UINT_MAX (ie not claiming any par_ids) set_claimed_par_id(UINT_MAX); + set_yieldpoint(false); + _serialized_memory_version = GlobalSynchronizer::global_serialized_memory_version(); + _force_yield = false; + set_saved_exception_pc(NULL); set_threadObj(NULL); _anchor.clear(); @@ -1489,6 +1497,15 @@ assert(deferred_card_mark().is_empty(), "Default MemRegion ctor"); } +void JavaThread::update_serialized_memory_version() { + int global_version = GlobalSynchronizer::global_serialized_memory_version(); + int local_version = OrderAccess::load_acquire(&_serialized_memory_version); + if (local_version != global_version) { + assert(local_version < global_version, "sanity"); + OrderAccess::release_store(&_serialized_memory_version, global_version); + } +} + bool JavaThread::reguard_stack(address cur_sp) { if (_stack_guard_state != stack_guard_yellow_disabled) { return true; // Stack already guarded or guard pages not needed. @@ -1526,6 +1543,13 @@ } } +bool JavaThread::is_online_vm() { + return thread_state() == _thread_in_Java; +} + +bool JavaThread::is_online_os() { + return _osthread->is_online(); +} // Remove this ifdef when C1 is ported to the compiler interface. 
static void compiler_thread_entry(JavaThread* thread, TRAPS); @@ -1664,7 +1688,7 @@ DTRACE_THREAD_PROBE(stop, this); this->exit(false); - delete this; + Threads::smr_free(this, false); } @@ -1936,7 +1960,7 @@ #endif // INCLUDE_ALL_GCS Threads::remove(this); - delete this; + Threads::smr_free(this, false); } @@ -3199,11 +3223,14 @@ // operations from having the thread being operated on from exiting // and going away unexpectedly (e.g., safepoint synchronization) -JavaThread* Threads::_thread_list = NULL; -int Threads::_number_of_threads = 0; -int Threads::_number_of_non_daemon_threads = 0; -int Threads::_return_code = 0; -int Threads::_thread_claim_parity = 0; +JavaThread* Threads::_thread_list = NULL; +JavaThread* Threads::_thread_smr_list = NULL; +JavaThread** Threads::_thread_smr_list_list = NULL; +int Threads::_number_of_threads = 0; +int Threads::_number_of_non_daemon_threads = 0; +int Threads::_return_code = 0; +int Threads::_thread_claim_parity = 0; +JavaThread **volatile Threads::_fast_java_thread_list = NULL; size_t JavaThread::_stack_size_at_create = 0; #ifdef ASSERT bool Threads::_vm_complete = false; @@ -3238,6 +3265,22 @@ // If CompilerThreads ever become non-JavaThreads, add them here } +void Threads::java_threads_do_fast(ThreadClosure *tc, Thread *self) { + JavaThread **threads; + + // Stable load of thread list w.r.t. hazard pointer for SMR + do { + threads = (JavaThread**)OrderAccess::load_ptr_acquire((volatile void*)&_fast_java_thread_list); + OrderAccess::release_store_ptr_fence((volatile void*)&self->_java_threads_do_hp, (void*)threads); + } while ((JavaThread**)OrderAccess::load_ptr_acquire((volatile void*)&_fast_java_thread_list) != threads); + if (threads == NULL) return; + + for (JavaThread **current = threads; *current != NULL; current++) { + tc->do_thread(*current); + } + OrderAccess::release_store_ptr_fence((volatile void*)&self->_java_threads_do_hp, NULL); +} + void Threads::initialize_java_lang_classes(JavaThread* main_thread, TRAPS) { TraceTime timer("Initialize java.lang classes", TraceStartupTime); @@ -3397,7 +3440,7 @@ if (!main_thread->set_as_starting_thread()) { vm_shutdown_during_initialization( "Failed necessary internal allocation. 
Out of swap space"); - delete main_thread; + smr_free(main_thread, false); *canTryAgain = false; // don't let caller call JNI_CreateJavaVM again return JNI_ENOMEM; } @@ -3412,7 +3455,7 @@ // Initialize global modules jint status = init_globals(); if (status != JNI_OK) { - delete main_thread; + smr_free(main_thread, false); *canTryAgain = false; // don't let caller call JNI_CreateJavaVM again return status; } @@ -3962,7 +4005,7 @@ notify_vm_shutdown(); - delete thread; + smr_free(thread, true); // exit_globals() will delete tty exit_globals(); @@ -3985,6 +4028,192 @@ return JNI_FALSE; } +class ThreadScanEntry: public BasicHashtableEntry { +public: + void *_pointer; + + ThreadScanEntry* next() { + return (ThreadScanEntry*)BasicHashtableEntry::next(); + } + + const void* pointer() { return _pointer; } + void set_pointer(void* pointer) { _pointer = pointer; } +}; + +class ThreadScanHashtable : public BasicHashtable { +private: + inline unsigned int compute_hash(void* pointer) { + return (unsigned int)(((uint32_t)(uintptr_t)pointer) * 2654435761u); + } + + ThreadScanEntry* bucket(int index) { + return (ThreadScanEntry*)BasicHashtable::bucket(index); + } + + ThreadScanEntry* get_entry(int index, unsigned int hash, void *pointer) { + for (ThreadScanEntry* pp = bucket(index); pp != NULL; pp = pp->next()) { + if (pp->hash() == hash && + pp->pointer() == pointer) { + return pp; + } + } + return NULL; + } + +public: + ThreadScanHashtable(int table_size) + : BasicHashtable(table_size, sizeof(ThreadScanEntry)) {} + + ThreadScanEntry* get_entry(void *pointer) { + unsigned int hash = compute_hash(pointer); + return get_entry(hash_to_index(hash), hash, pointer); + } + + ThreadScanEntry* new_entry(void *pointer) { + unsigned int hash = compute_hash(pointer); + ThreadScanEntry* pp; + pp = (ThreadScanEntry*)BasicHashtable::new_entry(hash); + pp->set_pointer(pointer); + return pp; + } + + void add_entry(ThreadScanEntry* pp) { + int index = hash_to_index(pp->hash()); + BasicHashtable::add_entry(index, pp); + } +}; + +class ScanHazardPointerThreadClosure: public ThreadClosure { +private: + ThreadScanHashtable *_table; +public: + ScanHazardPointerThreadClosure(ThreadScanHashtable *table) : _table(table) {} + + virtual void do_thread(Thread *thread) { + assert_locked_or_safepoint(Threads_lock); + assert(thread->is_Java_thread(), "sanity"); + JavaThread *const jthread = reinterpret_cast<JavaThread*>(thread); + JavaThread **threads = (JavaThread**)OrderAccess::load_ptr_acquire((volatile void*)&jthread->_java_threads_do_hp); + if (threads == NULL) return; + for (JavaThread** current = threads; *current != NULL; current++) { + JavaThread *p = *current; + if (_table->get_entry((void*)p) == NULL) { + _table->add_entry(_table->new_entry((void*)p)); + } + } + } +}; + +void Threads::smr_free(JavaThread *thread, bool have_lock) { + assert(!have_lock || Threads_lock->is_locked(), "Threads_lock inconsistency"); + JavaThread *delete_head; + if (!have_lock) { + MutexLocker ml(Threads_lock); + delete_head = smr_free_work(thread); + } else { + delete_head = smr_free_work(thread); + } + + while (delete_head != NULL) { + JavaThread *next = delete_head->next(); + delete delete_head; + delete_head = next; + } +} + +JavaThread *Threads::smr_free_work(JavaThread *thread) { + assert(Threads_lock->is_locked(), "Threads_lock should be locked"); + + thread->set_next(_thread_smr_list); + _thread_smr_list = thread; + + JavaThread *current = _thread_smr_list; + JavaThread *prev = NULL; + JavaThread *next = NULL; + JavaThread *delete_head = NULL; +
ThreadScanHashtable *scan_table = new ThreadScanHashtable(32); + ScanHazardPointerThreadClosure scan_cl(scan_table); + ALL_JAVA_THREADS(q) { + scan_cl.do_thread(q); + } + + while (current != NULL) { + next = current->next(); + if (!scan_table->get_entry((void*)current)) { + if (prev != NULL) { + prev->set_next(next); + } + if (_thread_smr_list == current) _thread_smr_list = next; + + current->set_next(delete_head); + delete_head = current; + } else { + prev = current; + } + + current = next; + } + + delete scan_table; + + return delete_head; +} + +class ScanHazardPointerThreadsClosure: public ThreadClosure { +private: + ThreadScanHashtable *_table; +public: + ScanHazardPointerThreadsClosure(ThreadScanHashtable *table) : _table(table) {} + + virtual void do_thread(Thread *thread) { + assert_locked_or_safepoint(Threads_lock); + assert(thread->is_Java_thread(), "sanity"); + JavaThread *const jthread = reinterpret_cast<JavaThread*>(thread); + JavaThread **threads = (JavaThread**)OrderAccess::load_ptr_acquire((volatile void*)&jthread->_java_threads_do_hp); + if (threads == NULL) return; + if (_table->get_entry((void*)threads) == NULL) { + _table->add_entry(_table->new_entry((void*)threads)); + } + } +}; + +void Threads::smr_free_list(JavaThread **threads) { + assert(Threads_lock->is_locked(), "Threads_lock should be locked"); + + JavaThread ***threads_header_addr = (JavaThread***)(threads - 1); + + *threads_header_addr = _thread_smr_list_list; + _thread_smr_list_list = threads; + + JavaThread **current = _thread_smr_list_list; + JavaThread **prev = NULL; + JavaThread **next = NULL; + + ThreadScanHashtable *scan_table = new ThreadScanHashtable(32); + ScanHazardPointerThreadsClosure scan_cl(scan_table); + ALL_JAVA_THREADS(q) { + scan_cl.do_thread(q); + } + + while (current != NULL) { + JavaThread ***current_header_addr = (JavaThread***)(current - 1); + next = *current_header_addr; + if (!scan_table->get_entry((void*)current)) { + if (prev != NULL) { + JavaThread ***prev_header_addr = (JavaThread***)(prev - 1); + // prev->next = current->next + *prev_header_addr = *current_header_addr; + } + if (_thread_smr_list_list == current) _thread_smr_list_list = *current_header_addr; + FREE_C_HEAP_ARRAY(JavaThread*, (JavaThread**)current_header_addr); + } else { + prev = current; + } + + current = next; + } +} void Threads::add(JavaThread* p, bool force_daemon) { // The threads lock must be owned at this point @@ -4007,6 +4236,15 @@ ThreadService::add_thread(p, daemon); + JavaThread **new_thread_list = NEW_C_HEAP_ARRAY(JavaThread*, _number_of_threads + 2, mtThread) + 1; + int i = 0; + ALL_JAVA_THREADS(q) { + new_thread_list[i++] = q; + } + new_thread_list[i] = NULL; + JavaThread **old_list = (JavaThread**)Atomic::xchg_ptr((void*)new_thread_list, (volatile void*)&_fast_java_thread_list); + if (old_list != NULL) smr_free_list(old_list); + // Possible GC point.
Events::log(p, "Thread added: " INTPTR_FORMAT, p); } @@ -4021,7 +4259,11 @@ JavaThread* current = _thread_list; JavaThread* prev = NULL; + JavaThread **new_thread_list = NEW_C_HEAP_ARRAY(JavaThread*, _number_of_threads + 1, mtThread) + 1; + int i = 0; + while (current != p) { + new_thread_list[i++] = current; prev = current; current = current->next(); } @@ -4031,6 +4273,16 @@ } else { _thread_list = p->next(); } + + current = current->next(); + while (current != NULL) { + new_thread_list[i++] = current; + current = current->next(); + } + new_thread_list[i] = NULL; + JavaThread **old_list = (JavaThread**)Atomic::xchg_ptr((void*)new_thread_list, (volatile void*)&_fast_java_thread_list); + if (old_list != NULL) smr_free_list(old_list); + _number_of_threads--; oop threadObj = p->threadObj(); bool daemon = true; --- old/src/share/vm/runtime/thread.hpp 2015-08-12 14:09:48.000000000 +0200 +++ new/src/share/vm/runtime/thread.hpp 2015-08-12 14:09:48.000000000 +0200 @@ -101,6 +101,9 @@ class Thread: public ThreadShadow { friend class VMStructs; + friend class Threads; + friend class ScanHazardPointerThreadClosure; + friend class ScanHazardPointerThreadsClosure; private: // Exception handling // (Note: _pending_exception and friends are in ThreadShadow) @@ -237,6 +240,8 @@ // claimed as a task. jint _oops_do_parity; + JavaThread **volatile _java_threads_do_hp; + public: void set_last_handle_mark(HandleMark* mark) { _last_handle_mark = mark; } HandleMark* last_handle_mark() const { return _last_handle_mark; } @@ -588,6 +593,8 @@ static ByteSize exception_line_offset() { return byte_offset_of(Thread, _exception_line); } static ByteSize active_handles_offset() { return byte_offset_of(Thread, _active_handles); } + static ByteSize yieldpoint_offset() { return byte_offset_of(Thread, _yieldpoint_poll); } + static ByteSize stack_base_offset() { return byte_offset_of(Thread, _stack_base); } static ByteSize stack_size_offset() { return byte_offset_of(Thread, _stack_size); } @@ -936,6 +943,19 @@ } _jmp_ring[jump_ring_buffer_size]; #endif // PRODUCT +private: + volatile int _serialized_memory_version; + volatile bool _force_yield; + +public: + int serialized_memory_version() { return _serialized_memory_version; } + void update_serialized_memory_version(); + + void set_force_yield() { _force_yield = true; } + + bool is_online_vm(); + bool is_online_os(); + #if INCLUDE_ALL_GCS // Support for G1 barriers @@ -1866,17 +1886,23 @@ class Threads: AllStatic { friend class VMStructs; private: - static JavaThread* _thread_list; - static int _number_of_threads; - static int _number_of_non_daemon_threads; - static int _return_code; - static int _thread_claim_parity; + static JavaThread* _thread_list; + static JavaThread* _thread_smr_list; + static JavaThread** _thread_smr_list_list; + static int _number_of_threads; + static int _number_of_non_daemon_threads; + static int _return_code; + static int _thread_claim_parity; #ifdef ASSERT - static bool _vm_complete; + static bool _vm_complete; #endif + static JavaThread **volatile _fast_java_thread_list; + static void initialize_java_lang_classes(JavaThread* main_thread, TRAPS); static void initialize_jsr292_core_classes(TRAPS); + static JavaThread *smr_free_work(JavaThread *thread); + static void smr_free_list(JavaThread **threads); public: // Thread management // force_daemon is a concession to JNI, where we may need to add a @@ -1887,6 +1913,9 @@ static JavaThread* first() { return _thread_list; } static void threads_do(ThreadClosure* tc); + static void 
java_threads_do_fast(ThreadClosure *tc, Thread *self); + static void smr_free(JavaThread *thread, bool have_lock); + + // Initializes the vm and creates the vm thread static jint create_vm(JavaVMInitArgs* args, bool* canTryAgain); static void convert_vm_init_libraries_to_agents(); --- old/src/share/vm/utilities/exceptions.hpp 2015-08-12 14:09:49.000000000 +0200 +++ new/src/share/vm/utilities/exceptions.hpp 2015-08-12 14:09:49.000000000 +0200 @@ -59,8 +59,9 @@ class ThreadShadow: public CHeapObj<mtThread> { friend class VMStructs; - protected: + char _yieldpoint_poll; + char _yieldpoint_spill[wordSize - 1]; oop _pending_exception; // Thread has gc actions. const char* _exception_file; // file information for exception (debugging only) int _exception_line; // line information for exception (debugging only) @@ -90,7 +91,15 @@ void clear_pending_exception(); ThreadShadow() : _pending_exception(NULL), - _exception_file(NULL), _exception_line(0) {} + _exception_file(NULL), _exception_line(0), _yieldpoint_poll(3) {} + + void set_yieldpoint(bool should_take_yieldpoint) { + _yieldpoint_poll = should_take_yieldpoint ? 0 : 3; + } + + bool yieldpoint() { + return _yieldpoint_poll == 3; + } }; --- old/src/share/vm/utilities/hashtable.cpp 2015-08-12 14:09:51.000000000 +0200 +++ new/src/share/vm/utilities/hashtable.cpp 2015-08-12 14:09:50.000000000 +0200 @@ -381,4 +381,5 @@ template class BasicHashtable; template class BasicHashtable; template class BasicHashtable; +template class BasicHashtable; template class BasicHashtable; --- /dev/null 2015-08-12 14:09:52.000000000 +0200 +++ new/src/cpu/x86/vm/c2_safepointTable_x86_64.cpp 2015-08-12 14:09:51.000000000 +0200 @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "opto/compile.hpp" +#include "opto/node.hpp" +#include "opto/safepointTable.hpp" +#include "runtime/sharedRuntime.hpp" + +Label &ThreadLocalSafepointTable::add_safepoint(InternalAddress safepoint_addr, bool is_return) { + ThreadLocalSafepointEntry *entry = new (Compile::current()->comp_arena()) ThreadLocalSafepointEntry(safepoint_addr, is_return); + int index = _safepoints.append(entry); + return _safepoints.at(index)->_stub_label; +} + +int ThreadLocalSafepointTable::stub_size() { + return _safepoints.length() * 14 * 2; +} + +#define __ _masm.
+void ThreadLocalSafepointTable::emit(CodeBuffer& cbuf, bool has_wide_vectors) { + //cb->insts()->freeze(); + + MacroAssembler _masm(&cbuf); + + for (int i = _safepoints.length() - 1; i >= 0; i--) { + ThreadLocalSafepointEntry &entry = *_safepoints.at(i); + + __ bind(entry._stub_label); + __ push(rax); + __ lea(rax, entry._safepoint_addr); + __ movptr(Address(r15_thread, JavaThread::saved_exception_pc_offset()), rax); + __ pop(rax); + + address stub; + + if (entry._is_return) { + assert(SharedRuntime::polling_page_return_handler_blob() != NULL, + "polling page return stub not created yet"); + stub = SharedRuntime::polling_page_return_handler_blob()->entry_point(); + } else if (has_wide_vectors) { + assert(SharedRuntime::polling_page_vectors_safepoint_handler_blob() != NULL, + "polling page safepoint stub not created yet"); + stub = SharedRuntime::polling_page_vectors_safepoint_handler_blob()->entry_point(); + } else { + assert(SharedRuntime::polling_page_safepoint_handler_blob() != NULL, + "polling page safepoint stub not created yet"); + stub = SharedRuntime::polling_page_safepoint_handler_blob()->entry_point(); + } + + RuntimeAddress callback_addr(stub); + + __ jump(callback_addr); + } + +} +#undef __ --- /dev/null 2015-08-12 14:09:53.000000000 +0200 +++ new/src/cpu/x86/vm/c2_safepointTable_x86_64.hpp 2015-08-12 14:09:52.000000000 +0200 @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_X86_VM_C2_SAFEPOINTTABLE_x86_64_HPP +#define CPU_X86_VM_C2_SAFEPOINTTABLE_x86_64_HPP + +#define THREAD_LOCAL_SAFEPOINT_SUPPORT 1 + +#include "asm/macroAssembler.hpp" +#include "utilities/growableArray.hpp" + +class ThreadLocalSafepointTable { +private: + struct ThreadLocalSafepointEntry : public ResourceObj { + InternalAddress _safepoint_addr; + Label _stub_label; + bool _is_return; + ThreadLocalSafepointEntry(InternalAddress safepoint_addr, bool is_return) : _safepoint_addr(safepoint_addr), _is_return(is_return) {} + }; + GrowableArray<ThreadLocalSafepointEntry*> _safepoints; + +public: + Label &add_safepoint(InternalAddress safepoint_addr, bool is_return); + + int stub_size(); + void emit(CodeBuffer &cb, bool has_wide_vectors); +}; + +#endif /* CPU_X86_VM_C2_SAFEPOINTTABLE_x86_64_HPP */ --- /dev/null 2015-08-12 14:09:54.000000000 +0200 +++ new/src/share/vm/opto/safepointTable.hpp 2015-08-12 14:09:53.000000000 +0200 @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_VM_OPTO_SAFEPOINTTABLE_HPP +#define SHARE_VM_OPTO_SAFEPOINTTABLE_HPP + +#if defined(AMD64) +#include "c2_safepointTable_x86_64.hpp" +#else +class ThreadLocalSafepointTable { +public: + int stub_size() { return 0; } + void emit(CodeBuffer &cb, bool has_wide_vectors) {} +}; +#endif + +#endif /* SHARE_VM_OPTO_SAFEPOINTTABLE_HPP */ --- /dev/null 2015-08-12 14:09:55.000000000 +0200 +++ new/src/share/vm/runtime/globalSynchronizer.cpp 2015-08-12 14:09:54.000000000 +0200 @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "runtime/atomic.hpp" +#include "runtime/globalSynchronizer.hpp" +#include "runtime/thread.inline.hpp" + +#ifndef MIN +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#endif + +volatile int GlobalSynchronizer::_global_serialized_memory_version = 0; +volatile int GlobalSynchronizer::_latest_global_serialized_memory_version = 0; + +int GlobalSynchronizer::global_serialized_memory_version() { + return _global_serialized_memory_version; +} + +GlobalSynchronizer::GlobalSynchronizer(UrgencyLevel start_urgency, UrgencyLevel max_urgency) + : _current_urgency(start_urgency), _max_urgency(max_urgency) { + assert(max_urgency >= start_urgency, "sanity"); + assert(start_urgency >= UrgencyLevel1 && start_urgency <= UrgencyLevelMax, "sanity"); +} + +GlobalSynchronizer::~GlobalSynchronizer() { +} + +void GlobalSynchronizer::start_synchronizing() { + assert(ThreadLocalSafepoints, "sanity"); + _local_serialized_memory_version = Atomic::add(1, &_global_serialized_memory_version); +} + +bool GlobalSynchronizer::increase_urgency() { + if (_current_urgency + 1 < _max_urgency) { + _current_urgency = (UrgencyLevel)(int(_current_urgency) + 1); + return true; + } else { + return false; + } +} + +void GlobalSynchronizer::maximize_urgency() { + _current_urgency = _max_urgency; +} + +void GlobalSynchronizer::threads_do(ThreadClosure *cl) { + Threads::java_threads_do_fast(cl, Thread::current()); +} + +class GSHasFinishedThreadClosure : public ThreadClosure { +private: + int _needed_version; + int _min_agreed_version; + bool _check_thread_state; + +public: + GSHasFinishedThreadClosure(int version, bool check_thread_state) : _needed_version(version), _min_agreed_version(INT_MAX), _check_thread_state(check_thread_state) {} + + virtual void do_thread(Thread *thread) { + JavaThread *jthread = reinterpret_cast<JavaThread*>(thread); + int thread_version = jthread->serialized_memory_version(); + if (thread_version < _needed_version) { + if (!jthread->is_online_vm()) _min_agreed_version = MIN(_needed_version, _min_agreed_version); + else if (_check_thread_state && !jthread->is_online_os()) _min_agreed_version = MIN(_needed_version, _min_agreed_version); + else _min_agreed_version = MIN(thread_version, _min_agreed_version); + } else { + _min_agreed_version = MIN(thread_version, _min_agreed_version); + } + } + + bool did_synchronize() { return _min_agreed_version >= _needed_version; } + void fixup_global_version() { + int global_version = GlobalSynchronizer::_latest_global_serialized_memory_version; + if (global_version < _min_agreed_version) { + (void) Atomic::cmpxchg(_min_agreed_version, &GlobalSynchronizer::_latest_global_serialized_memory_version, global_version); + } + } +}; + +class GSSetYieldpointThreadClosure : public ThreadClosure { + const int _target_version; + const bool _force_yields; +public: + GSSetYieldpointThreadClosure(bool force_yields, int target_version) : _force_yields(force_yields), _target_version(target_version) {} + + virtual void do_thread(Thread *thread) { + JavaThread *const jthread = (JavaThread*)thread; + if (jthread->serialized_memory_version() >= _target_version) return; + if (_force_yields) jthread->set_force_yield(); + jthread->set_yieldpoint(true); + } +}; + +bool GlobalSynchronizer::try_synchronize() { + Thread *thread = Thread::current(); + if (thread->is_Java_thread()) { + JavaThread *jthread = reinterpret_cast<JavaThread*>(thread); + jthread->update_serialized_memory_version(); + } + + if (_latest_global_serialized_memory_version >= _local_serialized_memory_version) { + return true; + } + + GSHasFinishedThreadClosure cl(_local_serialized_memory_version, /* check_thread_state */ _current_urgency >= UrgencyLevel2); + threads_do(&cl); + if (cl.did_synchronize()) { +
cl.fixup_global_version(); + return true; + } + + switch (_current_urgency) { + case UrgencyLevel3: + case UrgencyLevel4: { + GSSetYieldpointThreadClosure cl(false, _local_serialized_memory_version); + threads_do(&cl); + return false; + } + default: return false; + } +} + +void GlobalSynchronizer::synchronize() { + while (!try_synchronize()) os::naked_yield(); +} --- /dev/null 2015-08-12 14:09:56.000000000 +0200 +++ new/src/share/vm/runtime/globalSynchronizer.hpp 2015-08-12 14:09:55.000000000 +0200 @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + + +#ifndef SHARE_VM_RUNTIME_GLOBALSYNCHRONIZER_HPP +#define SHARE_VM_RUNTIME_GLOBALSYNCHRONIZER_HPP + +#include "memory/allocation.hpp" + +class JavaThread; +class ThreadClosure; + +// This class is used to coordinate global synchronization among mutator threads. +// It may be used in a lazy asynchronous way to reduce global overheads of the mechanism. + +class GlobalSynchronizer { + friend class GSHasFinishedThreadClosure; +public: + enum UrgencyLevel { + UrgencyLevel1, + // Hope for runtime to respond willingly + + UrgencyLevel2, + // Arm thread-local yieldpoints for forced handshaking + + UrgencyLevel3, + // Force running threads to yield to complete the handshake. 
+ // May also check if threads are ONPROC when this information is available + + UrgencyLevel4, + // Enforce global synchronization to finish with whatever means necessary + // and available on the platform, including IPI + + UrgencyLevelMax = UrgencyLevel4 + }; + +private: + UrgencyLevel _current_urgency; + UrgencyLevel _max_urgency; + int _threads_left; + int _local_serialized_memory_version; + +private: + void threads_do(ThreadClosure *cl); + +private: + static volatile int _global_serialized_memory_version; + static volatile int _latest_global_serialized_memory_version; + +public: + static int global_serialized_memory_version(); + +public: + virtual ~GlobalSynchronizer(); + GlobalSynchronizer(UrgencyLevel start_urgency = UrgencyLevel1, UrgencyLevel max_urgency = UrgencyLevel4); + + // Starts the synchronization process + void start_synchronizing(); + + bool increase_urgency(); + void maximize_urgency(); + + bool try_synchronize(); // For less urgent more scalable synchronization + void synchronize(); // For aggressive blocking synchronization +}; + +template +class SynchronizerObj: public GlobalSynchronizer, public CHeapObj { +public: + SynchronizerObj(UrgencyLevel start_urgency = UrgencyLevel1, UrgencyLevel max_urgency = UrgencyLevel4) + : GlobalSynchronizer(start_urgency, max_urgency) {} +}; + +#endif // SHARE_VM_RUNTIME_GLOBALSYNCHRONIZER_HPP
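For illustration, here is a minimal sketch of how a VM-internal client could drive the GlobalSynchronizer declared above to get the effect of a global StoreLoad fence via a handshake instead of fencing in every mutator. The caller name elide_membar_handshake and its escalation policy are hypothetical and not part of this patch; only the GlobalSynchronizer and os::naked_yield() calls come from the sources above, and the sketch assumes -XX:+ThreadLocalSafepoints since start_synchronizing() asserts that flag.

// Hypothetical usage sketch -- not part of the patch.
#include "runtime/globalSynchronizer.hpp"
#include "runtime/os.hpp"

static void elide_membar_handshake() {
  // Start politely, but allow escalation all the way to forced yieldpoints.
  GlobalSynchronizer gs(GlobalSynchronizer::UrgencyLevel1,
                        GlobalSynchronizer::UrgencyLevel4);
  gs.start_synchronizing();          // publish a new serialized memory version

  int attempts = 0;
  while (!gs.try_synchronize()) {    // scalable, non-blocking polling
    if (++attempts % 1000 == 0) {
      gs.increase_urgency();         // eventually arms thread-local yieldpoints
    }
    os::naked_yield();
  }
  // Every JavaThread has now observed the new version (or is offline in the
  // VM/OS), which is what allows G1ElideMembar to drop the StoreLoad membar
  // from the G1 post-barrier.
}

The blocking GlobalSynchronizer::synchronize() added by the patch is essentially this loop without the urgency escalation.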