--- old/src/gpu/hsail/vm/gpu_hsail_Tlab.hpp	2014-08-06 17:05:48.711457142 -0500
+++ new/src/gpu/hsail/vm/gpu_hsail_Tlab.hpp	2014-08-06 17:05:48.555457142 -0500
@@ -41,7 +41,7 @@
   HeapWord* _end;
   HeapWord* _last_good_top;
   HeapWord* _original_top;
-  JavaThread* _donor_thread;         // donor thread associated with this tlabInfo
+  ThreadLocalAllocBuffer* _tlab;     // tlab associated with this tlabInfo
   HSAILAllocationInfo* _alloc_info;  // same as what is in HSAILDeoptimizationInfo
 
   // Accessors
@@ -50,11 +50,12 @@
   HeapWord* end() { return _end; }
   HeapWord* last_good_top() { return _last_good_top; }
   HeapWord* original_top() { return _original_top; }
-  void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) {
+  ThreadLocalAllocBuffer* tlab() { return _tlab; }
+  void initialize(HeapWord* start, HeapWord* top, HeapWord* end, ThreadLocalAllocBuffer* tlab, HSAILAllocationInfo* allocInfo) {
     _start = start;
     _top = _original_top = top;
     _end = end;
-    _donor_thread = donorThread;
+    _tlab = tlab;
     _alloc_info = allocInfo;
   }
 };
@@ -63,54 +64,56 @@
 class HSAILAllocationInfo : public CHeapObj {
   friend class VMStructs;
 private:
-  JavaThread** donorThreads;
-  jint _num_donor_threads;
-  size_t _tlab_align_reserve_bytes;       // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes()
-  HSAILTlabInfo** _cur_tlab_infos;        // array of current tlab info pointers, one per donor_thread
+  jint _num_tlabs;
+  size_t _tlab_align_reserve_bytes;       // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes()
+  HSAILTlabInfo** _cur_tlab_infos;        // array of current tlab info pointers, one per num_tlabs
   HSAILTlabInfo* _tlab_infos_pool_start;  // pool for new tlab_infos
   HSAILTlabInfo* _tlab_infos_pool_next;   // where next will be allocated from
   HSAILTlabInfo* _tlab_infos_pool_end;    // where next will be allocated from
 
 public:
-  HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) {
-    // fill in the donorThreads array
-    objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj);
-    _num_donor_threads = donorThreadObjects->length();
-    guarantee(_num_donor_threads > 0, "need at least one donor thread");
-    donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal);
-    for (int i = 0; i < _num_donor_threads; i++) {
-      donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i));
+  HSAILAllocationInfo(jint num_tlabs, int dimX, int allocBytesPerWorkitem) {
+    _num_tlabs = num_tlabs;
+    // if this thread doesn't have gpu_hsail_tlabs allocated yet, do so now
+    JavaThread* thread = JavaThread::current();
+    if (thread->get_gpu_hsail_tlabs_count() == 0) {
+      thread->initialize_gpu_hsail_tlabs(num_tlabs);
+      if (TraceGPUInteraction) {
+        for (int i = 0; i < num_tlabs; i++) {
+          ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i);
+          tty->print("initialized gpu_hsail_tlab %d at %p -> ", i, tlab);
+          printTlabInfoFromThread(tlab);
+        }
+      }
     }
-    
+
     // Compute max_tlab_infos based on amount of free heap space
     size_t max_tlab_infos;
     {
-      JavaThread* donorThread = donorThreads[0];
-      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      ThreadLocalAllocBuffer* tlab = &thread->tlab();
       size_t new_tlab_size = tlab->compute_size(0);
-      size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread);
+      size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(thread);
       if (new_tlab_size != 0) {
-        max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads));
+        max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_tlabs));
       } else {
-        max_tlab_infos = 8 * _num_donor_threads;  // an arbitrary multiple
+        max_tlab_infos = 8 * _num_tlabs;  // an arbitrary multiple
       }
       if (TraceGPUInteraction) {
        tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos);
       }
     }
 
-    _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal);
+    _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_tlabs, mtInternal);
     _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal);
-    _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads];
+    _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_tlabs];
     _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos];
     _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes();
-    
-    // we will fill the first N tlabInfos from the donor threads
-    for (int i = 0; i < _num_donor_threads; i++) {
-      JavaThread* donorThread = donorThreads[i];
-      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+
+    // we will fill the first N tlabInfos from the gpu_hsail_tlabs
+    for (int i = 0; i < _num_tlabs; i++) {
+      ThreadLocalAllocBuffer* tlab = thread->get_gpu_hsail_tlab_at(i);
       if (TraceGPUInteraction) {
-        tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab);
+        tty->print("gpu_hsail_tlab %d at %p -> ", i, tlab);
         printTlabInfoFromThread(tlab);
       }
 
@@ -122,13 +125,13 @@
       // here, it might make sense to do a gc now rather than to start
       // the kernel and have it deoptimize.  How to do that?
       if (tlab->end() == NULL) {
-        bool success = getNewTlabForDonorThread(tlab, i);
+        bool success = getNewGpuHsailTlab(tlab);
         if (TraceGPUInteraction) {
           if (success) {
-            tty->print("donorThread %d, refilled tlab, -> ", i);
+            tty->print("gpu_hsail_tlab %d, refilled tlab, -> ", i);
             printTlabInfoFromThread(tlab);
           } else {
-            tty->print("donorThread %d, could not refill tlab, left as ", i);
+            tty->print("gpu_hsail_tlab %d, could not refill tlab, left as ", i);
             printTlabInfoFromThread(tlab);
           }
         }
@@ -137,26 +140,19 @@
       // extract the necessary tlab fields into a TlabInfo record
       HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i];
       _cur_tlab_infos[i] = pTlabInfo;
-      pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this);
-
-      // reset the real tlab fields to zero so we are sure the thread doesn't use it
-      tlab->set_start(NULL);
-      tlab->set_top(NULL);
-      tlab->set_pf_top(NULL);
-      tlab->set_end(NULL);
+      pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), tlab, this);
     }
   }
 
   ~HSAILAllocationInfo() {
     FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal);
     FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal);
-    FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal);
   }
 
   void postKernelCleanup() {
     // go thru all the tlabInfos, fix up any tlab tops that overflowed
     // complete the tlabs if they overflowed
-    // update the donor threads tlabs when appropriate
+    // update the gpu_hsail_tlabs when appropriate
     bool anyOverflows = false;
     size_t bytesAllocated = 0;
     // if there was an overflow in allocating tlabInfos, correct it here
@@ -172,8 +168,7 @@
         tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo,
                       tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top());
       }
-      JavaThread* donorThread = tlabInfo->_donor_thread;
-      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
+      ThreadLocalAllocBuffer* tlab = tlabInfo->tlab();
       bool overflowed = false;
       // if a tlabInfo has NULL fields, i.e. we could not prime it on entry,
       // or we could not get a tlab from the gpu, so ignore tlabInfo here
@@ -183,24 +178,14 @@
         overflowed = true;
         if (TraceGPUInteraction) {
           long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top();
-          tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top());
+          tty->print_cr("tlabInfo %p (tlab = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, tlab, overflowAmount, tlabInfo->last_good_top());
         }
         tlabInfo->_top = tlabInfo->last_good_top();
       }
 
-      // if the donor thread allocated anything while we were running
-      // we will retire its tlab before overwriting with our new one
-      if (tlab->top() != NULL) {
-        if (TraceGPUInteraction) {
-          tty->print("Donor Thread allocated new tlab");
-          printTlabInfoFromThread(tlab);
-        }
-        tlab->make_parsable(true);
-      }
-
-      // fill the donor thread tlab with the tlabInfo information
+      // fill the gpu_hsail_tlab with the tlabInfo information
       // we do this even if it will get overwritten by a later tlabinfo
-      // because it helps with tlab statistics for that donor thread
+      // because it helps with tlab statistics for that tlab
       tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve());
 
       // if there was an overflow, make it parsable with retire = true
@@ -231,7 +216,7 @@
 private:
   // fill and retire old tlab and get a new one
   // if we can't get one, no problem someone will eventually do a gc
-  bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) {
+  bool getNewGpuHsailTlab(ThreadLocalAllocBuffer* tlab) {
 
     tlab->clear_before_allocation();  // fill and retire old tlab (will also check for null)
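
For context, below is a minimal standalone sketch of the bookkeeping this patch moves to. It is illustration only, not HotSpot code: MockTlab and TlabInfoSketch are hypothetical stand-ins for ThreadLocalAllocBuffer and HSAILTlabInfo. It shows the two ideas the change combines: storing the tlab pointer directly in the info record (replacing _donor_thread), and, in post-kernel cleanup, clamping an overflowed top back to last_good_top before filling the tlab from the record.

// Standalone sketch (not HotSpot code). Each info record carries a direct
// pointer to the TLAB it was primed from, so cleanup can write results back
// without going through a donor JavaThread.
#include <cstddef>
#include <cstdio>

typedef unsigned long HeapWordValue;  // simplified stand-in for HeapWord
typedef HeapWordValue* HeapWord;

struct MockTlab {                     // stand-in for ThreadLocalAllocBuffer
  HeapWord _start, _top, _end;
  void fill(HeapWord start, HeapWord top, HeapWord end) {
    _start = start; _top = top; _end = end;
  }
};

struct TlabInfoSketch {               // mirrors HSAILTlabInfo's new shape
  HeapWord _start, _top, _end, _last_good_top, _original_top;
  MockTlab* _tlab;                    // direct tlab pointer replaces _donor_thread

  void initialize(HeapWord start, HeapWord top, HeapWord end, MockTlab* tlab) {
    _start = start;
    _top = _original_top = top;
    _end = end;
    _last_good_top = NULL;
    _tlab = tlab;
  }

  // Models the overflow fix-up in postKernelCleanup(): if the kernel pushed
  // top past end, clamp it back to the last good top, then write the final
  // pointers straight into the associated tlab.
  bool post_kernel_fixup() {
    bool overflowed = (_top > _end && _last_good_top != NULL);
    if (overflowed) {
      _top = _last_good_top;
    }
    _tlab->fill(_start, _top, _end);
    return overflowed;
  }
};

int main() {
  HeapWordValue heap[64];
  MockTlab tlab;
  TlabInfoSketch info;
  info.initialize(&heap[0], &heap[0], &heap[32], &tlab);

  // pretend the kernel allocated past end; the last good top was word 32
  info._top = &heap[40];
  info._last_good_top = &heap[32];

  bool overflowed = info.post_kernel_fixup();
  printf("overflowed=%d, words used=%ld\n",
         overflowed, (long)(tlab._top - tlab._start));
  return 0;
}

One design consequence worth noting when reviewing: because the record points at its own gpu_hsail_tlab rather than a Java thread's live TLAB, the patch can drop both the set_start(NULL)/set_top(NULL) zeroing on entry and the "donor thread allocated while we were running" retire path on exit.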