/*
 * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP
#define GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP

#include "graal/graalEnv.hpp"
#include "code/debugInfo.hpp"
#include "code/location.hpp"
#include "gpu_hsail.hpp"

class HSAILAllocationInfo;

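// A record of the tlab bounds handed to the GPU: a snapshot of just the fields
// the HSAIL kernel needs from a full ThreadLocalAllocBuffer. The kernel side is
// presumably what advances _top and records _last_good_top while it runs;
// postKernelCleanup() uses those fields to detect overflow and to compute how
// much was allocated.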
class HSAILTlabInfo VALUE_OBJ_CLASS_SPEC {
  friend class VMStructs;
public:
  // uses only the necessary fields from a full TLAB
  HeapWord* _start;
  HeapWord* _top;
  HeapWord* _end;
  HeapWord* _last_good_top;          // top value to restore if this tlab overflowed (see postKernelCleanup)
  HeapWord* _original_top;           // top at initialize() time, used to compute bytes allocated by the kernel
  JavaThread* _donor_thread;         // donor thread associated with this tlabInfo
  HSAILAllocationInfo* _alloc_info;  // same as what is in HSAILDeoptimizationInfo

  // Accessors
  HeapWord* start() { return _start; }
  HeapWord* top() { return _top; }
  HeapWord* end() { return _end; }
  HeapWord* last_good_top() { return _last_good_top; }
  HeapWord* original_top() { return _original_top; }
  void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) {
    _start = start;
    _top = _original_top = top;
    _end = end;
    _donor_thread = donorThread;
    _alloc_info = allocInfo;
  }
};


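// Bookkeeping for tlab allocation across one HSAIL kernel dispatch: primes
// HSAILTlabInfo records from the donor threads' tlabs before the kernel runs,
// provides a pre-allocated pool from which further tlabInfos can be carved
// while it runs, and reconciles everything back into the donor threads' real
// tlabs in postKernelCleanup().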
class HSAILAllocationInfo : public CHeapObj<mtInternal> {
  friend class VMStructs;
private:
  JavaThread** donorThreads;
  jint _num_donor_threads;
  size_t _tlab_align_reserve_bytes;    // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes()
  HSAILTlabInfo** _cur_tlab_infos;    // array of current tlab info pointers, one per donor_thread
  HSAILTlabInfo* _tlab_infos_pool_start;    // pool for new tlab_infos
  HSAILTlabInfo* _tlab_infos_pool_next;     // where the next tlab_info will be allocated from
  HSAILTlabInfo* _tlab_infos_pool_end;      // end of the tlab_infos pool

public:
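  // Resolve the donor threads from the supplied JNI handle, size and allocate a
  // pool of HSAILTlabInfo records based on the free heap space, and prime the
  // first _num_donor_threads entries from the donor threads' own tlabs.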
  HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) {
    // fill in the donorThreads array
    objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj);
    _num_donor_threads = donorThreadObjects->length();
    guarantee(_num_donor_threads > 0, "need at least one donor thread");
    donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal);
    for (int i = 0; i < _num_donor_threads; i++) {
      donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i));
    }

    // Compute max_tlab_infos based on amount of free heap space
    size_t max_tlab_infos;
    {
      JavaThread* donorThread = donorThreads[0];
      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
      size_t new_tlab_size = tlab->compute_size(0);
      size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread);
      if (new_tlab_size != 0) {
        max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads));
      } else {
        max_tlab_infos = 8 * _num_donor_threads;   // an arbitrary multiple
      }
      if (TraceGPUInteraction) {
        tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos);
      }
    }

    _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal);
    _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal);
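    // the first _num_donor_threads pool entries are reserved for the donor threads
    // (primed in the loop below); new tlabInfos are handed out starting after them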
    _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads];
    _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos];
    _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes();

    // we will fill the first N tlabInfos from the donor threads
    for (int i = 0; i < _num_donor_threads; i++) {
      JavaThread* donorThread = donorThreads[i];
      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
      if (TraceGPUInteraction) {
        tty->print("donorThread %d is %p, tlab at %p -> ", i, donorThread, tlab);
        printTlabInfoFromThread(tlab);
      }

      // Here we try to get a new tlab if the current one is null. Note:
      // eventually we may want to test if the size is too small based
      // on some heuristic where we see how much this kernel tends to
      // allocate, but for now we can just let it overflow and let the
      // GPU allocate new tlabs. Actually, if we can't prime a tlab
      // here, it might make sense to do a gc now rather than to start
      // the kernel and have it deoptimize.  How to do that?
      if (tlab->end() == NULL) {
        bool success = getNewTlabForDonorThread(tlab, i);
        if (TraceGPUInteraction) {
          if (success) {
            tty->print("donorThread %d, refilled tlab, -> ", i);
            printTlabInfoFromThread(tlab);
          } else {
            tty->print("donorThread %d, could not refill tlab, left as ", i);
            printTlabInfoFromThread(tlab);
          }
        }
      }

      // extract the necessary tlab fields into a TlabInfo record
      HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i];
      _cur_tlab_infos[i] = pTlabInfo;
      pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this);

      // reset the real tlab fields to zero so we are sure the thread doesn't use it
      tlab->reset_to_null();
    }
  }

  ~HSAILAllocationInfo() {
    FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal);
    FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal);
    FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal);
  }

  void postKernelCleanup() {
    // go through all the tlabInfos: fix up any tlab tops that overflowed,
    // retire the tlabs that overflowed, and update the donor threads' tlabs
    // where appropriate
    bool anyOverflows = false;
    size_t bytesAllocated = 0;
    // if there was an overflow in allocating tlabInfos, correct it here
    if (_tlab_infos_pool_next > _tlab_infos_pool_end) {
      if (TraceGPUInteraction) {
        int overflowAmount = _tlab_infos_pool_next - _tlab_infos_pool_end;
        tty->print_cr("tlabInfo allocation overflowed by %d units", overflowAmount);
      }
      _tlab_infos_pool_next = _tlab_infos_pool_end;
    }
    for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) {
      if (TraceGPUInteraction) {
        tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo,
                      tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top());
      }
      JavaThread* donorThread = tlabInfo->_donor_thread;
      ThreadLocalAllocBuffer* tlab = &donorThread->tlab();
      bool overflowed = false;
      // a tlabInfo can have NULL fields if we could not prime it on entry
      // or could not get a tlab from the gpu; ignore such tlabInfos here
      if (tlabInfo->start() != NULL) {
        if (tlabInfo->top() > tlabInfo->end()) {
          anyOverflows = true;
          overflowed = true;
          if (TraceGPUInteraction) {
            long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top();
            tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, resetting top to last good top %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top());
          }
          tlabInfo->_top = tlabInfo->last_good_top();
        }

        // see if the donor thread actually allocated a new tlab while we were running;
        // if so, retire that one since we are about to overwrite it with ours
        if (tlab->top() != NULL) {
          if (TraceGPUInteraction) {
            tty->print("Donor Thread allocated new tlab ");
            printTlabInfoFromThread(tlab);
          }
          tlab->make_parsable(true);
        }

        // fill the donor thread tlab with the tlabInfo information
        // we do this even if it will get overwritten by a later tlabInfo
        // because it helps with tlab statistics for that donor thread
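        // note: tlab->end() excludes the alignment reserve, so add it back to
        // recover the full tlab size (in words) that fill() expects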
        tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve());

        // if there was an overflow, make it parsable with retire = true
        if (overflowed) {
          tlab->make_parsable(true);
        }

        size_t delta = (long)(tlabInfo->top()) - (long)(tlabInfo->original_top());
        if (TraceGPUInteraction) {
          tty->print_cr("%ld bytes were allocated by tlabInfo %p (start %p, top %p, end %p)", delta, tlabInfo,
                        tlabInfo->start(), tlabInfo->top(), tlabInfo->end());
        }
        bytesAllocated += delta;
      }
    }
    if (TraceGPUInteraction) {
      tty->print_cr("%ld total bytes were allocated in this kernel", bytesAllocated);
    }
    if (anyOverflows) {
      // Hsail::kernelStats.incOverflows();
    }
  }

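  // accessor for the per-donor-thread array of current tlabInfo pointers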
  HSAILTlabInfo** getCurTlabInfos() {
    return _cur_tlab_infos;
  }

private:
  // fill and retire the old tlab and get a new one;
  // if we can't get one, no problem, someone will eventually do a gc
  bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) {

    tlab->clear_before_allocation();    // fill and retire old tlab (will also check for null)

    // get a size for a new tlab that is based on the desired_size
    size_t new_tlab_size = tlab->compute_size(0);
    if (new_tlab_size == 0) return false;

    HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size);
    if (tlab_start == NULL) return false;

    // ..and clear it if required
    if (ZeroTLAB) {
      Copy::zero_to_words(tlab_start, new_tlab_size);
    }
    // and init the tlab pointers
    tlab->fill(tlab_start, tlab_start, new_tlab_size);
    return true;
  }

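  // print a thread's tlab bounds plus size/free stats (sizes in bytes)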
  void printTlabInfoFromThread (ThreadLocalAllocBuffer* tlab) {
    HeapWord* start = tlab->start();
    HeapWord* top = tlab->top();
    HeapWord* end = tlab->end();
    // sizes are in bytes
    size_t tlabFree = tlab->free() * HeapWordSize;
    size_t tlabUsed = tlab->used() * HeapWordSize;
    size_t tlabSize = tlabFree + tlabUsed;
    double freePct = 100.0 * (double) tlabFree / (double) tlabSize;
    tty->print_cr("(%p, %p, %p), size=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct);
  }

};

#endif // GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP