1 /* 2 * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifndef GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP 26 #define GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP 27 28 #include "graal/graalEnv.hpp" 29 #include "code/debugInfo.hpp" 30 #include "code/location.hpp" 31 #include "gpu_hsail.hpp" 32 33 class HSAILAllocationInfo; 34 35 class HSAILTlabInfo VALUE_OBJ_CLASS_SPEC { 36 friend class VMStructs; 37 public: 38 // uses only the necessary fields from a full TLAB 39 HeapWord* _start; 40 HeapWord* _top; 41 HeapWord* _end; 42 HeapWord* _last_good_top; 43 HeapWord* _original_top; 44 JavaThread* _donor_thread; // donor thread associated with this tlabInfo 45 HSAILAllocationInfo* _alloc_info; // same as what is in HSAILDeoptimizationInfo 46 47 // Accessors 48 HeapWord* start() { return _start; } 49 HeapWord* top() { return _top; } 50 HeapWord* end() { return _end; } 51 HeapWord* last_good_top() { return _last_good_top; } 52 HeapWord* original_top() { return _original_top; } 53 void initialize(HeapWord* start, HeapWord* top, HeapWord* end, JavaThread* donorThread, HSAILAllocationInfo* allocInfo) { 54 _start = start; 55 _top = _original_top = top; 56 _end = end; 57 _donor_thread = donorThread; 58 _alloc_info = allocInfo; 59 } 60 }; 61 62 63 class HSAILAllocationInfo : public CHeapObj<mtInternal> { 64 friend class VMStructs; 65 private: 66 JavaThread** donorThreads; 67 jint _num_donor_threads; 68 size_t _tlab_align_reserve_bytes; // filled in from ThreadLocalAllocBuffer::alignment_reserve_in_bytes() 69 HSAILTlabInfo** _cur_tlab_infos; // array of current tlab info pointers, one per donor_thread 70 HSAILTlabInfo* _tlab_infos_pool_start; // pool for new tlab_infos 71 HSAILTlabInfo* _tlab_infos_pool_next; // where next will be allocated from 72 HSAILTlabInfo* _tlab_infos_pool_end; // where next will be allocated from 73 74 public: 75 HSAILAllocationInfo(jobject donor_threads_jobj, int dimX, int allocBytesPerWorkitem) { 76 // fill in the donorThreads array 77 objArrayOop donorThreadObjects = (objArrayOop) JNIHandles::resolve(donor_threads_jobj); 78 _num_donor_threads = donorThreadObjects->length(); 79 guarantee(_num_donor_threads > 0, "need at least one donor thread"); 80 donorThreads = NEW_C_HEAP_ARRAY(JavaThread*, _num_donor_threads, mtInternal); 81 for (int i = 0; i < _num_donor_threads; i++) { 82 donorThreads[i] = java_lang_Thread::thread(donorThreadObjects->obj_at(i)); 83 } 84 85 // Compute max_tlab_infos based on amount of free heap space 86 size_t max_tlab_infos; 87 { 88 JavaThread* donorThread = donorThreads[0]; 89 ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); 90 size_t new_tlab_size = tlab->compute_size(0); 91 size_t heap_bytes_free = Universe::heap()->unsafe_max_tlab_alloc(donorThread); 92 if (new_tlab_size != 0) { 93 max_tlab_infos = MIN2(heap_bytes_free / new_tlab_size, (size_t)(64 * _num_donor_threads)); 94 } else { 95 max_tlab_infos = 8 * _num_donor_threads; // an arbitrary multiple 96 } 97 if (TraceGPUInteraction) { 98 tty->print_cr("heapFree = %ld, newTlabSize=%ld, tlabInfos allocated = %ld", heap_bytes_free, new_tlab_size, max_tlab_infos); 99 } 100 } 101 102 _cur_tlab_infos = NEW_C_HEAP_ARRAY(HSAILTlabInfo*, _num_donor_threads, mtInternal); 103 _tlab_infos_pool_start = NEW_C_HEAP_ARRAY(HSAILTlabInfo, max_tlab_infos, mtInternal); 104 _tlab_infos_pool_next = &_tlab_infos_pool_start[_num_donor_threads]; 105 _tlab_infos_pool_end = &_tlab_infos_pool_start[max_tlab_infos]; 106 _tlab_align_reserve_bytes = ThreadLocalAllocBuffer::alignment_reserve_in_bytes(); 107 108 // we will fill the first N tlabInfos from the donor threads 109 for (int i = 0; i < _num_donor_threads; i++) { 110 JavaThread* donorThread = donorThreads[i]; 111 ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); 112 if (TraceGPUInteraction) { 113 tty->print("donorThread %d, is %p, tlab at %p -> ", i, donorThread, tlab); 114 printTlabInfoFromThread(tlab); 115 } 116 117 // Here we try to get a new tlab if current one is null. Note: 118 // eventually we may want to test if the size is too small based 119 // on some heuristic where we see how much this kernel tends to 120 // allocate, but for now we can just let it overflow and let the 121 // GPU allocate new tlabs. Actually, if we can't prime a tlab 122 // here, it might make sense to do a gc now rather than to start 123 // the kernel and have it deoptimize. How to do that? 124 if (tlab->end() == NULL) { 125 bool success = getNewTlabForDonorThread(tlab, i); 126 if (TraceGPUInteraction) { 127 if (success) { 128 tty->print("donorThread %d, refilled tlab, -> ", i); 129 printTlabInfoFromThread(tlab); 130 } else { 131 tty->print("donorThread %d, could not refill tlab, left as ", i); 132 printTlabInfoFromThread(tlab); 133 } 134 } 135 } 136 137 // extract the necessary tlab fields into a TlabInfo record 138 HSAILTlabInfo* pTlabInfo = &_tlab_infos_pool_start[i]; 139 _cur_tlab_infos[i] = pTlabInfo; 140 pTlabInfo->initialize(tlab->start(), tlab->top(), tlab->end(), donorThread, this); 141 142 // reset the real tlab fields to zero so we are sure the thread doesn't use it 143 tlab->reset_to_null(); 144 } 145 } 146 147 ~HSAILAllocationInfo() { 148 FREE_C_HEAP_ARRAY(HSAILTlabInfo*, _cur_tlab_infos, mtInternal); 149 FREE_C_HEAP_ARRAY(HSAILTlabInfo, _tlab_infos_pool_start, mtInternal); 150 FREE_C_HEAP_ARRAY(JavaThread*, donorThreads, mtInternal); 151 } 152 153 void postKernelCleanup() { 154 // go thru all the tlabInfos, fix up any tlab tops that overflowed 155 // complete the tlabs if they overflowed 156 // update the donor threads tlabs when appropriate 157 bool anyOverflows = false; 158 size_t bytesAllocated = 0; 159 // if there was an overflow in allocating tlabInfos, correct it here 160 if (_tlab_infos_pool_next > _tlab_infos_pool_end) { 161 if (TraceGPUInteraction) { 162 int overflowAmount = _tlab_infos_pool_next - _tlab_infos_pool_end; 163 tty->print_cr("tlabInfo allocation overflowed by %d units", overflowAmount); 164 } 165 _tlab_infos_pool_next = _tlab_infos_pool_end; 166 } 167 for (HSAILTlabInfo* tlabInfo = _tlab_infos_pool_start; tlabInfo < _tlab_infos_pool_next; tlabInfo++) { 168 if (TraceGPUInteraction) { 169 tty->print_cr("postprocess tlabInfo %p, start=%p, top=%p, end=%p, last_good_top=%p", tlabInfo, 170 tlabInfo->start(), tlabInfo->top(), tlabInfo->end(), tlabInfo->last_good_top()); 171 } 172 JavaThread* donorThread = tlabInfo->_donor_thread; 173 ThreadLocalAllocBuffer* tlab = &donorThread->tlab(); 174 bool overflowed = false; 175 // if a tlabInfo has NULL fields, i.e. we could not prime it on entry, 176 // or we could not get a tlab from the gpu, so ignore tlabInfo here 177 if (tlabInfo->start() != NULL) { 178 if (tlabInfo->top() > tlabInfo->end()) { 179 anyOverflows = true; 180 overflowed = true; 181 if (TraceGPUInteraction) { 182 long overflowAmount = (long) tlabInfo->top() - (long) tlabInfo->last_good_top(); 183 tty->print_cr("tlabInfo %p (donorThread = %p) overflowed by %ld bytes, setting last good top to %p", tlabInfo, donorThread, overflowAmount, tlabInfo->last_good_top()); 184 } 185 tlabInfo->_top = tlabInfo->last_good_top(); 186 } 187 188 // see if the donor thread actually tried to allocate anything while we were running 189 // if so we will retire that one as we overwrite with our new one 190 if (tlab->top() != NULL) { 191 if (TraceGPUInteraction) { 192 tty->print("Donor Thread allocated new tlab"); 193 printTlabInfoFromThread(tlab); 194 } 195 tlab->make_parsable(true); 196 } 197 198 // fill the donor thread tlab with the tlabInfo information 199 // we do this even if it will get overwritten by a later tlabinfo 200 // because it helps with tlab statistics for that donor thread 201 tlab->fill(tlabInfo->start(), tlabInfo->top(), (tlabInfo->end() - tlabInfo->start()) + tlab->alignment_reserve()); 202 203 // if there was an overflow, make it parsable with retire = true 204 if (overflowed) { 205 tlab->make_parsable(true); 206 } 207 208 size_t delta = (long)(tlabInfo->top()) - (long)(tlabInfo->original_top()); 209 if (TraceGPUInteraction) { 210 tty->print_cr("%ld bytes were allocated by tlabInfo %p (start %p, top %p, end %p", delta, tlabInfo, 211 tlabInfo->start(), tlabInfo->top(), tlabInfo->end()); 212 } 213 bytesAllocated += delta; 214 } 215 } 216 if (TraceGPUInteraction) { 217 tty->print_cr("%ld total bytes were allocated in this kernel", bytesAllocated); 218 } 219 if (anyOverflows) { 220 // Hsail::kernelStats.incOverflows(); 221 } 222 } 223 224 HSAILTlabInfo** getCurTlabInfos() { 225 return _cur_tlab_infos; 226 } 227 228 private: 229 // fill and retire old tlab and get a new one 230 // if we can't get one, no problem someone will eventually do a gc 231 bool getNewTlabForDonorThread(ThreadLocalAllocBuffer* tlab, int idx) { 232 233 tlab->clear_before_allocation(); // fill and retire old tlab (will also check for null) 234 235 // get a size for a new tlab that is based on the desired_size 236 size_t new_tlab_size = tlab->compute_size(0); 237 if (new_tlab_size == 0) return false; 238 239 HeapWord* tlab_start = Universe::heap()->allocate_new_tlab(new_tlab_size); 240 if (tlab_start == NULL) return false; 241 242 // ..and clear it if required 243 if (ZeroTLAB) { 244 Copy::zero_to_words(tlab_start, new_tlab_size); 245 } 246 // and init the tlab pointers 247 tlab->fill(tlab_start, tlab_start, new_tlab_size); 248 return true; 249 } 250 251 void printTlabInfoFromThread (ThreadLocalAllocBuffer* tlab) { 252 HeapWord* start = tlab->start(); 253 HeapWord* top = tlab->top(); 254 HeapWord* end = tlab->end(); 255 // sizes are in bytes 256 size_t tlabFree = tlab->free() * HeapWordSize; 257 size_t tlabUsed = tlab->used() * HeapWordSize; 258 size_t tlabSize = tlabFree + tlabUsed; 259 double freePct = 100.0 * (double) tlabFree/(double) tlabSize; 260 tty->print_cr("(%p, %p, %p), siz=%ld, free=%ld (%f%%)", start, top, end, tlabSize, tlabFree, freePct); 261 } 262 263 }; 264 265 #endif // GPU_HSAIL_VM_GPU_HSAIL_TLAB_HPP