68 import com.oracle.graal.lir.hsail.HSAILControlFlow.DeoptimizingOp; 69 import com.oracle.graal.nodes.*; 70 import com.oracle.graal.nodes.StructuredGraph.GuardsStage; 71 import com.oracle.graal.nodes.extended.*; 72 import com.oracle.graal.nodes.java.*; 73 import com.oracle.graal.nodes.spi.*; 74 import com.oracle.graal.nodes.virtual.*; 75 import com.oracle.graal.options.*; 76 import com.oracle.graal.phases.*; 77 import com.oracle.graal.phases.tiers.*; 78 import com.oracle.graal.virtual.nodes.*; 79 80 /** 81 * HSAIL specific backend. 82 */ 83 public class HSAILHotSpotBackend extends HotSpotBackend { 84 /** Holder for the option values declared by this backend. The only option declared here is {@code HsailDonorThreads}, an {@code OptionValue<Integer>} constructed with the value 4. */ 85 public static class Options { 86 87 // @formatter:off 88 @Option(help = "Number of donor threads for HSAIL kernel dispatch") 89 static public final OptionValue<Integer> HsailDonorThreads = new OptionValue<>(4); 90 // @formatter:on 91 } 92 93 private Map<String, String> paramTypeMap = new HashMap<>(); 94 private final boolean deviceInitialized; 95 // TODO: get maximum Concurrency from okra 96 private int maxDeoptIndex = 8 * 40 * 64; // see gpu_hsail.hpp 97 98 public HSAILHotSpotBackend(HotSpotGraalRuntime runtime, HotSpotProviders providers) { 99 super(runtime, providers); 100 paramTypeMap.put("HotSpotResolvedPrimitiveType<int>", "s32"); 101 paramTypeMap.put("HotSpotResolvedPrimitiveType<float>", "f32"); 102 paramTypeMap.put("HotSpotResolvedPrimitiveType<double>", "f64"); 103 paramTypeMap.put("HotSpotResolvedPrimitiveType<long>", "s64"); 104 105 /* 106 * The order of the conjunction below is important: the OkraUtil call may provision the 107 * native library required by the initialize() call 108 */ 109 deviceInitialized = OkraUtil.okraLibExists() && initialize(); 352 if (hostCode.getAssumptions() != null) { 353 for (Assumption assumption : hostCode.getAssumptions().getAssumptions()) { 354 if (assumption != null) { 355 mergedAssumptions.record(assumption); 356 } 357 } 358 } 359 if (hsailCode.getAssumptions() != null) { 360 for (Assumption assumption : 
hsailCode.getAssumptions().getAssumptions()) { 361 if (assumption != null) { 362 mergedAssumptions.record(assumption); 363 } 364 } 365 } 366 if (!mergedAssumptions.isEmpty()) { 367 result.setAssumptions(mergedAssumptions); 368 } 369 return result; 370 } 371 /** Thread-local {@code DonorThreadPool}; each thread's initial value is a newly constructed pool. */ 372 private static final ThreadLocal<DonorThreadPool> donorThreadPool = new ThreadLocal<DonorThreadPool>() { 373 @Override 374 protected DonorThreadPool initialValue() { 375 return new DonorThreadPool(); 376 } 377 }; 378 /** Runs an installed kernel by delegating to the native {@code executeKernel0}. The kernel is cast to {@code HSAILHotSpotNmethod} to read its oop map array and its uses-allocation flag; the calling thread's donor threads are passed only when that flag is set, otherwise {@code null} is passed. The value of {@code HsailAllocBytesPerWorkitem} is forwarded as well. @param kernel the installed kernel, cast to {@code HSAILHotSpotNmethod} without a prior type check @param jobSize forwarded unchanged to the native call @param args forwarded unchanged to the native call @return the value returned by the native call @throws GraalInternalError if {@code deviceInitialized} is false @throws InvalidInstalledCodeException as declared by the native call */ 379 public boolean executeKernel(HotSpotInstalledCode kernel, int jobSize, Object[] args) throws InvalidInstalledCodeException { 380 if (!deviceInitialized) { 381 throw new GraalInternalError("Cannot execute GPU kernel if device is not initialized"); 382 } 383 int[] oopMapArray = ((HSAILHotSpotNmethod) kernel).getOopMapArray(); 384 385 // Pass donorThreadPoolArray if this kernel uses allocation, otherwise null 386 Thread[] donorThreadArray = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? donorThreadPool.get().getThreads() : null; 387 return executeKernel0(kernel, jobSize, args, donorThreadArray, HsailAllocBytesPerWorkitem.getValue(), oopMapArray); 388 } 389 /** Native entry point called by executeKernel with the arguments assembled there; the implementation is not part of this file. */ 390 private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, Thread[] donorThreads, int allocBytesPerWorkitem, int[] oopMapArray) 391 throws InvalidInstalledCodeException; 392 393 /** 394 * Use the HSAIL register set when the compilation target is HSAIL. 
395 */ 396 @Override 397 public FrameMap newFrameMap(RegisterConfig registerConfig) { 398 return new HSAILFrameMap(getCodeCache(), registerConfig); 399 } 400 /** Creates an {@code HSAILHotSpotLIRGenerator} from this backend's providers and runtime configuration together with the given calling convention and LIR generation result. */ 401 @Override 402 public LIRGeneratorTool newLIRGenerator(CallingConvention cc, LIRGenerationResult lirGenRes) { 403 return new HSAILHotSpotLIRGenerator(getProviders(), getRuntime().getConfig(), cc, lirGenRes); 404 } 405 /** Creates an {@code HSAILHotSpotLIRGenerationResult} for {@code lir} and {@code frameMap}; the {@code method} and {@code stub} arguments are not used. */ 406 @Override 407 public LIRGenerationResult newLIRGenerationResult(LIR lir, FrameMap frameMap, ResolvedJavaMethod method, Object stub) { 408 return new HSAILHotSpotLIRGenerationResult(lir, frameMap); 409 } 410 616 * loaded up front but will be loaded as needed. 617 */ 618 for (int i = 0; i < nonConstantParamCount; i++) { 619 asm.emitString("ld_kernarg_" + paramHsailSizes[i] + " " + HSAIL.mapRegister(cc.getArgument(i)) + ", [" + paramNames[i] + "];"); 620 } 621 622 /* 623 * Emit the workitemabsid instruction for loading the hidden gid parameter. This is assigned 624 * the register as if it were the last of the nonConstant parameters. 
625 */ 626 String workItemReg = "$s" + Integer.toString(asRegister(cc.getArgument(nonConstantParamCount)).encoding()); 627 asm.emitString("workitemabsid_u32 " + workItemReg + ", 0;"); 628 629 final String deoptInProgressLabel = "@LHandleDeoptInProgress"; 630 631 if (useHSAILDeoptimization) { 632 // Aliases for d16 633 RegisterValue d16_deoptInfo = HSAIL.d16.asValue(wordLIRKind); 634 635 // Aliases for d17 636 RegisterValue d17_donorThreadIndex = HSAIL.d17.asValue(wordLIRKind); 637 RegisterValue d17_safepointFlagAddrIndex = d17_donorThreadIndex; 638 639 // Aliases for s34 640 RegisterValue s34_deoptOccurred = HSAIL.s34.asValue(LIRKind.value(Kind.Int)); 641 RegisterValue s34_donorThreadIndex = s34_deoptOccurred; 642 643 asm.emitLoadKernelArg(d16_deoptInfo, asm.getDeoptInfoName(), "u64"); 644 asm.emitComment("// Check if a deopt or safepoint has occurred and abort if true before doing any work"); 645 646 if (useHSAILSafepoints) { 647 // Load address of _notice_safepoints field 648 asm.emitLoad(wordKind, d17_safepointFlagAddrIndex, new HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailNoticeSafepointsOffset).toAddress()); 649 // Load int value from that field 650 asm.emitLoadAcquire(s34_deoptOccurred, new HSAILAddressValue(wordLIRKind, d17_safepointFlagAddrIndex, 0).toAddress()); 651 asm.emitCompare(Kind.Int, s34_deoptOccurred, Constant.forInt(0), "ne", false, false); 652 asm.cbr(deoptInProgressLabel); 653 } 654 asm.emitLoadAcquire(s34_deoptOccurred, new HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailDeoptOccurredOffset).toAddress()); 655 asm.emitCompare(Kind.Int, s34_deoptOccurred, Constant.forInt(0), "ne", false, false); 656 asm.cbr(deoptInProgressLabel); 657 // load thread register if this kernel performs allocation 658 if (usesAllocation) { 659 RegisterValue threadReg = getProviders().getRegisters().getThreadRegister().asValue(wordLIRKind); 660 assert HsailDonorThreads.getValue() > 0; 661 asm.emitLoad(wordKind, threadReg, new 
HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailCurTlabInfoOffset).toAddress()); 662 if (HsailDonorThreads.getValue() != 1) { 663 asm.emitComment("// map workitem to a donor thread"); 664 asm.emitString(String.format("rem_u32 $%s, %s, %d;", s34_donorThreadIndex.getRegister(), workItemReg, HsailDonorThreads.getValue())); 665 asm.emitConvert(d17_donorThreadIndex, s34_donorThreadIndex, wordKind, Kind.Int); 666 asm.emit("mad", threadReg, d17_donorThreadIndex, Constant.forInt(8), threadReg); 667 } else { 668 // workitem is already mapped to solitary donor thread 669 } 670 asm.emitComment("// $" + getProviders().getRegisters().getThreadRegister() + " will point to holder of tlab thread info for this workitem"); 671 } 672 } 673 674 /* 675 * Note the logic used for this spillseg size is to leave space and then go back and patch 676 * in the correct size once we have generated all the instructions. This should probably be 677 * done in a more robust way by implementing something like asm.insertString. 678 */ 679 int spillsegDeclarationPosition = asm.position() + 1; 680 String spillsegTemplate = "align 4 spill_u8 %spillseg[123456];"; 681 asm.emitString(spillsegTemplate); 682 // Emit object array load prologue here. 
683 if (isObjectLambda) { 684 boolean useCompressedOops = config.useCompressedOops; 685 final int arrayElementsOffset = HotSpotGraalRuntime.getArrayBaseOffset(wordKind); 686 String iterationObjArgReg = HSAIL.mapRegister(cc.getArgument(nonConstantParamCount - 1)); 687 /* 688 * iterationObjArgReg will be the highest $d register in use (it is the last parameter) | 68 import com.oracle.graal.lir.hsail.HSAILControlFlow.DeoptimizingOp; 69 import com.oracle.graal.nodes.*; 70 import com.oracle.graal.nodes.StructuredGraph.GuardsStage; 71 import com.oracle.graal.nodes.extended.*; 72 import com.oracle.graal.nodes.java.*; 73 import com.oracle.graal.nodes.spi.*; 74 import com.oracle.graal.nodes.virtual.*; 75 import com.oracle.graal.options.*; 76 import com.oracle.graal.phases.*; 77 import com.oracle.graal.phases.tiers.*; 78 import com.oracle.graal.virtual.nodes.*; 79 80 /** 81 * HSAIL specific backend. 82 */ 83 public class HSAILHotSpotBackend extends HotSpotBackend { 84 /** Holder for the option values declared by this backend. The only option declared here is {@code HsailKernelTlabs}, an {@code OptionValue<Integer>} constructed with the value 4. */ 85 public static class Options { 86 87 // @formatter:off 88 @Option(help = "Number of TLABs used for HSAIL kernels which allocate") 89 static public final OptionValue<Integer> HsailKernelTlabs = new OptionValue<>(4); 90 // @formatter:on 91 } 92 93 private Map<String, String> paramTypeMap = new HashMap<>(); 94 private final boolean deviceInitialized; 95 // TODO: get maximum Concurrency from okra 96 private int maxDeoptIndex = 8 * 40 * 64; // see gpu_hsail.hpp 97 98 public HSAILHotSpotBackend(HotSpotGraalRuntime runtime, HotSpotProviders providers) { 99 super(runtime, providers); 100 paramTypeMap.put("HotSpotResolvedPrimitiveType<int>", "s32"); 101 paramTypeMap.put("HotSpotResolvedPrimitiveType<float>", "f32"); 102 paramTypeMap.put("HotSpotResolvedPrimitiveType<double>", "f64"); 103 paramTypeMap.put("HotSpotResolvedPrimitiveType<long>", "s64"); 104 105 /* 106 * The order of the conjunction below is important: the OkraUtil call may provision the 107 * native library required by the initialize() call 108 */ 109 
deviceInitialized = OkraUtil.okraLibExists() && initialize(); 352 if (hostCode.getAssumptions() != null) { 353 for (Assumption assumption : hostCode.getAssumptions().getAssumptions()) { 354 if (assumption != null) { 355 mergedAssumptions.record(assumption); 356 } 357 } 358 } 359 if (hsailCode.getAssumptions() != null) { 360 for (Assumption assumption : hsailCode.getAssumptions().getAssumptions()) { 361 if (assumption != null) { 362 mergedAssumptions.record(assumption); 363 } 364 } 365 } 366 if (!mergedAssumptions.isEmpty()) { 367 result.setAssumptions(mergedAssumptions); 368 } 369 return result; 370 } 371 /** Runs an installed kernel by delegating to the native {@code executeKernel0}. The kernel is cast to {@code HSAILHotSpotNmethod} to read its oop map array and its uses-allocation flag; the value of {@code HsailKernelTlabs} is passed as the TLAB count only when that flag is set, otherwise 0 is passed. The value of {@code HsailAllocBytesPerWorkitem} is forwarded as well. @param kernel the installed kernel, cast to {@code HSAILHotSpotNmethod} without a prior type check @param jobSize forwarded unchanged to the native call @param args forwarded unchanged to the native call @return the value returned by the native call @throws GraalInternalError if {@code deviceInitialized} is false @throws InvalidInstalledCodeException as declared by the native call */ 372 public boolean executeKernel(HotSpotInstalledCode kernel, int jobSize, Object[] args) throws InvalidInstalledCodeException { 373 if (!deviceInitialized) { 374 throw new GraalInternalError("Cannot execute GPU kernel if device is not initialized"); 375 } 376 int[] oopMapArray = ((HSAILHotSpotNmethod) kernel).getOopMapArray(); 377 378 // Pass HsailKernelTlabs number if this kernel uses allocation, otherwise 0 379 int numTlabs = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? HsailKernelTlabs.getValue() : 0; 380 return executeKernel0(kernel, jobSize, args, numTlabs, HsailAllocBytesPerWorkitem.getValue(), oopMapArray); 381 } 382 /** Native entry point called by executeKernel with the arguments assembled there; the implementation is not part of this file. */ 383 private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, int numTlabs, int allocBytesPerWorkitem, int[] oopMapArray) 384 throws InvalidInstalledCodeException; 385 386 /** 387 * Use the HSAIL register set when the compilation target is HSAIL. 
388 */ 389 @Override 390 public FrameMap newFrameMap(RegisterConfig registerConfig) { 391 return new HSAILFrameMap(getCodeCache(), registerConfig); 392 } 393 /** Creates an {@code HSAILHotSpotLIRGenerator} from this backend's providers and runtime configuration together with the given calling convention and LIR generation result. */ 394 @Override 395 public LIRGeneratorTool newLIRGenerator(CallingConvention cc, LIRGenerationResult lirGenRes) { 396 return new HSAILHotSpotLIRGenerator(getProviders(), getRuntime().getConfig(), cc, lirGenRes); 397 } 398 /** Creates an {@code HSAILHotSpotLIRGenerationResult} for {@code lir} and {@code frameMap}; the {@code method} and {@code stub} arguments are not used. */ 399 @Override 400 public LIRGenerationResult newLIRGenerationResult(LIR lir, FrameMap frameMap, ResolvedJavaMethod method, Object stub) { 401 return new HSAILHotSpotLIRGenerationResult(lir, frameMap); 402 } 403 609 * loaded up front but will be loaded as needed. 610 */ 611 for (int i = 0; i < nonConstantParamCount; i++) { 612 asm.emitString("ld_kernarg_" + paramHsailSizes[i] + " " + HSAIL.mapRegister(cc.getArgument(i)) + ", [" + paramNames[i] + "];"); 613 } 614 615 /* 616 * Emit the workitemabsid instruction for loading the hidden gid parameter. This is assigned 617 * the register as if it were the last of the nonConstant parameters. 
618 */ 619 String workItemReg = "$s" + Integer.toString(asRegister(cc.getArgument(nonConstantParamCount)).encoding()); 620 asm.emitString("workitemabsid_u32 " + workItemReg + ", 0;"); 621 622 final String deoptInProgressLabel = "@LHandleDeoptInProgress"; 623 624 if (useHSAILDeoptimization) { 625 // Aliases for d16 626 RegisterValue d16_deoptInfo = HSAIL.d16.asValue(wordLIRKind); 627 628 // Aliases for d17 629 RegisterValue d17_tlabIndex = HSAIL.d17.asValue(wordLIRKind); 630 RegisterValue d17_safepointFlagAddrIndex = d17_tlabIndex; 631 632 // Aliases for s34 633 RegisterValue s34_deoptOccurred = HSAIL.s34.asValue(LIRKind.value(Kind.Int)); 634 RegisterValue s34_tlabIndex = s34_deoptOccurred; 635 636 asm.emitLoadKernelArg(d16_deoptInfo, asm.getDeoptInfoName(), "u64"); 637 asm.emitComment("// Check if a deopt or safepoint has occurred and abort if true before doing any work"); 638 639 if (useHSAILSafepoints) { 640 // Load address of _notice_safepoints field 641 asm.emitLoad(wordKind, d17_safepointFlagAddrIndex, new HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailNoticeSafepointsOffset).toAddress()); 642 // Load int value from that field 643 asm.emitLoadAcquire(s34_deoptOccurred, new HSAILAddressValue(wordLIRKind, d17_safepointFlagAddrIndex, 0).toAddress()); 644 asm.emitCompare(Kind.Int, s34_deoptOccurred, Constant.forInt(0), "ne", false, false); 645 asm.cbr(deoptInProgressLabel); 646 } 647 asm.emitLoadAcquire(s34_deoptOccurred, new HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailDeoptOccurredOffset).toAddress()); 648 asm.emitCompare(Kind.Int, s34_deoptOccurred, Constant.forInt(0), "ne", false, false); 649 asm.cbr(deoptInProgressLabel); 650 // load thread register if this kernel performs allocation 651 if (usesAllocation) { 652 RegisterValue threadReg = getProviders().getRegisters().getThreadRegister().asValue(wordLIRKind); 653 assert HsailKernelTlabs.getValue() > 0; 654 asm.emitLoad(wordKind, threadReg, new HSAILAddressValue(wordLIRKind, 
d16_deoptInfo, config.hsailCurTlabInfoOffset).toAddress()); 655 if (HsailKernelTlabs.getValue() != 1) { 656 asm.emitComment("// map workitem to a tlab"); 657 asm.emitString(String.format("rem_u32 $%s, %s, %d;", s34_tlabIndex.getRegister(), workItemReg, HsailKernelTlabs.getValue())); 658 asm.emitConvert(d17_tlabIndex, s34_tlabIndex, wordKind, Kind.Int); 659 asm.emit("mad", threadReg, d17_tlabIndex, Constant.forInt(8), threadReg); 660 } else { 661 // workitem is already mapped to solitary tlab 662 } 663 asm.emitComment("// $" + getProviders().getRegisters().getThreadRegister() + " will point to holder of tlab thread info for this workitem"); 664 } 665 } 666 667 /* 668 * Note the logic used for this spillseg size is to leave space and then go back and patch 669 * in the correct size once we have generated all the instructions. This should probably be 670 * done in a more robust way by implementing something like asm.insertString. 671 */ 672 int spillsegDeclarationPosition = asm.position() + 1; 673 String spillsegTemplate = "align 4 spill_u8 %spillseg[123456];"; 674 asm.emitString(spillsegTemplate); 675 // Emit object array load prologue here. 676 if (isObjectLambda) { 677 boolean useCompressedOops = config.useCompressedOops; 678 final int arrayElementsOffset = HotSpotGraalRuntime.getArrayBaseOffset(wordKind); 679 String iterationObjArgReg = HSAIL.mapRegister(cc.getArgument(nonConstantParamCount - 1)); 680 /* 681 * iterationObjArgReg will be the highest $d register in use (it is the last parameter) |