1 /*
   2  * Copyright (c) 2013, 2014, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  */
  23 package com.oracle.graal.hotspot.hsail;
  24 
  25 import static com.oracle.graal.api.code.CallingConvention.Type.*;
  26 import static com.oracle.graal.api.code.CodeUtil.*;
  27 import static com.oracle.graal.api.code.ValueUtil.*;
  28 import static com.oracle.graal.api.meta.LocationIdentity.*;
  29 import static com.oracle.graal.compiler.GraalCompiler.*;
  30 import static com.oracle.graal.hotspot.hsail.HSAILHotSpotBackend.Options.*;
  31 import static com.oracle.graal.hotspot.hsail.replacements.HSAILNewObjectSnippets.Options.*;
  32 
  33 import java.lang.reflect.*;
  34 import java.util.*;
  35 import java.util.Map.Entry;
  36 import java.util.function.*;
  37 import java.util.stream.*;
  38 
  39 import com.amd.okra.*;
  40 import com.oracle.graal.api.code.*;
  41 import com.oracle.graal.api.code.Assumptions.Assumption;
  42 import com.oracle.graal.api.code.CallingConvention.Type;
  43 import com.oracle.graal.api.code.CompilationResult.Call;
  44 import com.oracle.graal.api.code.CompilationResult.CodeAnnotation;
  45 import com.oracle.graal.api.code.CompilationResult.DataPatch;
  46 import com.oracle.graal.api.code.CompilationResult.ExceptionHandler;
  47 import com.oracle.graal.api.code.CompilationResult.Infopoint;
  48 import com.oracle.graal.api.code.CompilationResult.Mark;
  49 import com.oracle.graal.api.meta.*;
  50 import com.oracle.graal.asm.*;
  51 import com.oracle.graal.asm.hsail.*;
  52 import com.oracle.graal.compiler.common.*;
  53 import com.oracle.graal.compiler.common.cfg.*;
  54 import com.oracle.graal.compiler.common.type.*;
  55 import com.oracle.graal.debug.*;
  56 import com.oracle.graal.debug.Debug.Scope;
  57 import com.oracle.graal.gpu.*;
  58 import com.oracle.graal.hotspot.*;
  59 import com.oracle.graal.hotspot.bridge.CompilerToVM.CodeInstallResult;
  60 import com.oracle.graal.hotspot.meta.*;
  61 import com.oracle.graal.hotspot.nodes.*;
  62 import com.oracle.graal.hsail.*;
  63 import com.oracle.graal.java.*;
  64 import com.oracle.graal.lir.*;
  65 import com.oracle.graal.lir.asm.*;
  66 import com.oracle.graal.lir.gen.*;
  67 import com.oracle.graal.lir.hsail.*;
  68 import com.oracle.graal.lir.hsail.HSAILControlFlow.DeoptimizingOp;
  69 import com.oracle.graal.nodes.*;
  70 import com.oracle.graal.nodes.StructuredGraph.GuardsStage;
  71 import com.oracle.graal.nodes.extended.*;
  72 import com.oracle.graal.nodes.java.*;
  73 import com.oracle.graal.nodes.spi.*;
  74 import com.oracle.graal.nodes.virtual.*;
  75 import com.oracle.graal.options.*;
  76 import com.oracle.graal.phases.*;
  77 import com.oracle.graal.phases.tiers.*;
  78 import com.oracle.graal.virtual.nodes.*;
  79 
  80 /**
  81  * HSAIL specific backend.
  82  */
  83 public class HSAILHotSpotBackend extends HotSpotBackend {
  84 
  85     public static class Options {
  86 
  87         // @formatter:off
  88         @Option(help = "Number of TLABs used for HSAIL kernels which allocate")
  89         static public final OptionValue<Integer> HsailKernelTlabs = new OptionValue<>(4);
  90         // @formatter:on
  91     }
  92 
  93     private Map<String, String> paramTypeMap = new HashMap<>();
  94     private final boolean deviceInitialized;
  95     // TODO: get maximum Concurrency from okra
  96     private int maxDeoptIndex = 8 * 40 * 64;   // see gpu_hsail.hpp
  97 
  98     public HSAILHotSpotBackend(HotSpotGraalRuntime runtime, HotSpotProviders providers) {
  99         super(runtime, providers);
 100         paramTypeMap.put("HotSpotResolvedPrimitiveType<int>", "s32");
 101         paramTypeMap.put("HotSpotResolvedPrimitiveType<float>", "f32");
 102         paramTypeMap.put("HotSpotResolvedPrimitiveType<double>", "f64");
 103         paramTypeMap.put("HotSpotResolvedPrimitiveType<long>", "s64");
 104 
 105         /*
 106          * The order of the conjunction below is important: the OkraUtil call may provision the
 107          * native library required by the initialize() call
 108          */
 109         deviceInitialized = OkraUtil.okraLibExists() && initialize();
 110     }
 111 
 112     @Override
 113     public boolean shouldAllocateRegisters() {
 114         return true;
 115     }
 116 
 117     /**
 118      * Initializes the GPU device.
 119      *
 120      * @return whether or not initialization was successful
 121      */
 122     private static native boolean initialize();
 123 
 124     /**
 125      * Determines if the GPU device (or simulator) is available and initialized.
 126      */
 127     public boolean isDeviceInitialized() {
 128         return deviceInitialized;
 129     }
 130 
 131     /**
 132      * Completes the initialization of the HSAIL backend. This includes initializing the providers
 133      * and registering any method substitutions specified by the HSAIL backend.
 134      */
 135     @Override
 136     public void completeInitialization() {
 137         final HotSpotProviders providers = getProviders();
 138         HotSpotVMConfig config = getRuntime().getConfig();
 139         // Initialize the lowering provider.
 140         final HotSpotLoweringProvider lowerer = (HotSpotLoweringProvider) providers.getLowerer();
 141         lowerer.initialize(providers, config);
 142 
 143         // Register the replacements used by the HSAIL backend.
 144         HSAILHotSpotReplacementsImpl replacements = (HSAILHotSpotReplacementsImpl) providers.getReplacements();
 145         replacements.completeInitialization();
 146     }
 147 
 148     /**
 149      * Compiles and installs a given method to a GPU binary.
 150      */
 151     public HotSpotNmethod compileAndInstallKernel(Method method) {
 152         ResolvedJavaMethod javaMethod = getProviders().getMetaAccess().lookupJavaMethod(method);
 153         HotSpotNmethod nm = installKernel(javaMethod, compileKernel(javaMethod, true));
 154         try (Scope s = Debug.scope("HostCodeGen")) {
 155             if (Debug.isLogEnabled()) {
 156                 DisassemblerProvider dis = getRuntime().getHostBackend().getDisassembler();
 157                 if (dis != null) {
 158                     String disasm = dis.disassemble(nm);
 159                     Debug.log("host code generated for %s%n%s", javaMethod, disasm);
 160                 } else {
 161                     Debug.log("host code disassembler is null");
 162                 }
 163             }
 164         } catch (Throwable e) {
 165             throw Debug.handle(e);
 166         }
 167         return nm;
 168     }
 169 
 170     /**
 171      * Compiles a given method to HSAIL code.
 172      *
 173      * @param makeBinary specifies whether a GPU binary should also be generated for the HSAIL code.
 174      *            If true, the returned value is guaranteed to have a non-zero
 175      *            {@linkplain ExternalCompilationResult#getEntryPoint() entry point}.
 176      * @return the HSAIL code compiled from {@code method}'s bytecode
 177      */
 178     public ExternalCompilationResult compileKernel(ResolvedJavaMethod method, boolean makeBinary) {
 179         StructuredGraph graph = new StructuredGraph(method);
 180         HotSpotProviders providers = getProviders();
 181         MetaAccessProvider metaAccess = getProviders().getMetaAccess();
 182 
 183         // changed this from default to help us generate deopts when needed
 184         OptimisticOptimizations optimisticOpts = OptimisticOptimizations.ALL;
 185         optimisticOpts.remove(OptimisticOptimizations.Optimization.UseExceptionProbabilityForOperations);
 186         new GraphBuilderPhase.Instance(metaAccess, GraphBuilderConfiguration.getSnippetDefault(), optimisticOpts).apply(graph);
 187         PhaseSuite<HighTierContext> graphBuilderSuite = providers.getSuites().getDefaultGraphBuilderSuite();
 188         CallingConvention cc = getCallingConvention(providers.getCodeCache(), Type.JavaCallee, graph.method(), false);
 189 
 190         // append special HSAILNonNullParametersPhase
 191         int numArgs = cc.getArguments().length;
 192         graphBuilderSuite.appendPhase(new HSAILNonNullParametersPhase(numArgs));
 193 
 194         Suites suites = providers.getSuites().getDefaultSuites();
 195         ExternalCompilationResult hsailCode = compileGraph(graph, null, cc, method, providers, this, this.getTarget(), null, graphBuilderSuite, optimisticOpts, getProfilingInfo(graph), null, suites,
 196                         new ExternalCompilationResult(), CompilationResultBuilderFactory.Default);
 197 
 198         // this code added to dump infopoints
 199         try (Scope s = Debug.scope("CodeGen")) {
 200             if (Debug.isLogEnabled()) {
 201                 // show infopoints
 202                 List<Infopoint> infoList = hsailCode.getInfopoints();
 203                 Debug.log("%d HSAIL infopoints", infoList.size());
 204                 for (Infopoint info : infoList) {
 205                     Debug.log(info.toString());
 206                     Debug.log(info.debugInfo.frame().toString());
 207                 }
 208             }
 209         } catch (Throwable e) {
 210             throw Debug.handle(e);
 211         }
 212 
 213         if (makeBinary) {
 214             if (!deviceInitialized) {
 215                 throw new GraalInternalError("Cannot generate GPU kernel if device is not initialized");
 216             }
 217             try (Scope ds = Debug.scope("GeneratingKernelBinary")) {
 218                 long kernel = generateKernel(hsailCode.getTargetCode(), method.getName());
 219                 if (kernel == 0) {
 220                     throw new GraalInternalError("Failed to compile HSAIL kernel");
 221                 }
 222                 hsailCode.setEntryPoint(kernel);
 223             } catch (Throwable e) {
 224                 throw Debug.handle(e);
 225             }
 226         }
 227         return hsailCode;
 228     }
 229 
 230     private static class HSAILNonNullParametersPhase extends Phase {
 231         // we use this to limit the stamping to exclude the final argument in an obj stream method
 232         private int numArgs;
 233 
 234         public HSAILNonNullParametersPhase(int numArgs) {
 235             this.numArgs = numArgs;
 236         }
 237 
 238         @Override
 239         protected void run(StructuredGraph graph) {
 240             int argCount = 0;
 241             Stamp nonNull = StampFactory.objectNonNull();
 242             for (ParameterNode param : graph.getNodes(ParameterNode.class)) {
 243                 argCount++;
 244                 if (argCount < numArgs && param.stamp() instanceof ObjectStamp) {
 245                     ObjectStamp paramStamp = (ObjectStamp) param.stamp();
 246                     param.setStamp(paramStamp.join(nonNull));
 247                 }
 248             }
 249         }
 250     }
 251 
 252     /**
 253      * Generates a GPU binary from HSAIL code.
 254      */
 255     static native long generateKernel(byte[] hsailCode, String name);
 256 
 257     /**
 258      * Installs the {@linkplain ExternalCompilationResult#getEntryPoint() GPU binary} associated
 259      * with some given HSAIL code in the code cache and returns a {@link HotSpotNmethod} handle to
 260      * the installed code.
 261      *
 262      * @param hsailCode HSAIL compilation result for which a GPU binary has been generated
 263      * @return a handle to the binary as installed in the HotSpot code cache
 264      */
 265     public final HotSpotNmethod installKernel(ResolvedJavaMethod method, ExternalCompilationResult hsailCode) {
 266         assert hsailCode.getEntryPoint() != 0L;
 267         // Code here based on HotSpotCodeCacheProvider.addExternalMethod().
 268         HotSpotResolvedJavaMethod javaMethod = (HotSpotResolvedJavaMethod) method;
 269         if (hsailCode.getId() == -1) {
 270             hsailCode.setId(javaMethod.allocateCompileId(hsailCode.getEntryBCI()));
 271         }
 272         CompilationResult compilationResult = hsailCode;
 273         StructuredGraph hostGraph = hsailCode.getHostGraph();
 274         if (hostGraph != null) {
 275             // TODO get rid of the unverified entry point in the host code
 276             try (Scope ds = Debug.scope("GeneratingHostGraph", new DebugDumpScope("HostGraph"))) {
 277                 HotSpotBackend hostBackend = getRuntime().getHostBackend();
 278                 JavaType[] parameterTypes = new JavaType[hostGraph.getNodes(ParameterNode.class).count()];
 279                 Debug.log("Param count: %d", parameterTypes.length);
 280                 for (int i = 0; i < parameterTypes.length; i++) {
 281                     ParameterNode parameter = hostGraph.getParameter(i);
 282                     Debug.log("Param [%d]=%s", i, parameter);
 283                     parameterTypes[i] = parameter.stamp().javaType(hostBackend.getProviders().getMetaAccess());
 284                     Debug.log(" %s", parameterTypes[i]);
 285                 }
 286                 CallingConvention cc = hostBackend.getProviders().getCodeCache().getRegisterConfig().getCallingConvention(Type.JavaCallee, method.getSignature().getReturnType(null), parameterTypes,
 287                                 hostBackend.getTarget(), false);
 288                 CompilationResult hostCode = compileGraph(hostGraph, null, cc, method, hostBackend.getProviders(), hostBackend, this.getTarget(), null,
 289                                 hostBackend.getProviders().getSuites().getDefaultGraphBuilderSuite(), OptimisticOptimizations.NONE, null, null,
 290                                 hostBackend.getProviders().getSuites().getDefaultSuites(), new CompilationResult(), CompilationResultBuilderFactory.Default);
 291                 compilationResult = merge(hostCode, hsailCode);
 292             } catch (Throwable e) {
 293                 throw Debug.handle(e);
 294             }
 295         }
 296 
 297         HSAILHotSpotNmethod code = new HSAILHotSpotNmethod(javaMethod, hsailCode.getName(), false, true);
 298         code.setOopMapArray(hsailCode.getOopMapArray());
 299         code.setUsesAllocationFlag(hsailCode.getUsesAllocationFlag());
 300         HotSpotCompiledNmethod compiled = new HotSpotCompiledNmethod(getTarget(), javaMethod, compilationResult);
 301         CodeInstallResult result = getRuntime().getCompilerToVM().installCode(compiled, code, null);
 302         if (result != CodeInstallResult.OK) {
 303             return null;
 304         }
 305         return code;
 306     }
 307 
 308     private static ExternalCompilationResult merge(CompilationResult hostCode, ExternalCompilationResult hsailCode) {
 309         ExternalCompilationResult result = new ExternalCompilationResult();
 310 
 311         // from hsail code
 312         result.setEntryPoint(hsailCode.getEntryPoint());
 313         result.setId(hsailCode.getId());
 314         result.setEntryBCI(hsailCode.getEntryBCI());
 315         assert hsailCode.getMarks().isEmpty();
 316         assert hsailCode.getExceptionHandlers().isEmpty();
 317         assert hsailCode.getDataReferences().isEmpty();
 318 
 319         // from host code
 320         result.setTotalFrameSize(hostCode.getTotalFrameSize());
 321         result.setCustomStackAreaOffset(hostCode.getCustomStackAreaOffset());
 322         result.setTargetCode(hostCode.getTargetCode(), hostCode.getTargetCodeSize());
 323         for (CodeAnnotation annotation : hostCode.getAnnotations()) {
 324             result.addAnnotation(annotation);
 325         }
 326         for (Mark mark : hostCode.getMarks()) {
 327             result.recordMark(mark.pcOffset, mark.id);
 328         }
 329         for (ExceptionHandler handler : hostCode.getExceptionHandlers()) {
 330             result.recordExceptionHandler(handler.pcOffset, handler.handlerPos);
 331         }
 332         for (DataPatch patch : hostCode.getDataReferences()) {
 333             if (patch.data != null) {
 334                 if (patch.inline) {
 335                     result.recordInlineData(patch.pcOffset, patch.data);
 336                 } else {
 337                     result.recordDataReference(patch.pcOffset, patch.data);
 338                 }
 339             }
 340         }
 341         for (Infopoint infopoint : hostCode.getInfopoints()) {
 342             if (infopoint instanceof Call) {
 343                 Call call = (Call) infopoint;
 344                 result.recordCall(call.pcOffset, call.size, call.target, call.debugInfo, call.direct);
 345             } else {
 346                 result.recordInfopoint(infopoint.pcOffset, infopoint.debugInfo, infopoint.reason);
 347             }
 348         }
 349 
 350         // merged
 351         Assumptions mergedAssumptions = new Assumptions(true);
 352         if (hostCode.getAssumptions() != null) {
 353             for (Assumption assumption : hostCode.getAssumptions().getAssumptions()) {
 354                 if (assumption != null) {
 355                     mergedAssumptions.record(assumption);
 356                 }
 357             }
 358         }
 359         if (hsailCode.getAssumptions() != null) {
 360             for (Assumption assumption : hsailCode.getAssumptions().getAssumptions()) {
 361                 if (assumption != null) {
 362                     mergedAssumptions.record(assumption);
 363                 }
 364             }
 365         }
 366         if (!mergedAssumptions.isEmpty()) {
 367             result.setAssumptions(mergedAssumptions);
 368         }
 369         return result;
 370     }
 371 
 372     public boolean executeKernel(HotSpotInstalledCode kernel, int jobSize, Object[] args) throws InvalidInstalledCodeException {
 373         if (!deviceInitialized) {
 374             throw new GraalInternalError("Cannot execute GPU kernel if device is not initialized");
 375         }
 376         int[] oopMapArray = ((HSAILHotSpotNmethod) kernel).getOopMapArray();
 377 
 378         // Pass HsailKernelTlabs number if this kernel uses allocation, otherwise 0
 379         int numTlabs = ((HSAILHotSpotNmethod) kernel).getUsesAllocationFlag() ? HsailKernelTlabs.getValue() : 0;
 380         return executeKernel0(kernel, jobSize, args, numTlabs, HsailAllocBytesPerWorkitem.getValue(), oopMapArray);
 381     }
 382 
 383     private static native boolean executeKernel0(HotSpotInstalledCode kernel, int jobSize, Object[] args, int numTlabs, int allocBytesPerWorkitem, int[] oopMapArray)
 384                     throws InvalidInstalledCodeException;
 385 
 386     /**
 387      * Use the HSAIL register set when the compilation target is HSAIL.
 388      */
 389     @Override
 390     public FrameMap newFrameMap(RegisterConfig registerConfig) {
 391         return new HSAILFrameMap(getCodeCache(), registerConfig);
 392     }
 393 
 394     @Override
 395     public LIRGeneratorTool newLIRGenerator(CallingConvention cc, LIRGenerationResult lirGenRes) {
 396         return new HSAILHotSpotLIRGenerator(getProviders(), getRuntime().getConfig(), cc, lirGenRes);
 397     }
 398 
 399     @Override
 400     public LIRGenerationResult newLIRGenerationResult(LIR lir, FrameMap frameMap, ResolvedJavaMethod method, Object stub) {
 401         return new HSAILHotSpotLIRGenerationResult(lir, frameMap);
 402     }
 403 
 404     @Override
 405     public NodeLIRBuilderTool newNodeLIRBuilder(StructuredGraph graph, LIRGeneratorTool lirGen) {
 406         return new HSAILHotSpotNodeLIRBuilder(graph, lirGen);
 407     }
 408 
 409     class HotSpotFrameContext implements FrameContext {
 410 
 411         public boolean hasFrame() {
 412             return true;
 413         }
 414 
 415         @Override
 416         public void enter(CompilationResultBuilder crb) {
 417             Debug.log("Nothing to do here");
 418         }
 419 
 420         @Override
 421         public void leave(CompilationResultBuilder crb) {
 422             Debug.log("Nothing to do here");
 423         }
 424     }
 425 
 426     /**
 427      * a class to allow us to save lirGen.
 428      */
 429     static class HSAILCompilationResultBuilder extends CompilationResultBuilder {
 430         public HSAILHotSpotLIRGenerationResult lirGenRes;
 431 
 432         public HSAILCompilationResultBuilder(CodeCacheProvider codeCache, ForeignCallsProvider foreignCalls, FrameMap frameMap, Assembler asm, FrameContext frameContext,
 433                         CompilationResult compilationResult, HSAILHotSpotLIRGenerationResult lirGenRes) {
 434             super(codeCache, foreignCalls, frameMap, asm, frameContext, compilationResult);
 435             this.lirGenRes = lirGenRes;
 436         }
 437     }
 438 
 439     static class HSAILHotSpotNmethod extends HotSpotNmethod {
 440         private int[] oopMapArray;
 441         private boolean usesAllocation;
 442 
 443         HSAILHotSpotNmethod(HotSpotResolvedJavaMethod method, String name, boolean isDefault, boolean isExternal) {
 444             super(method, name, isDefault, isExternal);
 445         }
 446 
 447         void setOopMapArray(int[] array) {
 448             oopMapArray = array;
 449         }
 450 
 451         int[] getOopMapArray() {
 452             return oopMapArray;
 453         }
 454 
 455         public void setUsesAllocationFlag(boolean val) {
 456             usesAllocation = val;
 457         }
 458 
 459         public boolean getUsesAllocationFlag() {
 460             return usesAllocation;
 461         }
 462     }
 463 
 464     @Override
 465     protected Assembler createAssembler(FrameMap frameMap) {
 466         return new HSAILHotSpotAssembler(getTarget());
 467     }
 468 
 469     @Override
 470     public CompilationResultBuilder newCompilationResultBuilder(LIRGenerationResult lirGenRes, CompilationResult compilationResult, CompilationResultBuilderFactory factory) {
 471         FrameMap frameMap = lirGenRes.getFrameMap();
 472         Assembler masm = createAssembler(frameMap);
 473         HotSpotFrameContext frameContext = new HotSpotFrameContext();
 474         // save lirGen for later use by setHostGraph
 475         CompilationResultBuilder crb = new HSAILCompilationResultBuilder(getCodeCache(), getForeignCalls(), frameMap, masm, frameContext, compilationResult,
 476                         (HSAILHotSpotLIRGenerationResult) lirGenRes);
 477         crb.setTotalFrameSize(frameMap.totalFrameSize());
 478         return crb;
 479     }
 480 
 481     @Override
 482     public void emitCode(CompilationResultBuilder crb, LIR lir, ResolvedJavaMethod method) {
 483         assert method != null : lir + " is not associated with a method";
 484         Kind wordKind = getProviders().getCodeCache().getTarget().wordKind;
 485         LIRKind wordLIRKind = LIRKind.value(wordKind);
 486 
 487         HotSpotVMConfig config = getRuntime().getConfig();
 488         boolean useHSAILDeoptimization = config.useHSAILDeoptimization;
 489         boolean useHSAILSafepoints = config.useHSAILSafepoints;
 490 
 491         if ((useHSAILSafepoints == true) && (useHSAILDeoptimization == false)) {
 492             Debug.log("+UseHSAILSafepoints requires +UseHSAILDeoptimization");
 493         }
 494 
 495         /*
 496          * See what graph nodes we have to see if we are using the thread register. If not, we don't
 497          * have to emit the code that sets it up. Maybe there is a better way to do this?
 498          */
 499         boolean usesAllocation = false;
 500         search: for (AbstractBlock<?> b : lir.linearScanOrder()) {
 501             for (LIRInstruction op : lir.getLIRforBlock(b)) {
 502                 if ((op instanceof HSAILMove.LoadOp) && ((HSAILMove.LoadOp) op).usesThreadRegister()) {
 503                     usesAllocation = true;
 504                     assert useHSAILDeoptimization : "cannot use thread register if HSAIL deopt support is disabled";
 505                     break search;
 506                 }
 507             }
 508         }
 509         // save usesAllocation flag in ExternalCompilationResult
 510         ((ExternalCompilationResult) crb.compilationResult).setUsesAllocationFlag(usesAllocation);
 511 
 512         // Emit the prologue.
 513         HSAILAssembler asm = (HSAILAssembler) crb.asm;
 514         asm.emitString0("version 0:95: $full : $large;\n");
 515 
 516         Signature signature = method.getSignature();
 517         int sigParamCount = signature.getParameterCount(false);
 518         // We're subtracting 1 because we're not making the final gid as a parameter.
 519 
 520         int nonConstantParamCount = sigParamCount - 1;
 521         boolean isStatic = (method.isStatic());
 522         // Determine if this is an object lambda.
 523         boolean isObjectLambda = true;
 524 
 525         if (signature.getParameterType(nonConstantParamCount, null).getKind() == Kind.Int) {
 526             isObjectLambda = false;
 527         } else {
 528             // Add space for gid int reg.
 529             nonConstantParamCount++;
 530         }
 531 
 532         // If this is an instance method, include the "this" parameter
 533         if (!isStatic) {
 534             nonConstantParamCount++;
 535         }
 536         // Add in any "constant" parameters (currently none).
 537         int totalParamCount = nonConstantParamCount;
 538         JavaType[] paramtypes = new JavaType[totalParamCount];
 539         String[] paramNames = new String[totalParamCount];
 540         int pidx = 0;
 541         MetaAccessProvider metaAccess = getProviders().getMetaAccess();
 542         for (int i = 0; i < totalParamCount; i++) {
 543             if (i == 0 && !isStatic) {
 544                 paramtypes[i] = metaAccess.lookupJavaType(Object.class);
 545                 paramNames[i] = "%_this";
 546             } else if (i < nonConstantParamCount) {
 547                 if (isObjectLambda && (i == (nonConstantParamCount))) {
 548                     // Set up the gid register mapping.
 549                     paramtypes[i] = metaAccess.lookupJavaType(int.class);
 550                     paramNames[i] = "%_gid";
 551                 } else {
 552                     paramtypes[i] = signature.getParameterType(pidx++, null);
 553                     paramNames[i] = "%_arg" + i;
 554                 }
 555             }
 556         }
 557 
 558         asm.emitString0("// " + (isStatic ? "static" : "instance") + " method " + method + "\n");
 559         asm.emitString0("kernel &run ( \n");
 560 
 561         FrameMap frameMap = crb.frameMap;
 562         RegisterConfig regConfig = frameMap.registerConfig;
 563         // Build list of param types which does include the gid (for cc register mapping query).
 564         JavaType[] ccParamTypes = new JavaType[nonConstantParamCount + 1];
 565         // Include the gid.
 566         System.arraycopy(paramtypes, 0, ccParamTypes, 0, nonConstantParamCount);
 567 
 568         /*
 569          * Last entry is always int (its register gets used in the workitemabsid instruction). This
 570          * is true even for object stream lambdas.
 571          */
 572         if (sigParamCount > 0) {
 573             ccParamTypes[ccParamTypes.length - 1] = metaAccess.lookupJavaType(int.class);
 574         }
 575         CallingConvention cc = regConfig.getCallingConvention(JavaCallee, null, ccParamTypes, getTarget(), false);
 576 
 577         /**
 578          * Compute the hsail size mappings up to but not including the last non-constant parameter
 579          * (which is the gid).
 580          *
 581          */
 582         String[] paramHsailSizes = new String[totalParamCount];
 583         for (int i = 0; i < totalParamCount; i++) {
 584             String paramtypeStr = paramtypes[i].toString();
 585             String sizeStr = paramTypeMap.get(paramtypeStr);
 586             // Catch all for any unmapped paramtype that is u64 (address of an object).
 587             paramHsailSizes[i] = (sizeStr != null ? sizeStr : "u64");
 588         }
 589         // Emit the kernel function parameters.
 590         for (int i = 0; i < totalParamCount; i++) {
 591             String str = "align 8 kernarg_" + paramHsailSizes[i] + " " + paramNames[i];
 592 
 593             if (useHSAILDeoptimization || (i != totalParamCount - 1)) {
 594                 str += ",";
 595             }
 596             asm.emitString(str);
 597         }
 598 
 599         if (useHSAILDeoptimization) {
 600             // add in the deoptInfo parameter
 601             asm.emitString("kernarg_u64 " + asm.getDeoptInfoName());
 602         }
 603 
 604         asm.emitString(") {");
 605 
 606         /*
 607          * End of parameters start of prolog code. Emit the load instructions for loading of the
 608          * kernel non-constant parameters into registers. The constant class parameters will not be
 609          * loaded up front but will be loaded as needed.
 610          */
 611         for (int i = 0; i < nonConstantParamCount; i++) {
 612             asm.emitString("ld_kernarg_" + paramHsailSizes[i] + "  " + HSAIL.mapRegister(cc.getArgument(i)) + ", [" + paramNames[i] + "];");
 613         }
 614 
 615         /*
 616          * Emit the workitemaid instruction for loading the hidden gid parameter. This is assigned
 617          * the register as if it were the last of the nonConstant parameters.
 618          */
 619         String workItemReg = "$s" + Integer.toString(asRegister(cc.getArgument(nonConstantParamCount)).encoding());
 620         asm.emitString("workitemabsid_u32 " + workItemReg + ", 0;");
 621 
 622         final String deoptInProgressLabel = "@LHandleDeoptInProgress";
 623 
 624         if (useHSAILDeoptimization) {
 625             // Aliases for d16
 626             RegisterValue d16_deoptInfo = HSAIL.d16.asValue(wordLIRKind);
 627 
 628             // Aliases for d17
 629             RegisterValue d17_tlabIndex = HSAIL.d17.asValue(wordLIRKind);
 630             RegisterValue d17_safepointFlagAddrIndex = d17_tlabIndex;
 631 
 632             // Aliases for s34
 633             RegisterValue s34_deoptOccurred = HSAIL.s34.asValue(LIRKind.value(Kind.Int));
 634             RegisterValue s34_tlabIndex = s34_deoptOccurred;
 635 
 636             asm.emitLoadKernelArg(d16_deoptInfo, asm.getDeoptInfoName(), "u64");
 637             asm.emitComment("// Check if a deopt or safepoint has occurred and abort if true before doing any work");
 638 
 639             if (useHSAILSafepoints) {
 640                 // Load address of _notice_safepoints field
 641                 asm.emitLoad(wordKind, d17_safepointFlagAddrIndex, new HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailNoticeSafepointsOffset).toAddress());
 642                 // Load int value from that field
 643                 asm.emitLoadAcquire(s34_deoptOccurred, new HSAILAddressValue(wordLIRKind, d17_safepointFlagAddrIndex, 0).toAddress());
 644                 asm.emitCompare(Kind.Int, s34_deoptOccurred, Constant.forInt(0), "ne", false, false);
 645                 asm.cbr(deoptInProgressLabel);
 646             }
 647             asm.emitLoadAcquire(s34_deoptOccurred, new HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailDeoptOccurredOffset).toAddress());
 648             asm.emitCompare(Kind.Int, s34_deoptOccurred, Constant.forInt(0), "ne", false, false);
 649             asm.cbr(deoptInProgressLabel);
 650             // load thread register if this kernel performs allocation
 651             if (usesAllocation) {
 652                 RegisterValue threadReg = getProviders().getRegisters().getThreadRegister().asValue(wordLIRKind);
 653                 assert HsailKernelTlabs.getValue() > 0;
 654                 asm.emitLoad(wordKind, threadReg, new HSAILAddressValue(wordLIRKind, d16_deoptInfo, config.hsailCurTlabInfoOffset).toAddress());
 655                 if (HsailKernelTlabs.getValue() != 1) {
 656                     asm.emitComment("// map workitem to a tlab");
 657                     asm.emitString(String.format("rem_u32  $%s, %s, %d;", s34_tlabIndex.getRegister(), workItemReg, HsailKernelTlabs.getValue()));
 658                     asm.emitConvert(d17_tlabIndex, s34_tlabIndex, wordKind, Kind.Int);
 659                     asm.emit("mad", threadReg, d17_tlabIndex, Constant.forInt(8), threadReg);
 660                 } else {
 661                     // workitem is already mapped to solitary tlab
 662                 }
 663                 asm.emitComment("// $" + getProviders().getRegisters().getThreadRegister() + " will point to holder of tlab thread info for this workitem");
 664             }
 665         }
 666 
 667         /*
 668          * Note the logic used for this spillseg size is to leave space and then go back and patch
 669          * in the correct size once we have generated all the instructions. This should probably be
 670          * done in a more robust way by implementing something like asm.insertString.
 671          */
 672         int spillsegDeclarationPosition = asm.position() + 1;
 673         String spillsegTemplate = "align 4 spill_u8 %spillseg[123456];";
 674         asm.emitString(spillsegTemplate);
 675         // Emit object array load prologue here.
 676         if (isObjectLambda) {
 677             boolean useCompressedOops = config.useCompressedOops;
 678             final int arrayElementsOffset = HotSpotGraalRuntime.getArrayBaseOffset(wordKind);
 679             String iterationObjArgReg = HSAIL.mapRegister(cc.getArgument(nonConstantParamCount - 1));
 680             /*
 681              * iterationObjArgReg will be the highest $d register in use (it is the last parameter)
 682              * so tempReg can be the next higher $d register. As of 1.0 spec, we cannot use
 683              * ld_global_u32 $dxx, [addr]; so we need a temporary $s register. We can use
 684              * workItemReg+1;
 685              */
 686             String tmpReg = "$d" + (asRegister(cc.getArgument(nonConstantParamCount - 1)).encoding() + 1);
 687             // Convert gid to long.
 688             asm.emitString("cvt_u64_s32 " + tmpReg + ", " + workItemReg + "; // Convert gid to long");
 689             // Adjust index for sizeof ref. Where to pull this size from?
 690             asm.emitString("mul_u64 " + tmpReg + ", " + tmpReg + ", " + (useCompressedOops ? 4 : 8) + "; // Adjust index for sizeof ref");
 691             // Adjust for actual data start.
 692             asm.emitString("add_u64 " + tmpReg + ", " + tmpReg + ", " + arrayElementsOffset + "; // Adjust for actual elements data start");
 693             // Add to array ref ptr.
 694             asm.emitString("add_u64 " + tmpReg + ", " + tmpReg + ", " + iterationObjArgReg + "; // Add to array ref ptr");
 695             // Load the object into the parameter reg.
 696             if (useCompressedOops) {
 697                 int workItemRegEncoding = asRegister(cc.getArgument(nonConstantParamCount)).encoding();
 698                 String tmpReg32 = "$s" + Integer.toString(workItemRegEncoding + 1);
 699 
 700                 // Load u32 into the temporary $s reg since it will become an object address
 701 
 702                 asm.emitString("ld_global_u32 " + tmpReg32 + ", " + "[" + tmpReg + "]" + "; // Load compressed ptr from array");
 703                 asm.emitString("cvt_u64_u32 " + tmpReg + ", " + tmpReg32 + ";      // cvt to 64 bits");
 704 
 705                 long narrowOopBase = config.narrowOopBase;
 706                 long narrowOopShift = config.narrowOopShift;
 707 
 708                 if (narrowOopBase == 0 && narrowOopShift == 0) {
 709                     // No more calculation to do, mov to target register
 710                     asm.emitString("mov_b64 " + iterationObjArgReg + ", " + tmpReg + "; // no shift or base addition");
 711                 } else {
 712                     if (narrowOopBase == 0) {
 713                         asm.emitString("shl_u64 " + iterationObjArgReg + ", " + tmpReg + ", " + narrowOopShift + "; // do narrowOopShift");
 714                     } else if (narrowOopShift == 0) {
 715                         // not sure if we ever get add with 0 shift but just in case
 716                         asm.emitString("cmp_eq_b1_u64  $c0, " + tmpReg + ", 0x0; // avoid add if compressed is null");
 717                         asm.emitString("add_u64 " + iterationObjArgReg + ", " + tmpReg + ", " + narrowOopBase + "; // add narrowOopBase");
 718                         asm.emitString("cmov_b64 " + iterationObjArgReg + ", $c0, 0x0, " + iterationObjArgReg + "; // avoid add if compressed is null");
 719                     } else {
 720                         asm.emitString("cmp_eq_b1_u64  $c0, " + tmpReg + ", 0x0; // avoid shift-add if compressed is null");
 721                         asm.emitString("mad_u64 " + iterationObjArgReg + ", " + tmpReg + ", " + (1 << narrowOopShift) + ", " + narrowOopBase + "; // shift and add narrowOopBase");
 722                         asm.emitString("cmov_b64 " + iterationObjArgReg + ", $c0, 0x0, " + iterationObjArgReg + "; // avoid shift-add if compressed is null");
 723                     }
 724                 }
 725 
 726             } else {
 727                 asm.emitString("ld_global_u64 " + iterationObjArgReg + ", " + "[" + tmpReg + "]" + "; // Load from array element into parameter reg");
 728             }
 729         }
 730         // Prologue done, Emit code for the LIR.
 731         crb.emit(lir);
 732         // Now that code is emitted go back and figure out what the upper Bound stack size was.
 733         long maxStackSize = ((HSAILAssembler) crb.asm).upperBoundStackSize();
 734         String spillsegStringFinal;
 735         if (maxStackSize == 0) {
 736             // If no spilling, get rid of spillseg declaration.
 737             char[] array = new char[spillsegTemplate.length()];
 738             Arrays.fill(array, ' ');
 739             spillsegStringFinal = new String(array);
 740         } else {
 741             spillsegStringFinal = spillsegTemplate.replace("123456", String.format("%6d", maxStackSize));
 742         }
 743         asm.emitString(spillsegStringFinal, spillsegDeclarationPosition);
 744         // Emit the epilogue.
 745 
 746         HSAILHotSpotLIRGenerationResult lirGenRes = ((HSAILCompilationResultBuilder) crb).lirGenRes;
 747 
 748         int numSRegs = 0;
 749         int numDRegs = 0;
 750         int numStackSlotBytes = 0;
 751         if (useHSAILDeoptimization) {
 752             /*
 753              * Get the union of registers and stack slots needed to be saved at the infopoints.
 754              * While doing this compute the highest register in each category.
 755              */
 756             HSAILHotSpotRegisterConfig hsailRegConfig = (HSAILHotSpotRegisterConfig) regConfig;
 757             Set<Register> infoUsedRegs = new TreeSet<>();
 758             Set<StackSlot> infoUsedStackSlots = new HashSet<>();
 759             List<Infopoint> infoList = crb.compilationResult.getInfopoints();
 760             Queue<Value[]> workList = new LinkedList<>();
 761             for (Infopoint info : infoList) {
 762                 BytecodeFrame frame = info.debugInfo.frame();
 763                 while (frame != null) {
 764                     workList.add(frame.values);
 765                     frame = frame.caller();
 766                 }
 767             }
 768             while (!workList.isEmpty()) {
 769                 Value[] values = workList.poll();
 770                 for (Value val : values) {
 771                     if (isLegal(val)) {
 772                         if (isRegister(val)) {
 773                             Register reg = asRegister(val);
 774                             infoUsedRegs.add(reg);
 775                             if (hsailRegConfig.isAllocatableSReg(reg)) {
 776                                 numSRegs = Math.max(numSRegs, reg.encoding + 1);
 777                             } else if (hsailRegConfig.isAllocatableDReg(reg)) {
 778                                 numDRegs = Math.max(numDRegs, reg.encoding + 1);
 779                             }
 780                         } else if (isStackSlot(val)) {
 781                             StackSlot slot = asStackSlot(val);
 782                             Kind slotKind = slot.getKind();
 783                             int slotSizeBytes = (slotKind.isObject() ? 8 : slotKind.getByteCount());
 784                             int slotOffsetMax = HSAIL.getStackOffsetStart(slot, slotSizeBytes * 8) + slotSizeBytes;
 785                             numStackSlotBytes = Math.max(numStackSlotBytes, slotOffsetMax);
 786                             infoUsedStackSlots.add(slot);
 787                         } else if (isVirtualObject(val)) {
 788                             workList.add(((VirtualObject) val).getValues());
 789                         } else {
 790                             assert isConstant(val) : "Unsupported value: " + val;
 791                         }
 792                     }
 793                 }
 794             }
 795 
 796             // round up numSRegs to even number so dregs start on aligned boundary
 797             numSRegs += (numSRegs & 1);
 798 
 799             // numStackSlots is the number of 8-byte locations used for stack variables
 800             int numStackSlots = (numStackSlotBytes + 7) / 8;
 801 
 802             final int offsetToDeoptSaveStates = config.hsailDeoptimizationInfoHeaderSize;
 803             final int bytesPerSaveArea = 4 * numSRegs + 8 * numDRegs + 8 * numStackSlots;
 804             final int sizeofKernelDeopt = config.hsailKernelDeoptimizationHeaderSize + config.hsailFrameHeaderSize + bytesPerSaveArea;
 805             final int offsetToNeverRanArray = config.hsailNeverRanArrayOffset;
 806             final int offsetToDeoptNextIndex = config.hsailDeoptNextIndexOffset;
 807             final int offsetToDeoptimizationWorkItem = config.hsailDeoptimizationWorkItem;
 808             final int offsetToDeoptimizationReason = config.hsailDeoptimizationReason;
 809             final int offsetToDeoptimizationFrame = config.hsailKernelDeoptimizationHeaderSize;
 810             final int offsetToFramePc = config.hsailFramePcOffset;
 811             final int offsetToNumSaves = config.hsailFrameNumSRegOffset;
 812             final int offsetToSaveArea = config.hsailFrameHeaderSize;
 813 
 814             AllocatableValue scratch64 = HSAIL.d16.asValue(wordLIRKind);
 815             AllocatableValue cuSaveAreaPtr = HSAIL.d17.asValue(wordLIRKind);
 816             AllocatableValue waveMathScratch1 = HSAIL.d18.asValue(wordLIRKind);
 817             AllocatableValue waveMathScratch2 = HSAIL.d19.asValue(wordLIRKind);
 818 
 819             AllocatableValue actionAndReasonReg = HSAIL.actionAndReasonReg.asValue(LIRKind.value(Kind.Int));
 820             AllocatableValue codeBufferOffsetReg = HSAIL.codeBufferOffsetReg.asValue(LIRKind.value(Kind.Int));
 821             AllocatableValue scratch32 = HSAIL.s34.asValue(LIRKind.value(Kind.Int));
 822             AllocatableValue workidreg = HSAIL.s35.asValue(LIRKind.value(Kind.Int));
 823 
 824             HSAILAddress deoptNextIndexAddr = new HSAILAddressValue(wordLIRKind, scratch64, offsetToDeoptNextIndex).toAddress();
 825             HSAILAddress neverRanArrayAddr = new HSAILAddressValue(wordLIRKind, scratch64, offsetToNeverRanArray).toAddress();
 826 
 827             // The just-started lanes that see the deopt flag will jump here
 828             asm.emitString0(deoptInProgressLabel + ":\n");
 829             asm.emitLoad(wordKind, waveMathScratch1, neverRanArrayAddr);
 830             asm.emitWorkItemAbsId(workidreg);
 831             asm.emitConvert(waveMathScratch2, workidreg, wordKind, Kind.Int);
 832             asm.emit("add", waveMathScratch1, waveMathScratch1, waveMathScratch2);
 833             HSAILAddress neverRanStoreAddr = new HSAILAddressValue(wordLIRKind, waveMathScratch1, 0).toAddress();
 834             asm.emitStore(Kind.Byte, Constant.forInt(1), neverRanStoreAddr);
 835             asm.emitString("ret;");
 836 
 837             // The deoptimizing lanes will jump here
 838             asm.emitString0(asm.getDeoptLabelName() + ":\n");
 839             String labelExit = asm.getDeoptLabelName() + "_Exit";
 840 
 841             HSAILAddress deoptInfoAddr = new HSAILAddressValue(wordLIRKind, scratch64, config.hsailDeoptOccurredOffset).toAddress();
 842             asm.emitLoadKernelArg(scratch64, asm.getDeoptInfoName(), "u64");
 843 
 844             // Set deopt occurred flag
 845             asm.emitMov(Kind.Int, scratch32, Constant.forInt(1));
 846             asm.emitStoreRelease(scratch32, deoptInfoAddr);
 847 
 848             asm.emitComment("// Determine next deopt save slot");
 849             asm.emitAtomicAdd(scratch32, deoptNextIndexAddr, Constant.forInt(1));
 850             /*
 851              * scratch32 now holds next index to use set error condition if no room in save area
 852              */
 853             asm.emitComment("// assert room to save deopt");
 854             asm.emitCompare(Kind.Int, scratch32, Constant.forInt(maxDeoptIndex), "lt", false, false);
 855             asm.cbr("@L_StoreDeopt");
 856             /*
 857              * if assert fails, store a guaranteed negative workitemid in top level deopt occurred
 858              * flag
 859              */
 860             asm.emitWorkItemAbsId(scratch32);
 861             asm.emit("mad", scratch32, scratch32, Constant.forInt(-1), Constant.forInt(-1));
 862             asm.emitStore(scratch32, deoptInfoAddr);
 863             asm.emitString("ret;");
 864 
 865             asm.emitString0("@L_StoreDeopt" + ":\n");
 866 
 867             // Store deopt for this workitem into its slot in the HSAILComputeUnitSaveStates array
 868 
 869             asm.emitComment("// Convert id's for ptr math");
 870             asm.emitConvert(cuSaveAreaPtr, scratch32, wordKind, Kind.Int);
 871             asm.emitComment("// multiply by sizeof KernelDeoptArea");
 872             asm.emit("mul", cuSaveAreaPtr, cuSaveAreaPtr, Constant.forInt(sizeofKernelDeopt));
 873             asm.emitComment("// Add computed offset to deoptInfoPtr base");
 874             asm.emit("add", cuSaveAreaPtr, cuSaveAreaPtr, scratch64);
 875             // Add offset to _deopt_save_states[0]
 876             asm.emit("add", scratch64, cuSaveAreaPtr, Constant.forInt(offsetToDeoptSaveStates));
 877 
 878             HSAILAddress workItemAddr = new HSAILAddressValue(wordLIRKind, scratch64, offsetToDeoptimizationWorkItem).toAddress();
 879             HSAILAddress actionReasonStoreAddr = new HSAILAddressValue(wordLIRKind, scratch64, offsetToDeoptimizationReason).toAddress();
 880 
 881             asm.emitComment("// Get _deopt_info._first_frame");
 882             asm.emit("add", waveMathScratch1, scratch64, Constant.forInt(offsetToDeoptimizationFrame));
 883             // Now waveMathScratch1 points at _deopt_info._first_frame
 884             HSAILAddress pcStoreAddr = new HSAILAddressValue(wordLIRKind, waveMathScratch1, offsetToFramePc).toAddress();
 885             HSAILAddress regCountsAddr = new HSAILAddressValue(wordLIRKind, waveMathScratch1, offsetToNumSaves).toAddress();
 886             asm.emitComment("// store deopting workitem");
 887             asm.emitWorkItemAbsId(scratch32);
 888             asm.emitStore(Kind.Int, scratch32, workItemAddr);
 889             asm.emitComment("// store actionAndReason");
 890             asm.emitStore(Kind.Int, actionAndReasonReg, actionReasonStoreAddr);
 891             asm.emitComment("// store PC");
 892             asm.emitStore(Kind.Int, codeBufferOffsetReg, pcStoreAddr);
 893 
 894             asm.emitComment("// store regCounts (" + numSRegs + " $s registers, " + numDRegs + " $d registers, " + numStackSlots + " stack slots)");
 895             asm.emitStore(Kind.Int, Constant.forInt(numSRegs + (numDRegs << 8) + (numStackSlots << 16)), regCountsAddr);
 896 
 897             /*
 898              * Loop thru the usedValues storing each of the registers that are used. We always store
 899              * in a fixed location, even if some registers are skipped.
 900              */
 901             asm.emitComment("// store used regs");
 902             for (Register reg : infoUsedRegs) {
 903                 if (hsailRegConfig.isAllocatableSReg(reg)) {
 904                     // 32 bit registers
 905                     Kind kind = Kind.Int;
 906                     int ofst = offsetToSaveArea + reg.encoding * 4;
 907                     HSAILAddress addr = new HSAILAddressValue(wordLIRKind, waveMathScratch1, ofst).toAddress();
 908                     AllocatableValue regValue = reg.asValue(LIRKind.value(kind));
 909                     asm.emitStore(kind, regValue, addr);
 910                 } else if (hsailRegConfig.isAllocatableDReg(reg)) {
 911                     // 64 bit registers
 912                     Kind kind = Kind.Long;
 913                     // d reg ofst starts past the numSRegs saved s registers (4 bytes each)
 914                     int ofst = offsetToSaveArea + (numSRegs * 4) + reg.encoding * 8;
 915                     HSAILAddress addr = new HSAILAddressValue(wordLIRKind, waveMathScratch1, ofst).toAddress();
 916                     AllocatableValue regValue = reg.asValue(LIRKind.value(kind));
 917                     asm.emitStore(kind, regValue, addr);
 918                 } else {
 919                     throw GraalInternalError.unimplemented();
 920                 }
 921             }
 922 
 923             // loop thru the usedStackSlots creating instructions to save in the save area
 924             if (numStackSlotBytes > 0) {
 925                 asm.emitComment("// store stack slots (uses " + numStackSlotBytes + " bytes)");
 926                 for (StackSlot slot : infoUsedStackSlots) {
 927                     asm.emitComment("// store " + slot);
 928                     Kind kind = slot.getKind();
 929                     int sizeInBits = (kind.isObject() || kind.getByteCount() == 8 ? 64 : 32);
 930                     int ofst = offsetToSaveArea + (numSRegs * 4) + (numDRegs * 8) + HSAIL.getStackOffsetStart(slot, sizeInBits);
 931                     HSAILAddress addr = new HSAILAddressValue(wordLIRKind, waveMathScratch1, ofst).toAddress();
 932                     if (sizeInBits == 64) {
 933                         asm.emitSpillLoad(kind, scratch64, slot);
 934                         asm.emitStore(kind, scratch64, addr);
 935                     } else {
 936                         asm.emitSpillLoad(kind, scratch32, slot);
 937                         asm.emitStore(kind, scratch32, addr);
 938                     }
 939                 }
 940             }
 941 
 942             asm.emitString0(labelExit + ":\n");
 943 
 944             // and emit the return
 945             crb.frameContext.leave(crb);
 946             asm.exit();
 947             // build the oopMap Array
 948             int[] oopMapArray = new OopMapArrayBuilder().build(infoList, numSRegs, numDRegs, numStackSlots, hsailRegConfig);
 949             ((ExternalCompilationResult) crb.compilationResult).setOopMapArray(oopMapArray);
 950         } else {
 951             // Deoptimization is explicitly off, so emit simple return
 952             asm.emitString0(asm.getDeoptLabelName() + ":\n");
 953             asm.emitComment("// No deoptimization");
 954             asm.emitString("ret;");
 955         }
 956 
 957         asm.emitString0("}; \n");
 958 
 959         ExternalCompilationResult compilationResult = (ExternalCompilationResult) crb.compilationResult;
 960         if (useHSAILDeoptimization) {
 961             compilationResult.setHostGraph(prepareHostGraph(method, lirGenRes.getDeopts(), getProviders(), config, numSRegs, numDRegs));
 962         }
 963     }
 964 
 965     private static class OopMapArrayBuilder {
 966         // oopMapArray struct
 967         // int bytesPerSaveArea; (not strictly part of oopsmap but convenient to put here)
 968         // int intsPerInfopoint;
 969         static final int SAVEAREACOUNTS_OFST = 0;
 970         static final int INTSPERINFOPOINT_OFST = 1;
 971         static final int HEADERSIZE = 2;
 972         // for each infopoint:
 973         // int deoptId
 974         // one or more ints of bits for the oopmap
 975 
 976         private int[] array;
 977         private int intsPerInfopoint;
 978 
 979         int[] build(List<Infopoint> infoList, int numSRegs, int numDRegs, int numStackSlots, HSAILHotSpotRegisterConfig hsailRegConfig) {
 980             /*
 981              * We are told that infoList is always sorted. Each infoPoint can have a different
 982              * oopMap. Since numStackSlots is the number of 8-byte stack slots used, it is an upper
 983              * limit on the number of oop stack slots
 984              */
 985             int bitsPerInfopoint = numDRegs + numStackSlots;
 986             int intsForBits = (bitsPerInfopoint + 31) / 32;
 987             int numInfopoints = infoList.size();
 988             intsPerInfopoint = intsForBits + 1;  // +1 for the pcoffset
 989             int arraySize = HEADERSIZE + (numInfopoints * intsPerInfopoint);
 990             array = new int[arraySize];
 991             array[INTSPERINFOPOINT_OFST] = intsPerInfopoint;
 992             // compute saveAreaCounts
 993             int saveAreaCounts = (numSRegs & 0xff) + (numDRegs << 8) + (numStackSlots << 16);
 994             array[SAVEAREACOUNTS_OFST] = saveAreaCounts;
 995 
 996             // loop thru the infoList
 997             int infoIndex = 0;
 998             for (Infopoint info : infoList) {
 999                 setOopMapPcOffset(infoIndex, info.pcOffset);
1000                 BytecodeFrame frame = info.debugInfo.frame();
1001                 while (frame != null) {
1002                     for (int i = 0; i < frame.numLocals + frame.numStack; i++) {
1003                         Value val = frame.values[i];
1004                         if (isLegal(val)) {
1005                             if (isRegister(val)) {
1006                                 Register reg = asRegister(val);
1007                                 if (val.getKind().isObject()) {
1008                                     assert (hsailRegConfig.isAllocatableDReg(reg));
1009                                     int bitIndex = reg.encoding();
1010                                     setOopMapBit(infoIndex, bitIndex);
1011                                 }
1012                             } else if (isStackSlot(val)) {
1013                                 StackSlot slot = asStackSlot(val);
1014                                 if (val.getKind().isObject()) {
1015                                     assert (HSAIL.getStackOffsetStart(slot, 64) % 8 == 0);
1016                                     int bitIndex = numDRegs + HSAIL.getStackOffsetStart(slot, 64) / 8;
1017                                     setOopMapBit(infoIndex, bitIndex);
1018                                 }
1019                             }
1020                         }
1021                     }
1022                     frame = frame.caller();
1023                 }
1024                 infoIndex++;
1025             }
1026             try (Scope s = Debug.scope("CodeGen")) {
1027                 if (Debug.isLogEnabled()) {
1028                     Debug.log("numSRegs=%d, numDRegs=%d, numStackSlots=%d", numSRegs, numDRegs, numStackSlots);
1029                     // show infopoint oopmap details
1030                     for (infoIndex = 0; infoIndex < infoList.size(); infoIndex++) {
1031                         String infoString = "Infopoint " + infoIndex + ", pcOffset=" + getOopMapPcOffset(infoIndex) + ",   oopmap=";
1032                         for (int i = 0; i < intsForBits; i++) {
1033                             infoString += (i != 0 ? ", " : "") + Integer.toHexString(getOopMapBitsAsInt(infoIndex, i));
1034                         }
1035                         Debug.log(infoString);
1036                     }
1037                 }
1038             } catch (Throwable e) {
1039                 throw Debug.handle(e);
1040             }
1041 
1042             return array;
1043         }
1044 
1045         private void setOopMapPcOffset(int infoIndex, int pcOffset) {
1046             int arrIndex = HEADERSIZE + infoIndex * intsPerInfopoint;
1047             array[arrIndex] = pcOffset;
1048         }
1049 
1050         private int getOopMapPcOffset(int infoIndex) {
1051             int arrIndex = HEADERSIZE + infoIndex * intsPerInfopoint;
1052             return array[arrIndex];
1053         }
1054 
1055         private void setOopMapBit(int infoIndex, int bitIndex) {
1056             int arrIndex = HEADERSIZE + infoIndex * intsPerInfopoint + 1 + bitIndex / 32;
1057             array[arrIndex] |= (1 << (bitIndex % 32));
1058         }
1059 
1060         private int getOopMapBitsAsInt(int infoIndex, int intIndex) {
1061             int arrIndex = HEADERSIZE + infoIndex * intsPerInfopoint + 1 + intIndex;
1062             return array[arrIndex];
1063         }
1064     }
1065 
1066     private static StructuredGraph prepareHostGraph(ResolvedJavaMethod method, List<DeoptimizingOp> deopts, HotSpotProviders providers, HotSpotVMConfig config, int numSRegs, int numDRegs) {
1067         if (deopts.isEmpty()) {
1068             return null;
1069         }
1070         StructuredGraph hostGraph = new StructuredGraph(method, -2);
1071         ParameterNode deoptId = hostGraph.unique(new ParameterNode(0, StampFactory.intValue()));
1072         ParameterNode hsailFrame = hostGraph.unique(new ParameterNode(1, StampFactory.forKind(providers.getCodeCache().getTarget().wordKind)));
1073         ParameterNode reasonAndAction = hostGraph.unique(new ParameterNode(2, StampFactory.intValue()));
1074         ParameterNode speculation = hostGraph.unique(new ParameterNode(3, StampFactory.object()));
1075         BeginNode[] branches = new BeginNode[deopts.size() + 1];
1076         int[] keys = new int[deopts.size()];
1077         int[] keySuccessors = new int[deopts.size() + 1];
1078         double[] keyProbabilities = new double[deopts.size() + 1];
1079         int i = 0;
1080         Collections.sort(deopts, new Comparator<DeoptimizingOp>() {
1081             public int compare(DeoptimizingOp o1, DeoptimizingOp o2) {
1082                 return o1.getCodeBufferPos() - o2.getCodeBufferPos();
1083             }
1084         });
1085         for (DeoptimizingOp deopt : deopts) {
1086             keySuccessors[i] = i;
1087             keyProbabilities[i] = 1.0 / deopts.size();
1088             keys[i] = deopt.getCodeBufferPos();
1089             assert keys[i] >= 0;
1090             branches[i] = createHostDeoptBranch(deopt, hsailFrame, reasonAndAction, speculation, providers, config, numSRegs, numDRegs);
1091 
1092             i++;
1093         }
1094         keyProbabilities[deopts.size()] = 0; // default
1095         keySuccessors[deopts.size()] = deopts.size();
1096         branches[deopts.size()] = createHostCrashBranch(hostGraph, deoptId);
1097         IntegerSwitchNode switchNode = hostGraph.add(new IntegerSwitchNode(deoptId, branches, keys, keyProbabilities, keySuccessors));
1098         StartNode start = hostGraph.start();
1099         start.setNext(switchNode);
1100         /*
1101          * printf.setNext(printf2); printf2.setNext(switchNode);
1102          */
1103         hostGraph.setGuardsStage(GuardsStage.AFTER_FSA);
1104         return hostGraph;
1105     }
1106 
1107     private static BeginNode createHostCrashBranch(StructuredGraph hostGraph, ValueNode deoptId) {
1108         VMErrorNode vmError = hostGraph.add(new VMErrorNode("Error in HSAIL deopt. DeoptId=%d", deoptId));
1109         // ConvertNode.convert(hostGraph, Kind.Long, deoptId)));
1110         vmError.setNext(hostGraph.add(new ReturnNode(ConstantNode.defaultForKind(hostGraph.method().getSignature().getReturnKind(), hostGraph))));
1111         return BeginNode.begin(vmError);
1112     }
1113 
1114     private static BeginNode createHostDeoptBranch(DeoptimizingOp deopt, ParameterNode hsailFrame, ValueNode reasonAndAction, ValueNode speculation, HotSpotProviders providers,
1115                     HotSpotVMConfig config, int numSRegs, int numDRegs) {
1116         BeginNode branch = hsailFrame.graph().add(new BeginNode());
1117         DynamicDeoptimizeNode deoptimization = hsailFrame.graph().add(new DynamicDeoptimizeNode(reasonAndAction, speculation));
1118         deoptimization.setStateBefore(createFrameState(deopt.getFrameState().topFrame, hsailFrame, providers, config, numSRegs, numDRegs));
1119         branch.setNext(deoptimization);
1120         return branch;
1121     }
1122 
1123     private static FrameState createFrameState(BytecodeFrame lowLevelFrame, ParameterNode hsailFrame, HotSpotProviders providers, HotSpotVMConfig config, int numSRegs, int numDRegs) {
1124         return createFrameState(lowLevelFrame, hsailFrame, providers, config, numSRegs, numDRegs, new HashMap<VirtualObject, VirtualObjectNode>());
1125     }
1126 
1127     private static FrameState createFrameState(BytecodeFrame lowLevelFrame, ParameterNode hsailFrame, HotSpotProviders providers, HotSpotVMConfig config, int numSRegs, int numDRegs,
1128                     Map<VirtualObject, VirtualObjectNode> virtualObjects) {
1129         FrameState outterFrameState = null;
1130         if (lowLevelFrame.caller() != null) {
1131             outterFrameState = createFrameState(lowLevelFrame.caller(), hsailFrame, providers, config, numSRegs, numDRegs, virtualObjects);
1132         }
1133         StructuredGraph hostGraph = hsailFrame.graph();
1134         Function<? super Value, ? extends ValueNode> lirValueToHirNode = v -> getNodeForValueFromFrame(v, hsailFrame, hostGraph, providers, config, numSRegs, numDRegs, virtualObjects);
1135         ValueNode[] locals = new ValueNode[lowLevelFrame.numLocals];
1136         for (int i = 0; i < lowLevelFrame.numLocals; i++) {
1137             locals[i] = lirValueToHirNode.apply(lowLevelFrame.getLocalValue(i));
1138         }
1139         List<ValueNode> stack = new ArrayList<>(lowLevelFrame.numStack);
1140         for (int i = 0; i < lowLevelFrame.numStack; i++) {
1141             stack.add(lirValueToHirNode.apply(lowLevelFrame.getStackValue(i)));
1142         }
1143         ValueNode[] locks = new ValueNode[lowLevelFrame.numLocks];
1144         MonitorIdNode[] monitorIds = new MonitorIdNode[lowLevelFrame.numLocks];
1145         for (int i = 0; i < lowLevelFrame.numLocks; i++) {
1146             HotSpotMonitorValue lockValue = (HotSpotMonitorValue) lowLevelFrame.getLockValue(i);
1147             locks[i] = lirValueToHirNode.apply(lockValue);
1148             monitorIds[i] = getMonitorIdForHotSpotMonitorValueFromFrame(lockValue, hsailFrame, hostGraph);
1149         }
1150         FrameState frameState = hostGraph.add(new FrameState(lowLevelFrame.getMethod(), lowLevelFrame.getBCI(), locals, stack, locks, monitorIds, lowLevelFrame.rethrowException, false));
1151         if (outterFrameState != null) {
1152             frameState.setOuterFrameState(outterFrameState);
1153         }
1154         Map<VirtualObject, VirtualObjectNode> virtualObjectsCopy;
1155         // TODO this could be implemented more efficiently with a mark into the map
1156         // unfortunately LinkedHashMap doesn't seem to provide that.
1157         List<VirtualObjectState> virtualStates = new ArrayList<>(virtualObjects.size());
1158         do {
1159             virtualObjectsCopy = new HashMap<>(virtualObjects);
1160             virtualStates.clear();
1161             for (Entry<VirtualObject, VirtualObjectNode> entry : virtualObjectsCopy.entrySet()) {
1162                 VirtualObject virtualObject = entry.getKey();
1163                 VirtualObjectNode virtualObjectNode = entry.getValue();
1164                 List<ValueNode> fieldValues = Arrays.stream(virtualObject.getValues()).map(lirValueToHirNode).collect(Collectors.toList());
1165                 virtualStates.add(new VirtualObjectState(virtualObjectNode, fieldValues));
1166             }
1167             // New virtual objects may have been discovered while processing the previous set.
1168             // Wait until a fixed point is reached
1169         } while (virtualObjectsCopy.size() < virtualObjects.size());
1170         virtualStates.forEach(vos -> frameState.addVirtualObjectMapping(hostGraph.unique(vos)));
1171         return frameState;
1172     }
1173 
1174     @SuppressWarnings("unused")
1175     private static MonitorIdNode getMonitorIdForHotSpotMonitorValueFromFrame(HotSpotMonitorValue lockValue, ParameterNode hsailFrame, StructuredGraph hsailGraph) {
1176         if (lockValue.isEliminated()) {
1177             return null;
1178         }
1179         throw GraalInternalError.unimplemented();
1180     }
1181 
1182     private static ValueNode getNodeForValueFromFrame(Value localValue, ParameterNode hsailFrame, StructuredGraph hostGraph, HotSpotProviders providers, HotSpotVMConfig config, int numSRegs,
1183                     int numDRegs, Map<VirtualObject, VirtualObjectNode> virtualObjects) {
1184         ValueNode valueNode;
1185         if (localValue instanceof Constant) {
1186             valueNode = ConstantNode.forConstant((Constant) localValue, providers.getMetaAccess(), hostGraph);
1187         } else if (localValue instanceof VirtualObject) {
1188             valueNode = getNodeForVirtualObjectFromFrame((VirtualObject) localValue, virtualObjects, hostGraph);
1189         } else if (localValue instanceof StackSlot) {
1190             StackSlot slot = (StackSlot) localValue;
1191             valueNode = getNodeForStackSlotFromFrame(slot, localValue.getKind(), hsailFrame, hostGraph, providers, config, numSRegs, numDRegs);
1192         } else if (localValue instanceof HotSpotMonitorValue) {
1193             HotSpotMonitorValue hotSpotMonitorValue = (HotSpotMonitorValue) localValue;
1194             return getNodeForValueFromFrame(hotSpotMonitorValue.getOwner(), hsailFrame, hostGraph, providers, config, numSRegs, numDRegs, virtualObjects);
1195         } else if (localValue instanceof RegisterValue) {
1196             RegisterValue registerValue = (RegisterValue) localValue;
1197             int regNumber = registerValue.getRegister().number;
1198             valueNode = getNodeForRegisterFromFrame(regNumber, localValue.getKind(), hsailFrame, hostGraph, providers, config, numSRegs);
1199         } else if (Value.ILLEGAL.equals(localValue)) {
1200             valueNode = null;
1201         } else {
1202             throw GraalInternalError.shouldNotReachHere();
1203         }
1204         return valueNode;
1205     }
1206 
1207     private static ValueNode getNodeForVirtualObjectFromFrame(VirtualObject virtualObject, Map<VirtualObject, VirtualObjectNode> virtualObjects, StructuredGraph hostGraph) {
1208         return virtualObjects.computeIfAbsent(virtualObject, vo -> {
1209             if (vo.getType().isArray()) {
1210                 return hostGraph.add(new VirtualArrayNode(vo.getType().getComponentType(), vo.getValues().length));
1211             } else {
1212                 return hostGraph.add(new VirtualInstanceNode(vo.getType(), true));
1213             }
1214         });
1215     }
1216 
1217     private static ValueNode getNodeForRegisterFromFrame(int regNumber, Kind valueKind, ParameterNode hsailFrame, StructuredGraph hostGraph, HotSpotProviders providers, HotSpotVMConfig config,
1218                     int numSRegs) {
1219         ValueNode valueNode;
1220         LocationNode location;
1221         int longSize = providers.getCodeCache().getTarget().arch.getSizeInBytes(Kind.Long);
1222         int intSize = providers.getCodeCache().getTarget().arch.getSizeInBytes(Kind.Int);
1223         if (regNumber >= HSAIL.s0.number && regNumber <= HSAIL.s31.number) {
1224             long offset = config.hsailFrameHeaderSize + intSize * (regNumber - HSAIL.s0.number);
1225             location = ConstantLocationNode.create(FINAL_LOCATION, valueKind, offset, hostGraph);
1226         } else if (regNumber >= HSAIL.d0.number && regNumber <= HSAIL.d15.number) {
1227             long offset = config.hsailFrameHeaderSize + intSize * numSRegs + longSize * (regNumber - HSAIL.d0.number);
1228             location = ConstantLocationNode.create(FINAL_LOCATION, valueKind, offset, hostGraph);
1229         } else {
1230             throw GraalInternalError.shouldNotReachHere("unknown hsail register: " + regNumber);
1231         }
1232         valueNode = hostGraph.unique(new FloatingReadNode(hsailFrame, location, null, StampFactory.forKind(valueKind)));
1233         return valueNode;
1234     }
1235 
1236     private static ValueNode getNodeForStackSlotFromFrame(StackSlot slot, Kind valueKind, ParameterNode hsailFrame, StructuredGraph hostGraph, HotSpotProviders providers, HotSpotVMConfig config,
1237                     int numSRegs, int numDRegs) {
1238         int slotSizeInBits = (valueKind == Kind.Object ? 64 : valueKind.getByteCount() * 8);
1239         if ((slotSizeInBits == 32) || (slotSizeInBits == 64)) {
1240             int longSize = providers.getCodeCache().getTarget().arch.getSizeInBytes(Kind.Long);
1241             int intSize = providers.getCodeCache().getTarget().arch.getSizeInBytes(Kind.Int);
1242             long offset = config.hsailFrameHeaderSize + (intSize * numSRegs) + (longSize * numDRegs) + HSAIL.getStackOffsetStart(slot, slotSizeInBits);
1243             LocationNode location = ConstantLocationNode.create(FINAL_LOCATION, valueKind, offset, hostGraph);
1244             ValueNode valueNode = hostGraph.unique(new FloatingReadNode(hsailFrame, location, null, StampFactory.forKind(valueKind)));
1245             return valueNode;
1246         } else {
1247             throw GraalInternalError.shouldNotReachHere("unsupported stack slot kind: " + valueKind);
1248         }
1249     }
1250 }