1 /*
   2  * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016, 2019, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "registerSaver_s390.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "interpreter/interp_masm.hpp"
  33 #include "memory/universe.hpp"
  34 #include "nativeInst_s390.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 
  46 // Declaration and definition of StubGenerator (no .hpp file).
  47 // For a more detailed description of the stub routine structure
  48 // see the comment in stubRoutines.hpp.
  49 
  50 #ifdef PRODUCT
  51 #define __ _masm->
  52 #else
  53 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
  54 #endif
  55 
  56 #define BLOCK_COMMENT(str) if (PrintAssembly) __ block_comment(str)
  57 #define BIND(label)        bind(label); BLOCK_COMMENT(#label ":")
  58 
  59 // -----------------------------------------------------------------------
  60 // Stub Code definitions
  61 
  62 class StubGenerator: public StubCodeGenerator {
  63  private:
  64 
  65   //----------------------------------------------------------------------
  66   // Call stubs are used to call Java from C.
  67 
  68   //
  69   // Arguments:
  70   //
  71   //   R2        - call wrapper address     : address
  72   //   R3        - result                   : intptr_t*
  73   //   R4        - result type              : BasicType
  74   //   R5        - method                   : method
  75   //   R6        - frame mgr entry point    : address
  76   //   [SP+160]  - parameter block          : intptr_t*
  77   //   [SP+172]  - parameter count in words : int
  78   //   [SP+176]  - thread                   : Thread*
  79   //
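       // For reference, the VM reaches this stub through the CallStub function pointer
       // type declared in stubRoutines.hpp, which is roughly
       //
       //   void CallStub(address link, intptr_t* result, BasicType result_type,
       //                 Method* method, address entry_point, intptr_t* parameters,
       //                 int size_of_parameters, Thread* thread);
       //
       // With the z/Architecture C ABI the first five arguments arrive in R2..R6 and the
       // remaining ones in the caller's stack argument area (offset 160 and up), which
       // yields the layout shown above.
       //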
  80   address generate_call_stub(address& return_address) {
  81     // Set up a new C frame, copy Java arguments, call frame manager
  82     // or native_entry, and process result.
  83 
  84     StubCodeMark mark(this, "StubRoutines", "call_stub");
  85     address start = __ pc();
  86 
  87     Register r_arg_call_wrapper_addr   = Z_ARG1;
  88     Register r_arg_result_addr         = Z_ARG2;
  89     Register r_arg_result_type         = Z_ARG3;
  90     Register r_arg_method              = Z_ARG4;
  91     Register r_arg_entry               = Z_ARG5;
  92 
  93     // offsets to fp
  94     #define d_arg_thread 176
  95     #define d_arg_argument_addr 160
  96     #define d_arg_argument_count (168+4)
  97 
  98     Register r_entryframe_fp           = Z_tmp_1;
  99     Register r_top_of_arguments_addr   = Z_ARG4;
 100     Register r_new_arg_entry = Z_R14;
 101 
 102     // macros for frame offsets
 103     #define call_wrapper_address_offset \
 104                _z_entry_frame_locals_neg(call_wrapper_address)
 105     #define result_address_offset \
 106               _z_entry_frame_locals_neg(result_address)
 107     #define result_type_offset \
 108               _z_entry_frame_locals_neg(result_type)
 109     #define arguments_tos_address_offset \
 110               _z_entry_frame_locals_neg(arguments_tos_address)
 111 
 112     {
 113       //
 114       // STACK on entry to call_stub:
 115       //
 116       //     F1      [C_FRAME]
 117       //            ...
 118       //
 119 
 120       Register r_argument_addr              = Z_tmp_3;
 121       Register r_argumentcopy_addr          = Z_tmp_4;
 122       Register r_argument_size_in_bytes     = Z_ARG5;
 123       Register r_frame_size                 = Z_R1;
 124 
 125       Label arguments_copied;
 126 
 127       // Save non-volatile registers to ABI of caller frame.
 128       BLOCK_COMMENT("save registers, push frame {");
 129       __ z_stmg(Z_R6, Z_R14, 16, Z_SP);
 130       __ z_std(Z_F8, 96, Z_SP);
 131       __ z_std(Z_F9, 104, Z_SP);
 132       __ z_std(Z_F10, 112, Z_SP);
 133       __ z_std(Z_F11, 120, Z_SP);
 134       __ z_std(Z_F12, 128, Z_SP);
 135       __ z_std(Z_F13, 136, Z_SP);
 136       __ z_std(Z_F14, 144, Z_SP);
 137       __ z_std(Z_F15, 152, Z_SP);
 138 
 139       //
 140       // Push ENTRY_FRAME including arguments:
 141       //
 142       //     F0      [TOP_IJAVA_FRAME_ABI]
 143       //             [outgoing Java arguments]
 144       //             [ENTRY_FRAME_LOCALS]
 145       //     F1      [C_FRAME]
 146       //             ...
 147       //
 148 
 149       // Calculate new frame size and push frame.
 150       #define abi_plus_locals_size \
 151                 (frame::z_top_ijava_frame_abi_size + frame::z_entry_frame_locals_size)
 152       if (abi_plus_locals_size % BytesPerWord == 0) {
 153         // Preload constant part of frame size.
 154         __ load_const_optimized(r_frame_size, -abi_plus_locals_size/BytesPerWord);
 155         // Keep copy of our frame pointer (caller's SP).
 156         __ z_lgr(r_entryframe_fp, Z_SP);
 157         // Add space required by arguments to frame size.
 158         __ z_slgf(r_frame_size, d_arg_argument_count, Z_R0, Z_SP);
 159         // Move Z_ARG5 early, it will be used as a local.
 160         __ z_lgr(r_new_arg_entry, r_arg_entry);
 161         // Convert frame size from words to bytes.
 162         __ z_sllg(r_frame_size, r_frame_size, LogBytesPerWord);
 163         __ push_frame(r_frame_size, r_entryframe_fp,
 164                       false/*don't copy SP*/, true /*frame size sign inverted*/);
 165       } else {
 166         guarantee(false, "frame sizes should be multiples of word size (BytesPerWord)");
 167       }
 168       BLOCK_COMMENT("} save, push");
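           // Net effect of the block above (a sketch; N is the argument count read from
           // [SP+172]): r_frame_size ends up holding
           //   -(abi_plus_locals_size + N * BytesPerWord)
           // i.e. the negated byte size of the new frame, which push_frame() (called with
           // "frame size sign inverted") allocates below the saved caller SP kept in
           // r_entryframe_fp.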
 169 
 170       // Load argument registers for call.
 171       BLOCK_COMMENT("prepare/copy arguments {");
 172       __ z_lgr(Z_method, r_arg_method);
 173       __ z_lg(Z_thread, d_arg_thread, r_entryframe_fp);
 174 
 175       // Calculate top_of_arguments_addr which will be tos (not prepushed) later.
 176       // Simply use SP + frame::z_top_ijava_frame_abi_size.
 177       __ add2reg(r_top_of_arguments_addr,
 178                  frame::z_top_ijava_frame_abi_size - BytesPerWord, Z_SP);
 179 
 180       // Initialize call_stub locals (step 1).
 181       if ((call_wrapper_address_offset + BytesPerWord == result_address_offset) &&
 182           (result_address_offset + BytesPerWord == result_type_offset)          &&
 183           (result_type_offset + BytesPerWord == arguments_tos_address_offset)) {
 184 
 185         __ z_stmg(r_arg_call_wrapper_addr, r_top_of_arguments_addr,
 186                   call_wrapper_address_offset, r_entryframe_fp);
 187       } else {
 188         __ z_stg(r_arg_call_wrapper_addr,
 189                  call_wrapper_address_offset, r_entryframe_fp);
 190         __ z_stg(r_arg_result_addr,
 191                  result_address_offset, r_entryframe_fp);
 192         __ z_stg(r_arg_result_type,
 193                  result_type_offset, r_entryframe_fp);
 194         __ z_stg(r_top_of_arguments_addr,
 195                  arguments_tos_address_offset, r_entryframe_fp);
 196       }
 197 
 198       // Copy Java arguments.
 199 
 200       // Any arguments to copy?
 201       __ load_and_test_int2long(Z_R1, Address(r_entryframe_fp, d_arg_argument_count));
 202       __ z_bre(arguments_copied);
 203 
 204       // Prepare loop and copy arguments in reverse order.
 205       {
 206         // Calculate argument size in bytes.
 207         __ z_sllg(r_argument_size_in_bytes, Z_R1, LogBytesPerWord);
 208 
 209         // Get addr of first incoming Java argument.
 210         __ z_lg(r_argument_addr, d_arg_argument_addr, r_entryframe_fp);
 211 
 212         // Let r_argumentcopy_addr point to last outgoing Java argument.
 213         __ add2reg(r_argumentcopy_addr, BytesPerWord, r_top_of_arguments_addr); // = Z_SP+160 effectively.
 214 
 215         // Let r_argument_addr point to last incoming Java argument.
 216         __ add2reg_with_index(r_argument_addr, -BytesPerWord,
 217                               r_argument_size_in_bytes, r_argument_addr);
 218 
 219         // Now loop while Z_R1 > 0 and copy arguments.
 220         {
 221           Label next_argument;
 222           __ bind(next_argument);
 223           // Mem-mem move.
 224           __ z_mvc(0, BytesPerWord-1, r_argumentcopy_addr, 0, r_argument_addr);
 225           __ add2reg(r_argument_addr,    -BytesPerWord);
 226           __ add2reg(r_argumentcopy_addr, BytesPerWord);
 227           __ z_brct(Z_R1, next_argument);
 228         }
 229       }  // End of argument copy loop.
 230 
 231       __ bind(arguments_copied);
 232     }
 233     BLOCK_COMMENT("} arguments");
 234 
 235     BLOCK_COMMENT("call {");
 236     {
 237       // Call frame manager or native entry.
 238 
 239       //
 240       // Register state on entry to frame manager / native entry:
 241       //
 242       //   Z_ARG1 = r_top_of_arguments_addr  - intptr_t *sender tos (prepushed)
 243       //                                       Lesp = (SP) + copied_arguments_offset - 8
 244       //   Z_method                          - method
 245       //   Z_thread                          - JavaThread*
 246       //
 247 
 248       // Here, the usual SP is the initial_caller_sp.
 249       __ z_lgr(Z_R10, Z_SP);
 250 
 251       // Z_esp points to the slot below the last argument.
 252       __ z_lgr(Z_esp, r_top_of_arguments_addr);
 253 
 254       //
 255       // Stack on entry to frame manager / native entry:
 256       //
 257       //     F0      [TOP_IJAVA_FRAME_ABI]
 258       //             [outgoing Java arguments]
 259       //             [ENTRY_FRAME_LOCALS]
 260       //     F1      [C_FRAME]
 261       //             ...
 262       //
 263 
 264       // Do a light-weight C-call here, r_new_arg_entry holds the address
 265       // of the interpreter entry point (frame manager or native entry)
 266       // and save runtime-value of return_pc in return_address
 267       // (call by reference argument).
 268       return_address = __ call_stub(r_new_arg_entry);
 269     }
 270     BLOCK_COMMENT("} call");
 271 
 272     {
 273       BLOCK_COMMENT("restore registers {");
 274       // Returned from frame manager or native entry.
 275       // Now pop frame, process result, and return to caller.
 276 
 277       //
 278       // Stack on exit from frame manager / native entry:
 279       //
 280       //     F0      [ABI]
 281       //             ...
 282       //             [ENTRY_FRAME_LOCALS]
 283       //     F1      [C_FRAME]
 284       //             ...
 285       //
 286       // Just pop the topmost frame ...
 287       //
 288 
 289       // Restore frame pointer.
 290       __ z_lg(r_entryframe_fp, _z_abi(callers_sp), Z_SP);
 291       // Pop frame. Done here to minimize stalls.
 292       __ pop_frame();
 293 
 294       // Reload some volatile registers which we've spilled before the call
 295       // to frame manager / native entry.
 296       // Access all locals via frame pointer, because we know nothing about
 297       // the topmost frame's size.
 298       __ z_lg(r_arg_result_addr, result_address_offset, r_entryframe_fp);
 299       __ z_lg(r_arg_result_type, result_type_offset, r_entryframe_fp);
 300 
 301       // Restore non-volatiles.
 302       __ z_lmg(Z_R6, Z_R14, 16, Z_SP);
 303       __ z_ld(Z_F8, 96, Z_SP);
 304       __ z_ld(Z_F9, 104, Z_SP);
 305       __ z_ld(Z_F10, 112, Z_SP);
 306       __ z_ld(Z_F11, 120, Z_SP);
 307       __ z_ld(Z_F12, 128, Z_SP);
 308       __ z_ld(Z_F13, 136, Z_SP);
 309       __ z_ld(Z_F14, 144, Z_SP);
 310       __ z_ld(Z_F15, 152, Z_SP);
 311       BLOCK_COMMENT("} restore");
 312 
 313       //
 314       // Stack on exit from call_stub:
 315       //
 316       //     0       [C_FRAME]
 317       //             ...
 318       //
 319       // No call_stub frames left.
 320       //
 321 
 322       // All non-volatiles have been restored at this point!!
 323 
 324       //------------------------------------------------------------------------
 325       // The following code makes some assumptions on the T_<type> enum values.
 326       // The enum is defined in globalDefinitions.hpp.
 327       // The validity of the assumptions is tested as far as possible.
 328       //   The assigned values should not be shuffled
 329       //   T_BOOLEAN==4    - lowest used enum value
 330       //   T_NARROWOOP==16 - largest used enum value
 331       //------------------------------------------------------------------------
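           // Dispatch sketch: the code below branches to
           //   firstHandler + (result_type - T_BOOLEAN) * 8
           // e.g. for T_LONG (== T_BOOLEAN + 7, see the guarantees below) it lands at
           // firstHandler + 56, the 8-byte handler that stores the full 64-bit Z_RET.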
 332       BLOCK_COMMENT("process result {");
 333       Label firstHandler;
 334       int   handlerLen = 8;
 335 #ifdef ASSERT
 336       char  assertMsg[] = "check BasicType definition in globalDefinitions.hpp";
 337       __ z_chi(r_arg_result_type, T_BOOLEAN);
 338       __ asm_assert_low(assertMsg, 0x0234);
 339       __ z_chi(r_arg_result_type, T_NARROWOOP);
 340       __ asm_assert_high(assertMsg, 0x0235);
 341 #endif
 342       __ add2reg(r_arg_result_type, -T_BOOLEAN);          // Remove offset.
 343       __ z_larl(Z_R1, firstHandler);                      // location of first handler
 344       __ z_sllg(r_arg_result_type, r_arg_result_type, 3); // Each handler is 8 bytes long.
 345       __ z_bc(MacroAssembler::bcondAlways, 0, r_arg_result_type, Z_R1);
 346 
 347       __ align(handlerLen);
 348       __ bind(firstHandler);
 349       // T_BOOLEAN:
 350         guarantee(T_BOOLEAN == 4, "check BasicType definition in globalDefinitions.hpp");
 351         __ z_st(Z_RET, 0, r_arg_result_addr);
 352         __ z_br(Z_R14); // Return to caller.
 353         __ align(handlerLen);
 354       // T_CHAR:
 355         guarantee(T_CHAR == T_BOOLEAN+1, "check BasicType definition in globalDefinitions.hpp");
 356         __ z_st(Z_RET, 0, r_arg_result_addr);
 357         __ z_br(Z_R14); // Return to caller.
 358         __ align(handlerLen);
 359       // T_FLOAT:
 360         guarantee(T_FLOAT == T_CHAR+1, "check BasicType definition in globalDefinitions.hpp");
 361         __ z_ste(Z_FRET, 0, r_arg_result_addr);
 362         __ z_br(Z_R14); // Return to caller.
 363         __ align(handlerLen);
 364       // T_DOUBLE:
 365         guarantee(T_DOUBLE == T_FLOAT+1, "check BasicType definition in globalDefinitions.hpp");
 366         __ z_std(Z_FRET, 0, r_arg_result_addr);
 367         __ z_br(Z_R14); // Return to caller.
 368         __ align(handlerLen);
 369       // T_BYTE:
 370         guarantee(T_BYTE == T_DOUBLE+1, "check BasicType definition in globalDefinitions.hpp");
 371         __ z_st(Z_RET, 0, r_arg_result_addr);
 372         __ z_br(Z_R14); // Return to caller.
 373         __ align(handlerLen);
 374       // T_SHORT:
 375         guarantee(T_SHORT == T_BYTE+1, "check BasicType definition in globalDefinitions.hpp");
 376         __ z_st(Z_RET, 0, r_arg_result_addr);
 377         __ z_br(Z_R14); // Return to caller.
 378         __ align(handlerLen);
 379       // T_INT:
 380         guarantee(T_INT == T_SHORT+1, "check BasicType definition in globalDefinitions.hpp");
 381         __ z_st(Z_RET, 0, r_arg_result_addr);
 382         __ z_br(Z_R14); // Return to caller.
 383         __ align(handlerLen);
 384       // T_LONG:
 385         guarantee(T_LONG == T_INT+1, "check BasicType definition in globalDefinitions.hpp");
 386         __ z_stg(Z_RET, 0, r_arg_result_addr);
 387         __ z_br(Z_R14); // Return to caller.
 388         __ align(handlerLen);
 389       // T_OBJECT:
 390         guarantee(T_OBJECT == T_LONG+1, "check BasicType definition in globalDefinitions.hpp");
 391         __ z_stg(Z_RET, 0, r_arg_result_addr);
 392         __ z_br(Z_R14); // Return to caller.
 393         __ align(handlerLen);
 394       // T_ARRAY:
 395         guarantee(T_ARRAY == T_OBJECT+1, "check BasicType definition in globalDefinitions.hpp");
 396         __ z_stg(Z_RET, 0, r_arg_result_addr);
 397         __ z_br(Z_R14); // Return to caller.
 398         __ align(handlerLen);
 399       // T_VOID:
 400         guarantee(T_VOID == T_ARRAY+1, "check BasicType definition in globalDefinitions.hpp");
 401         __ z_stg(Z_RET, 0, r_arg_result_addr);
 402         __ z_br(Z_R14); // Return to caller.
 403         __ align(handlerLen);
 404       // T_ADDRESS:
 405         guarantee(T_ADDRESS == T_VOID+1, "check BasicType definition in globalDefinitions.hpp");
 406         __ z_stg(Z_RET, 0, r_arg_result_addr);
 407         __ z_br(Z_R14); // Return to caller.
 408         __ align(handlerLen);
 409       // T_NARROWOOP:
 410         guarantee(T_NARROWOOP == T_ADDRESS+1, "check BasicType definition in globalDefinitions.hpp");
 411         __ z_st(Z_RET, 0, r_arg_result_addr);
 412         __ z_br(Z_R14); // Return to caller.
 413         __ align(handlerLen);
 414       BLOCK_COMMENT("} process result");
 415     }
 416     return start;
 417   }
 418 
 419   // Return point for a Java call if there's an exception thrown in
 420   // Java code. The exception is caught and transformed into a
 421   // pending exception stored in JavaThread that can be tested from
 422   // within the VM.
 423   address generate_catch_exception() {
 424     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 425 
 426     address start = __ pc();
 427 
 428     //
 429     // Registers alive
 430     //
 431     //   Z_thread
 432     //   Z_ARG1 - address of pending exception
 433     //   Z_ARG2 - return address in call stub
 434     //
 435 
 436     const Register exception_file = Z_R0;
 437     const Register exception_line = Z_R1;
 438 
 439     __ load_const_optimized(exception_file, (void*)__FILE__);
 440     __ load_const_optimized(exception_line, (void*)__LINE__);
 441 
 442     __ z_stg(Z_ARG1, thread_(pending_exception));
 443     // Store into `char *'.
 444     __ z_stg(exception_file, thread_(exception_file));
 445     // Store into `int'.
 446     __ z_st(exception_line, thread_(exception_line));
 447 
 448     // Complete return to VM.
 449     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 450 
 451     // Continue in call stub.
 452     __ z_br(Z_ARG2);
 453 
 454     return start;
 455   }
 456 
 457   // Continuation point for runtime calls returning with a pending
 458   // exception. The pending exception check happened in the runtime
 459   // or native call stub. The pending exception in Thread is
 460   // converted into a Java-level exception.
 461   //
 462   // Read:
 463   //   Z_R14: pc the runtime library callee wants to return to.
 464   //   Since the exception occurred in the callee, the return pc
 465   //   from the point of view of Java is the exception pc.
 466   //
 467   // Invalidate:
 468   //   Volatile registers (except below).
 469   //
 470   // Update:
 471   //   Z_ARG1: exception
 472   //   (Z_R14 is unchanged and is live out).
 473   //
 474   address generate_forward_exception() {
 475     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 476     address start = __ pc();
 477 
 478     #define pending_exception_offset in_bytes(Thread::pending_exception_offset())
 479 #ifdef ASSERT
 480     // Get pending exception oop.
 481     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 482 
 483     // Make sure that this code is only executed if there is a pending exception.
 484     {
 485       Label L;
 486       __ z_ltgr(Z_ARG1, Z_ARG1);
 487       __ z_brne(L);
 488       __ stop("StubRoutines::forward exception: no pending exception (1)");
 489       __ bind(L);
 490     }
 491 
 492     __ verify_oop(Z_ARG1, "StubRoutines::forward exception: not an oop");
 493 #endif
 494 
 495     __ z_lgr(Z_ARG2, Z_R14); // Copy exception pc into Z_ARG2.
 496     __ save_return_pc();
 497     __ push_frame_abi160(0);
 498     // Find exception handler.
 499     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address),
 500                     Z_thread,
 501                     Z_ARG2);
 502     // Copy handler's address.
 503     __ z_lgr(Z_R1, Z_RET);
 504     __ pop_frame();
 505     __ restore_return_pc();
 506 
 507     // Set up the arguments for the exception handler:
 508     // - Z_ARG1: exception oop
 509     // - Z_ARG2: exception pc
 510 
 511     // Load pending exception oop.
 512     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 513 
 514     // The exception pc is the return address in the caller,
 515     // must load it into Z_ARG2
 516     __ z_lgr(Z_ARG2, Z_R14);
 517 
 518 #ifdef ASSERT
 519     // Make sure exception is set.
 520     { Label L;
 521       __ z_ltgr(Z_ARG1, Z_ARG1);
 522       __ z_brne(L);
 523       __ stop("StubRoutines::forward exception: no pending exception (2)");
 524       __ bind(L);
 525     }
 526 #endif
 527     // Clear the pending exception.
 528     __ clear_mem(Address(Z_thread, pending_exception_offset), sizeof(void *));
 529     // Jump to exception handler
 530     __ z_br(Z_R1 /*handler address*/);
 531 
 532     return start;
 533 
 534     #undef pending_exception_offset
 535   }
 536 
 537   // Continuation point for throwing of implicit exceptions that are
 538   // not handled in the current activation. Fabricates an exception
 539   // oop and initiates normal exception dispatching in this
 540   // frame. Only callee-saved registers are preserved (through the
 541   // normal RegisterMap handling). If the compiler
 542   // needs all registers to be preserved between the fault point and
 543   // the exception handler then it must assume responsibility for that
 544   // in AbstractCompiler::continuation_for_implicit_null_exception or
 545   // continuation_for_implicit_division_by_zero_exception. All other
 546   // implicit exceptions (e.g., NullPointerException or
 547   // AbstractMethodError on entry) are either at call sites or
 548   // otherwise assume that stack unwinding will be initiated, so
 549   // caller saved registers were assumed volatile in the compiler.
 550 
 551   // Note that we generate only this stub into a RuntimeStub, because
 552   // it needs to be properly traversed and ignored during GC, so we
 553   // change the meaning of the "__" macro within this method.
 554 
 555   // Note: the routine set_pc_not_at_call_for_caller in
 556   // SharedRuntime.cpp requires that this code be generated into a
 557   // RuntimeStub.
 558 #undef __
 559 #define __ masm->
 560 
 561   address generate_throw_exception(const char* name, address runtime_entry,
 562                                    bool restore_saved_exception_pc,
 563                                    Register arg1 = noreg, Register arg2 = noreg) {
 564     assert_different_registers(arg1, Z_R0_scratch);  // would be destroyed by push_frame()
 565     assert_different_registers(arg2, Z_R0_scratch);  // would be destroyed by push_frame()
 566 
 567     int insts_size = 256;
 568     int locs_size  = 0;
 569     CodeBuffer      code(name, insts_size, locs_size);
 570     MacroAssembler* masm = new MacroAssembler(&code);
 571     int framesize_in_bytes;
 572     address start = __ pc();
 573 
 574     __ save_return_pc();
 575     framesize_in_bytes = __ push_frame_abi160(0);
 576 
 577     address frame_complete_pc = __ pc();
 578     if (restore_saved_exception_pc) {
 579       __ unimplemented("StubGenerator::throw_exception", 74);
 580     }
 581 
 582     // Note that we always have a runtime stub frame on the top of stack at this point.
 583     __ get_PC(Z_R1);
 584     __ set_last_Java_frame(/*sp*/Z_SP, /*pc*/Z_R1);
 585 
 586     // Do the call.
 587     BLOCK_COMMENT("call runtime_entry");
 588     __ call_VM_leaf(runtime_entry, Z_thread, arg1, arg2);
 589 
 590     __ reset_last_Java_frame();
 591 
 592 #ifdef ASSERT
 593     // Make sure that this code is only executed if there is a pending exception.
 594     { Label L;
 595       __ z_lg(Z_R0,
 596                 in_bytes(Thread::pending_exception_offset()),
 597                 Z_thread);
 598       __ z_ltgr(Z_R0, Z_R0);
 599       __ z_brne(L);
 600       __ stop("StubRoutines::throw_exception: no pending exception");
 601       __ bind(L);
 602     }
 603 #endif
 604 
 605     __ pop_frame();
 606     __ restore_return_pc();
 607 
 608     __ load_const_optimized(Z_R1, StubRoutines::forward_exception_entry());
 609     __ z_br(Z_R1);
 610 
 611     RuntimeStub* stub =
 612       RuntimeStub::new_runtime_stub(name, &code,
 613                                     frame_complete_pc - start,
 614                                     framesize_in_bytes/wordSize,
 615                                     NULL /*oop_maps*/, false);
 616 
 617     return stub->entry_point();
 618   }
 619 
 620 #undef __
 621 #ifdef PRODUCT
 622 #define __ _masm->
 623 #else
 624 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
 625 #endif
 626 
 627   // Support for uint StubRoutine::zarch::partial_subtype_check(Klass
 628   // sub, Klass super);
 629   //
 630   // Arguments:
 631   //   ret  : Z_RET, returned
 632   //   sub  : Z_ARG2, argument, not changed
 633   //   super: Z_ARG3, argument, not changed
 634   //
 635   //   raddr: Z_R14, blown by call
 636   //
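       // Conceptually (a sketch of the contract implemented below):
       //
       //   uint partial_subtype_check(Klass* sub, Klass* super);
       //     returns 0 with CC "equal"     if sub is a subtype of super,
       //     returns 1 with CC "not equal" otherwise,
       //
       // so callers may test either the return value or the condition code.
       //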
 637   address generate_partial_subtype_check() {
 638     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
 639     Label miss;
 640 
 641     address start = __ pc();
 642 
 643     const Register Rsubklass   = Z_ARG2; // subklass
 644     const Register Rsuperklass = Z_ARG3; // superklass
 645 
 646     // No args, but tmp registers that are killed.
 647     const Register Rlength     = Z_ARG4; // cache array length
 648     const Register Rarray_ptr  = Z_ARG5; // Current value from cache array.
 649 
 650     if (UseCompressedOops) {
 651       assert(Universe::heap() != NULL, "java heap must be initialized to generate partial_subtype_check stub");
 652     }
 653 
 654     // Always take the slow path (see SPARC).
 655     __ check_klass_subtype_slow_path(Rsubklass, Rsuperklass,
 656                                      Rarray_ptr, Rlength, NULL, &miss);
 657 
 658     // Match falls through here.
 659     __ clear_reg(Z_RET);               // Zero indicates a match. Set EQ flag in CC.
 660     __ z_br(Z_R14);
 661 
 662     __ BIND(miss);
 663     __ load_const_optimized(Z_RET, 1); // One indicates a miss.
 664     __ z_ltgr(Z_RET, Z_RET);           // Set NE flag in CC.
 665     __ z_br(Z_R14);
 666 
 667     return start;
 668   }
 669 
 670 #if !defined(PRODUCT)
 671   // Wrapper which calls oopDesc::is_oop_or_null()
 672   // Only called by MacroAssembler::verify_oop
 673   static void verify_oop_helper(const char* message, oopDesc* o) {
 674     if (!oopDesc::is_oop_or_null(o)) {
 675       fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 676     }
 677     ++ StubRoutines::_verify_oop_count;
 678   }
 679 #endif
 680 
 681   // Return address of code to be called from code generated by
 682   // MacroAssembler::verify_oop.
 683   //
 684   // Don't generate, rather use C++ code.
 685   address generate_verify_oop_subroutine() {
 686     // Don't generate a StubCodeMark, because no code is generated!
 687     // Generating the mark triggers notifying the oprofile jvmti agent
 688     // about the dynamic code generation, but the stub without
 689     // code (code_size == 0) confuses opjitconv
 690     // StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 691 
 692     address start = 0;
 693 
 694 #if !defined(PRODUCT)
 695     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 696 #endif
 697 
 698     return start;
 699   }
 700 
 701   // This is to test that the count register contains a positive int value.
 702   // Required because C2 does not respect int to long conversion for stub calls.
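       // Example: a properly zero-extended count such as 0x0000_0000_7fff_ffff shifted
       // right (arithmetically) by 31 yields 0 and the assert passes; a negative int or a
       // count with junk in the upper half yields a non-zero result and the assert fires.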
 703   void assert_positive_int(Register count) {
 704 #ifdef ASSERT
 705     __ z_srag(Z_R0, count, 31);  // Just leave the sign (must be zero) in Z_R0.
 706     __ asm_assert_eq("missing zero extend", 0xAFFE);
 707 #endif
 708   }
 709 
 710   //  Generate overlap test for array copy stubs.
 711   //  If no actual overlap is detected, control is transferred to the
 712   //  "normal" copy stub (entry address passed in disjoint_copy_target).
 713   //  Otherwise, execution continues with the code generated by the
 714   //  caller of array_overlap_test.
 715   //
 716   //  Input:
 717   //    Z_ARG1    - from
 718   //    Z_ARG2    - to
 719   //    Z_ARG3    - element count
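       //  The copy is treated as disjoint when to <= from, or when
       //  from + count*element_size <= to. Illustrative example (hypothetical addresses):
       //  from = 0x1000, to = 0x1008, count = 16, log2_elem_size = 3
       //  => the source range ends at 0x1080 > to, and to > from, so the ranges overlap
       //  destructively and the caller's conjoint code is used.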
 720   void array_overlap_test(address disjoint_copy_target, int log2_elem_size) {
 721     __ MacroAssembler::compare_and_branch_optimized(Z_ARG2, Z_ARG1, Assembler::bcondNotHigh,
 722                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 723 
 724     Register index = Z_ARG3;
 725     if (log2_elem_size > 0) {
 726       __ z_sllg(Z_R1, Z_ARG3, log2_elem_size);  // byte count
 727       index = Z_R1;
 728     }
 729     __ add2reg_with_index(Z_R1, 0, index, Z_ARG1);  // First byte after "from" range.
 730 
 731     __ MacroAssembler::compare_and_branch_optimized(Z_R1, Z_ARG2, Assembler::bcondNotHigh,
 732                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 733 
 734     // Destructive overlap: let caller generate code for that.
 735   }
 736 
 737   //  Generate stub for disjoint array copy. If "aligned" is true, the
 738   //  "from" and "to" addresses are assumed to be heapword aligned.
 739   //
 740   //  Arguments for generated stub:
 741   //      from:  Z_ARG1
 742   //      to:    Z_ARG2
 743   //      count: Z_ARG3 treated as signed
 744   void generate_disjoint_copy(bool aligned, int element_size,
 745                               bool branchToEnd,
 746                               bool restoreArgs) {
 747     // This is the zarch specific stub generator for general array copy tasks.
 748     // It has the following prereqs and features:
 749     //
 750     // - No destructive overlap allowed (else unpredictable results).
 751     // - Destructive overlap does not exist if the leftmost byte of the target
 752     //   does not coincide with any of the source bytes (except the leftmost).
 753     //
 754     //   Register usage upon entry:
 755     //      Z_ARG1 == Z_R2 :   address of source array
 756     //      Z_ARG2 == Z_R3 :   address of target array
 757     //      Z_ARG3 == Z_R4 :   length of operands (# of elements on entry)
 758     //
 759     // Register usage within the generator:
 760     // - Z_R0 and Z_R1 are KILLed by the stub routine (target addr/len).
 761     //                 Used as pair register operand in complex moves, scratch registers anyway.
 762     // - Z_R5 is KILLed by the stub routine (source register pair addr/len) (even/odd reg).
 763     //                  Same as R0/R1, but no scratch register.
 764     // - Z_ARG1, Z_ARG2, Z_ARG3 are USEd but preserved by the stub routine,
 765     //                          but they might get temporarily overwritten.
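         //
         // Copy-mode selection below, in terms of the byte count n = #elements * element_size
         // (a summary of the branches generated further down):
         //   n <= 256         : single (possibly EXecuted) MVC  -> doMVC / doMVCUnrolled (DW entities)
         //   256 < n <= 4096  : MVC loop over 256-byte chunks   -> doMVCLOOP (+ doMVCgeneral for the residue)
         //   n > 4096         : MVCLE                           -> fall-through case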
 766 
 767     Register  save_reg    = Z_ARG4;   // (= Z_R5), holds original target operand address for restore.
 768 
 769     {
 770       Register   llen_reg = Z_R1;     // Holds left operand len (odd reg).
 771       Register  laddr_reg = Z_R0;     // Holds left operand addr (even reg), overlaps with data_reg.
 772       Register   rlen_reg = Z_R5;     // Holds right operand len (odd reg), overlaps with save_reg.
 773       Register  raddr_reg = Z_R4;     // Holds right operand addr (even reg), overlaps with len_reg.
 774 
 775       Register   data_reg = Z_R0;     // Holds copied data chunk in alignment process and copy loop.
 776       Register    len_reg = Z_ARG3;   // Holds operand len (#elements at entry, #bytes shortly after).
 777       Register    dst_reg = Z_ARG2;   // Holds left (target)  operand addr.
 778       Register    src_reg = Z_ARG1;   // Holds right (source) operand addr.
 779 
 780       Label     doMVCLOOP, doMVCLOOPcount, doMVCLOOPiterate;
 781       Label     doMVCUnrolled;
 782       NearLabel doMVC,  doMVCgeneral, done;
 783       Label     MVC_template;
 784       address   pcMVCblock_b, pcMVCblock_e;
 785 
 786       bool      usedMVCLE       = true;
 787       bool      usedMVCLOOP     = true;
 788       bool      usedMVCUnrolled = false;
 789       bool      usedMVC         = false;
 790       bool      usedMVCgeneral  = false;
 791 
 792       int       stride;
 793       Register  stride_reg;
 794       Register  ix_reg;
 795 
 796       assert((element_size<=256) && (256%element_size == 0), "element size must be <= 256, power of 2");
 797       unsigned int log2_size = exact_log2(element_size);
 798 
 799       switch (element_size) {
 800         case 1:  BLOCK_COMMENT("ARRAYCOPY DISJOINT byte  {"); break;
 801         case 2:  BLOCK_COMMENT("ARRAYCOPY DISJOINT short {"); break;
 802         case 4:  BLOCK_COMMENT("ARRAYCOPY DISJOINT int   {"); break;
 803         case 8:  BLOCK_COMMENT("ARRAYCOPY DISJOINT long  {"); break;
 804         default: BLOCK_COMMENT("ARRAYCOPY DISJOINT       {"); break;
 805       }
 806 
 807       assert_positive_int(len_reg);
 808 
 809       BLOCK_COMMENT("preparation {");
 810 
 811       // No copying if len <= 0.
 812       if (branchToEnd) {
 813         __ compare64_and_branch(len_reg, (intptr_t) 0, Assembler::bcondNotHigh, done);
 814       } else {
 815         if (VM_Version::has_CompareBranch()) {
 816           __ z_cgib(len_reg, 0, Assembler::bcondNotHigh, 0, Z_R14);
 817         } else {
 818           __ z_ltgr(len_reg, len_reg);
 819           __ z_bcr(Assembler::bcondNotPositive, Z_R14);
 820         }
 821       }
 822 
 823       // Prefetch just one cache line. Speculative opt for short arrays.
 824       // Do not use Z_R1 in the prefetch; its value is undefined here.
 825       if (VM_Version::has_Prefetch()) {
 826         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
 827         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
 828       }
 829 
 830       BLOCK_COMMENT("} preparation");
 831 
 832       // Save args only if really needed.
 833       // Keep len test local to branch. Is generated only once.
 834 
 835       BLOCK_COMMENT("mode selection {");
 836 
 837       // Special handling for arrays with only a few elements.
 838       // Nothing fancy: just an executed MVC.
 839       if (log2_size > 0) {
 840         __ z_sllg(Z_R1, len_reg, log2_size); // Remember #bytes in Z_R1.
 841       }
 842       if (element_size != 8) {
 843         __ z_cghi(len_reg, 256/element_size);
 844         __ z_brnh(doMVC);
 845         usedMVC = true;
 846       }
 847       if (element_size == 8) { // Long and oop arrays are always aligned.
 848         __ z_cghi(len_reg, 256/element_size);
 849         __ z_brnh(doMVCUnrolled);
 850         usedMVCUnrolled = true;
 851       }
 852 
 853       // Prefetch another cache line. We, for sure, have more than one line to copy.
 854       if (VM_Version::has_Prefetch()) {
 855         __ z_pfd(0x01, 256, Z_R0, src_reg); // Fetch access.
 856         __ z_pfd(0x02, 256, Z_R0, dst_reg); // Store access.
 857       }
 858 
 859       if (restoreArgs) {
 860         // Remember entry value of ARG2 to restore all arguments later from that knowledge.
 861         __ z_lgr(save_reg, dst_reg);
 862       }
 863 
 864       __ z_cghi(len_reg, 4096/element_size);
 865       if (log2_size == 0) {
 866         __ z_lgr(Z_R1, len_reg); // Init Z_R1 with #bytes
 867       }
 868       __ z_brnh(doMVCLOOP);
 869 
 870       // Fall through to MVCLE case.
 871 
 872       BLOCK_COMMENT("} mode selection");
 873 
 874       // MVCLE: for long arrays
 875       //   DW aligned: Best performance for sizes > 4kBytes.
 876       //   unaligned:  Least complex for sizes > 256 bytes.
 877       if (usedMVCLE) {
 878         BLOCK_COMMENT("mode MVCLE {");
 879 
 880         // Setup registers for mvcle.
 881         //__ z_lgr(llen_reg, len_reg);// r1 <- r4  #bytes already in Z_R1, aka llen_reg.
 882         __ z_lgr(laddr_reg, dst_reg); // r0 <- r3
 883         __ z_lgr(raddr_reg, src_reg); // r4 <- r2
 884         __ z_lgr(rlen_reg, llen_reg); // r5 <- r1
 885 
 886         __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb0);    // special: bypass cache
 887         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb8); // special: Hold data in cache.
 888         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0);
 889 
 890         if (restoreArgs) {
 891           // MVCLE updates the source (Z_R4,Z_R5) and target (Z_R0,Z_R1) register pairs.
 892           // Dst_reg (Z_ARG2) and src_reg (Z_ARG1) are left untouched. No restore required.
 893           // Len_reg (Z_ARG3) is destroyed and must be restored.
 894           __ z_slgr(laddr_reg, dst_reg);    // copied #bytes
 895           if (log2_size > 0) {
 896             __ z_srag(Z_ARG3, laddr_reg, log2_size); // Convert back to #elements.
 897           } else {
 898             __ z_lgr(Z_ARG3, laddr_reg);
 899           }
 900         }
 901         if (branchToEnd) {
 902           __ z_bru(done);
 903         } else {
 904           __ z_br(Z_R14);
 905         }
 906         BLOCK_COMMENT("} mode MVCLE");
 907       }
 908       // No fallthru possible here.
 909 
 910       //  MVCUnrolled: for short, aligned arrays.
 911 
 912       if (usedMVCUnrolled) {
 913         BLOCK_COMMENT("mode MVC unrolled {");
 914         stride = 8;
 915 
 916         // Generate unrolled MVC instructions.
 917         for (int ii = 32; ii > 1; ii--) {
 918           __ z_mvc(0, ii * stride-1, dst_reg, 0, src_reg); // ii*8 byte copy
 919           if (branchToEnd) {
 920             __ z_bru(done);
 921           } else {
 922             __ z_br(Z_R14);
 923           }
 924         }
 925 
 926         pcMVCblock_b = __ pc();
 927         __ z_mvc(0, 1 * stride-1, dst_reg, 0, src_reg); // 8 byte copy
 928         if (branchToEnd) {
 929           __ z_bru(done);
 930         } else {
 931           __ z_br(Z_R14);
 932         }
 933 
 934         pcMVCblock_e = __ pc();
 935         Label MVC_ListEnd;
 936         __ bind(MVC_ListEnd);
 937 
 938         // This is an absolute fast path:
 939         // - Array len in bytes must not be greater than 256.
 940         // - Array len in bytes must be an integer mult of DW
 941         //   to save expensive handling of trailing bytes.
 942         // - Argument restore is not done,
 943         //   i.e. previous code must not alter arguments (this code doesn't either).
 944 
 945         __ bind(doMVCUnrolled);
 946 
 947         // Avoid mul, prefer shift where possible.
 948         // Combine shift right (for #DW) with shift left (for block size).
 949         // Set CC for zero test below (asm_assert).
 950         // Note: #bytes comes in Z_R1, #DW in len_reg.
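             // Illustrative sketch: for an element count of n DWs, Z_R0 ends up holding
             // n * MVCblocksize, so the branch below lands n blocks before MVC_ListEnd,
             // i.e. at the unrolled MVC that copies exactly n * 8 bytes.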
 951         unsigned int MVCblocksize    = pcMVCblock_e - pcMVCblock_b;
 952         unsigned int logMVCblocksize = 0xffffffffU; // Pacify compiler ("used uninitialized" warning).
 953 
 954         if (log2_size > 0) { // Len was scaled into Z_R1.
 955           switch (MVCblocksize) {
 956 
 957             case  8: logMVCblocksize = 3;
 958                      __ z_ltgr(Z_R0, Z_R1); // #bytes is index
 959                      break;                 // reasonable size, use shift
 960 
 961             case 16: logMVCblocksize = 4;
 962                      __ z_slag(Z_R0, Z_R1, logMVCblocksize-log2_size);
 963                      break;                 // reasonable size, use shift
 964 
 965             default: logMVCblocksize = 0;
 966                      __ z_ltgr(Z_R0, len_reg); // #DW for mul
 967                      break;                 // all other sizes: use mul
 968           }
 969         } else {
 970           guarantee(log2_size, "doMVCUnrolled: only for DW entities");
 971         }
 972 
 973         // This test (and branch) is redundant. Previous code makes sure that
 974         //  - element count > 0
 975         //  - element size == 8.
 976         // Thus, len reg should never be zero here. We insert an asm_assert() here,
 977         // just to double-check and to be on the safe side.
 978         __ asm_assert(false, "zero len cannot occur", 99);
 979 
 980         __ z_larl(Z_R1, MVC_ListEnd);        // Get addr of last instr block.
 981         // Avoid mul, prefer shift where possible.
 982         if (logMVCblocksize == 0) {
 983           __ z_mghi(Z_R0, MVCblocksize);
 984         }
 985         __ z_slgr(Z_R1, Z_R0);
 986         __ z_br(Z_R1);
 987         BLOCK_COMMENT("} mode MVC unrolled");
 988       }
 989       // No fallthru possible here.
 990 
 991       // MVC execute template
 992       // Must always generate. Usage may be switched on below.
 993       // There is no suitable place after here to put the template.
 994       __ bind(MVC_template);
 995       __ z_mvc(0,0,dst_reg,0,src_reg);      // Instr template, never exec directly!
 996 
 997 
 998       // MVC Loop: for medium-sized arrays
 999 
1000       // Only for DW aligned arrays (src and dst).
1001       // #bytes to copy must be at least 256!!!
1002       // Non-aligned cases handled separately.
1003       stride     = 256;
1004       stride_reg = Z_R1;   // Holds #bytes when control arrives here.
1005       ix_reg     = Z_ARG3; // Alias for len_reg.
1006 
1007 
1008       if (usedMVCLOOP) {
1009         BLOCK_COMMENT("mode MVC loop {");
1010         __ bind(doMVCLOOP);
1011 
1012         __ z_lcgr(ix_reg, Z_R1);         // Ix runs from -(n-2)*stride to 1*stride (inclusive).
1013         __ z_llill(stride_reg, stride);
1014         __ add2reg(ix_reg, 2*stride);    // Thus: increment ix by 2*stride.
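             // Illustrative trace (n = #bytes in Z_R1 on entry, e.g. n = 1000):
             //   ix starts at -1000 + 512 = -488; each BRXLG below adds 256 and iterates
             //   while ix <= 256, so 256-byte chunks are copied at ix = -488, -232, +24
             //   (768 bytes). The AGHI further down then yields ix = -232, whose magnitude
             //   is the residue left for doMVCgeneral (a zero result means all is copied).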
1015 
1016         __ bind(doMVCLOOPiterate);
1017           __ z_mvc(0, stride-1, dst_reg, 0, src_reg);
1018           __ add2reg(dst_reg, stride);
1019           __ add2reg(src_reg, stride);
1020           __ bind(doMVCLOOPcount);
1021           __ z_brxlg(ix_reg, stride_reg, doMVCLOOPiterate);
1022 
1023         // Don't use add2reg() here, since we must set the condition code!
1024         __ z_aghi(ix_reg, -2*stride);       // Compensate incr from above: zero diff means "all copied".
1025 
1026         if (restoreArgs) {
1027           __ z_lcgr(Z_R1, ix_reg);          // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1028           __ z_brnz(doMVCgeneral);          // We're not done yet, ix_reg is not zero.
1029 
1030           // ARG1, ARG2, and ARG3 were altered by the code above, so restore them building on save_reg.
1031           __ z_slgr(dst_reg, save_reg);     // copied #bytes
1032           __ z_slgr(src_reg, dst_reg);      // = ARG1 (now restored)
1033           if (log2_size) {
1034             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3.
1035           } else {
1036             __ z_lgr(Z_ARG3, dst_reg);
1037           }
1038           __ z_lgr(Z_ARG2, save_reg);       // ARG2 now restored.
1039 
1040           if (branchToEnd) {
1041             __ z_bru(done);
1042           } else {
1043             __ z_br(Z_R14);
1044           }
1045 
1046         } else {
1047           if (branchToEnd) {
1048             __ z_brz(done);                        // CC set by aghi instr.
1049           } else {
1050             __ z_bcr(Assembler::bcondZero, Z_R14); // We're all done if zero.
1051           }
1052 
1053           __ z_lcgr(Z_R1, ix_reg);    // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1054           // __ z_bru(doMVCgeneral);  // fallthru
1055         }
1056         usedMVCgeneral = true;
1057         BLOCK_COMMENT("} mode MVC loop");
1058       }
1059       // Fallthru to doMVCgeneral
1060 
1061       // MVCgeneral: for short, unaligned arrays, after other copy operations
1062 
1063       // Somewhat expensive due to use of EX instruction, but simple.
1064       if (usedMVCgeneral) {
1065         BLOCK_COMMENT("mode MVC general {");
1066         __ bind(doMVCgeneral);
1067 
1068         __ add2reg(len_reg, -1, Z_R1);             // Get #bytes-1 for EXECUTE.
1069         if (VM_Version::has_ExecuteExtensions()) {
1070           __ z_exrl(len_reg, MVC_template);        // Execute MVC with variable length.
1071         } else {
1072           __ z_larl(Z_R1, MVC_template);           // Get addr of instr template.
1073           __ z_ex(len_reg, 0, Z_R0, Z_R1);         // Execute MVC with variable length.
1074         }                                          // penalty: 9 ticks
1075 
1076         if (restoreArgs) {
1077           // ARG1, ARG2, and ARG3 were altered by code executed before, so restore them building on save_reg
1078           __ z_slgr(dst_reg, save_reg);            // Copied #bytes without the "doMVCgeneral" chunk
1079           __ z_slgr(src_reg, dst_reg);             // = ARG1 (now restored), was not advanced for "doMVCgeneral" chunk
1080           __ add2reg_with_index(dst_reg, 1, len_reg, dst_reg); // Len of executed MVC was not accounted for, yet.
1081           if (log2_size) {
1082             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3
1083           } else {
1084              __ z_lgr(Z_ARG3, dst_reg);
1085           }
1086           __ z_lgr(Z_ARG2, save_reg);              // ARG2 now restored.
1087         }
1088 
1089         if (usedMVC) {
1090           if (branchToEnd) {
1091             __ z_bru(done);
1092           } else {
1093             __ z_br(Z_R14);
1094           }
1095         } else {
1096           if (!branchToEnd) __ z_br(Z_R14);
1097         }
1098         BLOCK_COMMENT("} mode MVC general");
1099       }
1100       // Fallthru possible if following block not generated.
1101 
1102       // MVC: for short, unaligned arrays
1103 
1104       // Somewhat expensive due to use of EX instruction, but simple. penalty: 9 ticks.
1105       // Differs from doMVCgeneral in reconstruction of ARG2, ARG3, and ARG4.
1106       if (usedMVC) {
1107         BLOCK_COMMENT("mode MVC {");
1108         __ bind(doMVC);
1109 
1110         // get #bytes-1 for EXECUTE
1111         if (log2_size) {
1112           __ add2reg(Z_R1, -1);                // Length was scaled into Z_R1.
1113         } else {
1114           __ add2reg(Z_R1, -1, len_reg);       // Length was not scaled.
1115         }
1116 
1117         if (VM_Version::has_ExecuteExtensions()) {
1118           __ z_exrl(Z_R1, MVC_template);       // Execute MVC with variable length.
1119         } else {
1120           __ z_lgr(Z_R0, Z_R5);                // Save ARG4, may be unnecessary.
1121           __ z_larl(Z_R5, MVC_template);       // Get addr of instr template.
1122           __ z_ex(Z_R1, 0, Z_R0, Z_R5);        // Execute MVC with variable length.
1123           __ z_lgr(Z_R5, Z_R0);                // Restore ARG4, may be unnecessary.
1124         }
1125 
1126         if (!branchToEnd) {
1127           __ z_br(Z_R14);
1128         }
1129         BLOCK_COMMENT("} mode MVC");
1130       }
1131 
1132       __ bind(done);
1133 
1134       switch (element_size) {
1135         case 1:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT byte "); break;
1136         case 2:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT short"); break;
1137         case 4:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT int  "); break;
1138         case 8:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT long "); break;
1139         default: BLOCK_COMMENT("} ARRAYCOPY DISJOINT      "); break;
1140       }
1141     }
1142   }
1143 
1144   // Generate stub for conjoint array copy. If "aligned" is true, the
1145   // "from" and "to" addresses are assumed to be heapword aligned.
1146   //
1147   // Arguments for generated stub:
1148   //   from:  Z_ARG1
1149   //   to:    Z_ARG2
1150   //   count: Z_ARG3 treated as signed
1151   void generate_conjoint_copy(bool aligned, int element_size, bool branchToEnd) {
1152 
1153     // This is the zarch specific stub generator for general array copy tasks.
1154     // It has the following prereqs and features:
1155     //
1156     // - Destructive overlap exists and is handled by reverse copy.
1157     // - Destructive overlap exists if the leftmost byte of the target
1158     //   does coincide with any of the source bytes (except the leftmost).
1159     // - Z_R0 and Z_R1 are KILLed by the stub routine (data and stride)
1160     // - Z_ARG1 and Z_ARG2 are USEd but preserved by the stub routine.
1161     // - Z_ARG3 is USED but preserved by the stub routine.
1162     // - Z_ARG4 is used as index register and is thus KILLed.
1163     //
1164     {
1165       Register stride_reg = Z_R1;     // Stride & compare value in loop (negative element_size).
1166       Register   data_reg = Z_R0;     // Holds value of currently processed element.
1167       Register     ix_reg = Z_ARG4;   // Holds byte index of currently processed element.
1168       Register    len_reg = Z_ARG3;   // Holds length (in #elements) of arrays.
1169       Register    dst_reg = Z_ARG2;   // Holds left  operand addr.
1170       Register    src_reg = Z_ARG1;   // Holds right operand addr.
1171 
1172       assert(256%element_size == 0, "Element size must be power of 2.");
1173       assert(element_size     <= 8, "Can't handle more than DW units.");
1174 
1175       switch (element_size) {
1176         case 1:  BLOCK_COMMENT("ARRAYCOPY CONJOINT byte  {"); break;
1177         case 2:  BLOCK_COMMENT("ARRAYCOPY CONJOINT short {"); break;
1178         case 4:  BLOCK_COMMENT("ARRAYCOPY CONJOINT int   {"); break;
1179         case 8:  BLOCK_COMMENT("ARRAYCOPY CONJOINT long  {"); break;
1180         default: BLOCK_COMMENT("ARRAYCOPY CONJOINT       {"); break;
1181       }
1182 
1183       assert_positive_int(len_reg);
1184 
1185       if (VM_Version::has_Prefetch()) {
1186         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
1187         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
1188       }
1189 
1190       unsigned int log2_size = exact_log2(element_size);
1191       if (log2_size) {
1192         __ z_sllg(ix_reg, len_reg, log2_size);
1193       } else {
1194         __ z_lgr(ix_reg, len_reg);
1195       }
1196 
1197       // Optimize reverse copy loop.
1198       // Main loop copies DW units which may be unaligned. Unaligned access adds some penalty ticks.
1199       // Unaligned DW access (neither fetch nor store) is DW-atomic, but should be alignment-atomic.
1200       // Preceding the main loop, some bytes are copied to obtain a DW-multiple remaining length.
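           // Illustrative trace for a byte array (log2_size == 0) of length 13:
           //   ix = 13: bit 0 set   -> copy 1 byte  at index 12, ix = 12;
           //            bit 1 clear -> no HW copy;
           //            bit 2 set   -> copy 4 bytes at indices 8..11, ix = 8;
           //   the DW loop (BRXHG, stride -8) then copies the remaining bytes 0..7.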
1201 
1202       Label countLoop1;
1203       Label copyLoop1;
1204       Label skipBY;
1205       Label skipHW;
1206       int   stride = -8;
1207 
1208       __ load_const_optimized(stride_reg, stride); // Prepare for DW copy loop.
1209 
1210       if (element_size == 8)    // Nothing to do here.
1211         __ z_bru(countLoop1);
1212       else {                    // Do not generate dead code.
1213         __ z_tmll(ix_reg, 7);   // Check the "odd" bits.
1214         __ z_bre(countLoop1);   // There are none, very good!
1215       }
1216 
1217       if (log2_size == 0) {     // Handle leftover Byte.
1218         __ z_tmll(ix_reg, 1);
1219         __ z_bre(skipBY);
1220         __ z_lb(data_reg,   -1, ix_reg, src_reg);
1221         __ z_stcy(data_reg, -1, ix_reg, dst_reg);
1222         __ add2reg(ix_reg, -1); // Decrement delayed to avoid AGI.
1223         __ bind(skipBY);
1224         // fallthru
1225       }
1226       if (log2_size <= 1) {     // Handle leftover HW.
1227         __ z_tmll(ix_reg, 2);
1228         __ z_bre(skipHW);
1229         __ z_lhy(data_reg,  -2, ix_reg, src_reg);
1230         __ z_sthy(data_reg, -2, ix_reg, dst_reg);
1231         __ add2reg(ix_reg, -2); // Decrement delayed to avoid AGI.
1232         __ bind(skipHW);
1233         __ z_tmll(ix_reg, 4);
1234         __ z_bre(countLoop1);
1235         // fallthru
1236       }
1237       if (log2_size <= 2) {     // There are just 4 bytes (left) that need to be copied.
1238         __ z_ly(data_reg,  -4, ix_reg, src_reg);
1239         __ z_sty(data_reg, -4, ix_reg, dst_reg);
1240         __ add2reg(ix_reg, -4); // Decrement delayed to avoid AGI.
1241         __ z_bru(countLoop1);
1242       }
1243 
1244       // Control can never get to here. Never! Never ever!
1245       __ z_illtrap(0x99);
1246       __ bind(copyLoop1);
1247       __ z_lg(data_reg,  0, ix_reg, src_reg);
1248       __ z_stg(data_reg, 0, ix_reg, dst_reg);
1249       __ bind(countLoop1);
1250       __ z_brxhg(ix_reg, stride_reg, copyLoop1);
1251 
1252       if (!branchToEnd)
1253         __ z_br(Z_R14);
1254 
1255       switch (element_size) {
1256         case 1:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT byte "); break;
1257         case 2:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT short"); break;
1258         case 4:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT int  "); break;
1259         case 8:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT long "); break;
1260         default: BLOCK_COMMENT("} ARRAYCOPY CONJOINT      "); break;
1261       }
1262     }
1263   }
1264 
1265   // Generate stub for disjoint byte copy. If "aligned" is true, the
1266   // "from" and "to" addresses are assumed to be heapword aligned.
1267   address generate_disjoint_byte_copy(bool aligned, const char * name) {
1268     StubCodeMark mark(this, "StubRoutines", name);
1269 
1270     // This is the zarch specific stub generator for byte array copy.
1271     // Refer to generate_disjoint_copy for a list of prereqs and features:
1272     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1273     generate_disjoint_copy(aligned, 1, false, false);
1274     return __ addr_at(start_off);
1275   }
1276 
1277 
1278   address generate_disjoint_short_copy(bool aligned, const char * name) {
1279     StubCodeMark mark(this, "StubRoutines", name);
1280     // This is the zarch specific stub generator for short array copy.
1281     // Refer to generate_disjoint_copy for a list of prereqs and features:
1282     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1283     generate_disjoint_copy(aligned, 2, false, false);
1284     return __ addr_at(start_off);
1285   }
1286 
1287 
1288   address generate_disjoint_int_copy(bool aligned, const char * name) {
1289     StubCodeMark mark(this, "StubRoutines", name);
1290     // This is the zarch specific stub generator for int array copy.
1291     // Refer to generate_disjoint_copy for a list of prereqs and features:
1292     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1293     generate_disjoint_copy(aligned, 4, false, false);
1294     return __ addr_at(start_off);
1295   }
1296 
1297 
1298   address generate_disjoint_long_copy(bool aligned, const char * name) {
1299     StubCodeMark mark(this, "StubRoutines", name);
1300     // This is the zarch specific stub generator for long array copy.
1301     // Refer to generate_disjoint_copy for a list of prereqs and features:
1302     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1303     generate_disjoint_copy(aligned, 8, false, false);
1304     return __ addr_at(start_off);
1305   }
1306 
1307 
1308   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1309     StubCodeMark mark(this, "StubRoutines", name);
1310     // This is the zarch specific stub generator for oop array copy.
1311     // Refer to generate_disjoint_copy for a list of prereqs and features.
1312     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1313     unsigned int size      = UseCompressedOops ? 4 : 8;
1314 
1315     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1316     if (dest_uninitialized) {
1317       decorators |= IS_DEST_UNINITIALIZED;
1318     }
1319     if (aligned) {
1320       decorators |= ARRAYCOPY_ALIGNED;
1321     }
1322 
1323     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1324     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1325 
1326     generate_disjoint_copy(aligned, size, true, true);
1327 
1328     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1329 
1330     return __ addr_at(start_off);
1331   }
1332 
1333 
1334   address generate_conjoint_byte_copy(bool aligned, const char * name) {
1335     StubCodeMark mark(this, "StubRoutines", name);
1336     // This is the zarch specific stub generator for overlapping byte array copy.
1337     // Refer to generate_conjoint_copy for a list of prereqs and features:
1338     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1339     address nooverlap_target = aligned ? StubRoutines::arrayof_jbyte_disjoint_arraycopy()
1340                                        : StubRoutines::jbyte_disjoint_arraycopy();
1341 
1342     array_overlap_test(nooverlap_target, 0); // Branch away to nooverlap_target if disjoint.
1343     generate_conjoint_copy(aligned, 1, false);
1344 
1345     return __ addr_at(start_off);
1346   }
1347 
1348 
1349   address generate_conjoint_short_copy(bool aligned, const char * name) {
1350     StubCodeMark mark(this, "StubRoutines", name);
1351     // This is the zarch specific stub generator for overlapping short array copy.
1352     // Refer to generate_conjoint_copy for a list of prereqs and features:
1353     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1354     address nooverlap_target = aligned ? StubRoutines::arrayof_jshort_disjoint_arraycopy()
1355                                        : StubRoutines::jshort_disjoint_arraycopy();
1356 
1357     array_overlap_test(nooverlap_target, 1); // Branch away to nooverlap_target if disjoint.
1358     generate_conjoint_copy(aligned, 2, false);
1359 
1360     return __ addr_at(start_off);
1361   }
1362 
1363   address generate_conjoint_int_copy(bool aligned, const char * name) {
1364     StubCodeMark mark(this, "StubRoutines", name);
1365     // This is the zarch specific stub generator for overlapping int array copy.
1366     // Refer to generate_conjoint_copy for a list of prereqs and features:
1367 
1368     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1369     address nooverlap_target = aligned ? StubRoutines::arrayof_jint_disjoint_arraycopy()
1370                                        : StubRoutines::jint_disjoint_arraycopy();
1371 
1372     array_overlap_test(nooverlap_target, 2); // Branch away to nooverlap_target if disjoint.
1373     generate_conjoint_copy(aligned, 4, false);
1374 
1375     return __ addr_at(start_off);
1376   }
1377 
1378   address generate_conjoint_long_copy(bool aligned, const char * name) {
1379     StubCodeMark mark(this, "StubRoutines", name);
1380     // This is the zarch specific stub generator for overlapping long array copy.
1381     // Refer to generate_conjoint_copy for a list of prereqs and features:
1382 
1383     unsigned int start_off   = __ offset();  // Remember stub start address (is rtn value).
1384     address nooverlap_target = aligned ? StubRoutines::arrayof_jlong_disjoint_arraycopy()
1385                                        : StubRoutines::jlong_disjoint_arraycopy();
1386 
1387     array_overlap_test(nooverlap_target, 3); // Branch away to nooverlap_target if disjoint.
1388     generate_conjoint_copy(aligned, 8, false);
1389 
1390     return __ addr_at(start_off);
1391   }
1392 
1393   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1394     StubCodeMark mark(this, "StubRoutines", name);
1395     // This is the zarch specific stub generator for overlapping oop array copy.
1396     // Refer to generate_conjoint_copy for a list of prereqs and features.
1397     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1398     unsigned int size      = UseCompressedOops ? 4 : 8;
1399     unsigned int shift     = UseCompressedOops ? 2 : 3;
1400 
1401     address nooverlap_target = aligned ? StubRoutines::arrayof_oop_disjoint_arraycopy(dest_uninitialized)
1402                                        : StubRoutines::oop_disjoint_arraycopy(dest_uninitialized);
1403 
1404     // Branch to disjoint_copy (if applicable) before pre_barrier to avoid double pre_barrier.
1405     array_overlap_test(nooverlap_target, shift);  // Branch away to nooverlap_target if disjoint.
1406 
1407     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1408     if (dest_uninitialized) {
1409       decorators |= IS_DEST_UNINITIALIZED;
1410     }
1411     if (aligned) {
1412       decorators |= ARRAYCOPY_ALIGNED;
1413     }
1414 
1415     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1416     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1417 
1418     generate_conjoint_copy(aligned, size, true);  // Must preserve ARG2, ARG3.
1419 
1420     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1421 
1422     return __ addr_at(start_off);
1423   }
1424 
1425 
1426   void generate_arraycopy_stubs() {
1427 
    // Note: the disjoint stubs must be generated first, as some of
    // the conjoint stubs use them.
1430     StubRoutines::_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy (false, "jbyte_disjoint_arraycopy");
1431     StubRoutines::_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
1432     StubRoutines::_jint_disjoint_arraycopy       = generate_disjoint_int_copy  (false, "jint_disjoint_arraycopy");
1433     StubRoutines::_jlong_disjoint_arraycopy      = generate_disjoint_long_copy (false, "jlong_disjoint_arraycopy");
1434     StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (false, "oop_disjoint_arraycopy", false);
1435     StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (false, "oop_disjoint_arraycopy_uninit", true);
1436 
1437     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy (true, "arrayof_jbyte_disjoint_arraycopy");
1438     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
1439     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy  (true, "arrayof_jint_disjoint_arraycopy");
1440     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy (true, "arrayof_jlong_disjoint_arraycopy");
1441     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (true, "arrayof_oop_disjoint_arraycopy", false);
1442     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (true, "arrayof_oop_disjoint_arraycopy_uninit", true);
1443 
1444     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy (false, "jbyte_arraycopy");
1445     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, "jshort_arraycopy");
1446     StubRoutines::_jint_arraycopy            = generate_conjoint_int_copy  (false, "jint_arraycopy");
1447     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_copy (false, "jlong_arraycopy");
1448     StubRoutines::_oop_arraycopy             = generate_conjoint_oop_copy  (false, "oop_arraycopy", false);
1449     StubRoutines::_oop_arraycopy_uninit      = generate_conjoint_oop_copy  (false, "oop_arraycopy_uninit", true);
1450 
1451     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy (true, "arrayof_jbyte_arraycopy");
1452     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
1453     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy  (true, "arrayof_jint_arraycopy");
1454     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy (true, "arrayof_jlong_arraycopy");
1455     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy  (true, "arrayof_oop_arraycopy", false);
1456     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy  (true, "arrayof_oop_arraycopy_uninit", true);
1457   }
1458 
1459   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
1460 
1461     // safefetch signatures:
1462     //   int      SafeFetch32(int*      adr, int      errValue);
1463     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
1464     //
1465     // arguments:
1466     //   Z_ARG1 = adr
1467     //   Z_ARG2 = errValue
1468     //
1469     // result:
1470     //   Z_RET  = *adr or errValue
1471 
1472     StubCodeMark mark(this, "StubRoutines", name);
1473 
1474     // entry point
1475     // Load *adr into Z_ARG2, may fault.
1476     *entry = *fault_pc = __ pc();
1477     switch (size) {
1478       case 4:
1479         // Sign extended int32_t.
1480         __ z_lgf(Z_ARG2, 0, Z_ARG1);
1481         break;
1482       case 8:
1483         // int64_t
1484         __ z_lg(Z_ARG2, 0, Z_ARG1);
1485         break;
1486       default:
1487         ShouldNotReachHere();
1488     }
1489 
1490     // Return errValue or *adr.
1491     *continuation_pc = __ pc();
1492     __ z_lgr(Z_RET, Z_ARG2);
1493     __ z_br(Z_R14);
1494 
1495   }
1496 
1497   // Call interface for AES_encryptBlock, AES_decryptBlock stubs.
1498   //
1499   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
1500   //   Z_ARG2 - destination data block. Ptr to leftmost byte to be stored.
1501   //            For in-place encryption/decryption, ARG1 and ARG2 can point
1502   //            to the same piece of storage.
1503   //   Z_ARG3 - Crypto key address (expanded key). The first n bits of
1504   //            the expanded key constitute the original AES-<n> key (see below).
1505   //
1506   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
1507   //
1508   // Some remarks:
1509   //   The crypto key, as passed from the caller to these encryption stubs,
1510   //   is a so-called expanded key. It is derived from the original key
1511   //   by the Rijndael key schedule, see http://en.wikipedia.org/wiki/Rijndael_key_schedule
  //   With the expanded key, the cipher/decipher task is decomposed into
  //   multiple, less complex steps, called rounds. Sun SPARC and Intel
  //   processors provide hardware support for these individual rounds and
  //   therefore consume the expanded key.
  //   z/Architecture, in contrast, provides instructions that perform the
  //   complete cipher/decipher operation. Therefore, we need the original,
  //   not the expanded key here.
1517   //   Luckily, the first n bits of an AES-<n> expanded key are formed
1518   //   by the original key itself. That takes us out of trouble. :-)
1519   //   The key length (in bytes) relation is as follows:
1520   //     original    expanded   rounds  key bit     keylen
1521   //    key bytes   key bytes            length   in words
1522   //           16         176       11      128         44
1523   //           24         208       13      192         52
1524   //           32         240       15      256         60
1525   //
1526   // The crypto instructions used in the AES* stubs have some specific register requirements.
1527   //   Z_R0   holds the crypto function code. Please refer to the KM/KMC instruction
1528   //          description in the "z/Architecture Principles of Operation" manual for details.
1529   //   Z_R1   holds the parameter block address. The parameter block contains the cryptographic key
1530   //          (KM instruction) and the chaining value (KMC instruction).
1531   //   dst    must designate an even-numbered register, holding the address of the output message.
1532   //   src    must designate an even/odd register pair, holding the address/length of the original message
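  //
  // Illustrative key-length dispatch (a sketch mirroring the table above and the
  // checks emitted by generate_load_AES_fCode below; not emitted code):
  //
  //   int keylen_words = key_array_length;              // expanded key length in 4-byte words
  //   if      (keylen_words <  52) variant = AES_128;   // 44 words
  //   else if (keylen_words == 52) variant = AES_192;   // 52 words
  //   else                         variant = AES_256;   // 60 words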
1533 
  // Helper function which generates code to
  //  - load the crypto function code into register fCode (== Z_R0), and
  //  - load the data block length (depends on the AES variant) into register srclen.
  // The is_decipher argument selects between the cipher and decipher function codes.
1539   void generate_load_AES_fCode(Register keylen, Register fCode, Register srclen, bool is_decipher) {
1540 
1541     BLOCK_COMMENT("Set fCode {"); {
1542       Label fCode_set;
1543       int   mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1544       bool  identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk)
1545                                   && (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1546       // Expanded key length is 44/52/60 * 4 bytes for AES-128/AES-192/AES-256.
1547       __ z_cghi(keylen, 52); // Check only once at the beginning. keylen and fCode may share the same register.
1548 
1549       __ z_lghi(fCode, VM_Version::Cipher::_AES128 + mode);
1550       if (!identical_dataBlk_len) {
1551         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1552       }
1553       __ z_brl(fCode_set);  // keyLen <  52: AES128
1554 
1555       __ z_lghi(fCode, VM_Version::Cipher::_AES192 + mode);
1556       if (!identical_dataBlk_len) {
1557         __ z_lghi(srclen, VM_Version::Cipher::_AES192_dataBlk);
1558       }
1559       __ z_bre(fCode_set);  // keyLen == 52: AES192
1560 
1561       __ z_lghi(fCode, VM_Version::Cipher::_AES256 + mode);
1562       if (!identical_dataBlk_len) {
1563         __ z_lghi(srclen, VM_Version::Cipher::_AES256_dataBlk);
1564       }
      // __ z_brh(fCode_set);  // keyLen >  52: AES256           // fallthru
1566 
1567       __ bind(fCode_set);
1568       if (identical_dataBlk_len) {
1569         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1570       }
1571     }
1572     BLOCK_COMMENT("} Set fCode");
1573   }
1574 
1575   // Push a parameter block for the cipher/decipher instruction on the stack.
1576   // Layout of the additional stack space allocated for AES_cipherBlockChaining:
1577   //
1578   //   |        |
1579   //   +--------+ <-- SP before expansion
1580   //   |        |
1581   //   :        :  alignment loss, 0..(AES_parmBlk_align-8) bytes
1582   //   |        |
1583   //   +--------+
1584   //   |        |
1585   //   :        :  space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_C
1586   //   |        |
1587   //   +--------+ <-- parmBlk, octoword-aligned, start of parameter block
1588   //   |        |
1589   //   :        :  additional stack space for spills etc., size AES_parmBlk_addspace, DW @ Z_SP not usable!!!
1590   //   |        |
1591   //   +--------+ <-- Z_SP after expansion
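  //
  // A hedged arithmetic sketch of how generate_push_Block (below) derives the
  // octoword-aligned parameter block address from the resized SP; the constants
  // mirror the diagram above:
  //
  //   resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
  //   SP        -= resize_len;                              // resize_frame
  //   parmBlk    = (SP + resize_len - (cv_len + key_len))   // upper end of the block area
  //                & ~(AES_parmBlk_align - 1);              // align down to octoword
  //   parmBlk[-8]  holds the spilled keylen,
  //   parmBlk[-16] holds the SP value from before the resize.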
1592 
1593   void generate_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
1594                            Register parmBlk, Register keylen, Register fCode, Register cv, Register key) {
1595     const int AES_parmBlk_align    = 32;  // octoword alignment.
1596     const int AES_parmBlk_addspace = 24;  // Must be sufficiently large to hold all spilled registers
1597                                           // (currently 2) PLUS 1 DW for the frame pointer.
1598 
1599     const int cv_len     = dataBlk_len;
1600     const int key_len    = parmBlk_len - cv_len;
1601     // This len must be known at JIT compile time. Only then are we able to recalc the SP before resize.
1602     // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
1603     const int resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
1604 
1605     // Use parmBlk as temp reg here to hold the frame pointer.
1606     __ resize_frame(-resize_len, parmBlk, true);
1607 
1608     // calculate parmBlk address from updated (resized) SP.
1609     __ add2reg(parmBlk, resize_len - (cv_len + key_len), Z_SP);
1610     __ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
1611 
    // There is room for spill slots in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
1613     __ z_stg(keylen,  -8, parmBlk);                        // Spill keylen for later use.
1614 
1615     // calculate (SP before resize) from updated SP.
1616     __ add2reg(keylen, resize_len, Z_SP);                  // keylen holds prev SP for now.
1617     __ z_stg(keylen, -16, parmBlk);                        // Spill prev SP for easy revert.
1618 
1619     __ z_mvc(0,      cv_len-1,  parmBlk, 0, cv);     // Copy cv.
1620     __ z_mvc(cv_len, key_len-1, parmBlk, 0, key);    // Copy key.
1621     __ z_lghi(fCode, crypto_fCode);
1622   }
1623 
1624   // NOTE:
1625   //   Before returning, the stub has to copy the chaining value from
1626   //   the parmBlk, where it was updated by the crypto instruction, back
1627   //   to the chaining value array the address of which was passed in the cv argument.
1628   //   As all the available registers are used and modified by KMC, we need to save
1629   //   the key length across the KMC instruction. We do so by spilling it to the stack,
1630   //   just preceding the parmBlk (at (parmBlk - 8)).
1631   void generate_push_parmBlk(Register keylen, Register fCode, Register parmBlk, Register key, Register cv, bool is_decipher) {
1632     int       mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1633     Label     parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1634 
1635     BLOCK_COMMENT("push parmBlk {");
1636     if (VM_Version::has_Crypto_AES()   ) { __ z_cghi(keylen, 52); }
1637     if (VM_Version::has_Crypto_AES128()) { __ z_brl(parmBlk_128); }  // keyLen <  52: AES128
1638     if (VM_Version::has_Crypto_AES192()) { __ z_bre(parmBlk_192); }  // keyLen == 52: AES192
1639     if (VM_Version::has_Crypto_AES256()) { __ z_brh(parmBlk_256); }  // keyLen >  52: AES256
1640 
1641     // Security net: requested AES function not available on this CPU.
1642     // NOTE:
1643     //   As of now (March 2015), this safety net is not required. JCE policy files limit the
1644     //   cryptographic strength of the keys used to 128 bit. If we have AES hardware support
1645     //   at all, we have at least AES-128.
1646     __ stop_static("AES key strength not supported by CPU. Use -XX:-UseAES as remedy.", 0);
1647 
1648     if (VM_Version::has_Crypto_AES256()) {
1649       __ bind(parmBlk_256);
1650       generate_push_Block(VM_Version::Cipher::_AES256_dataBlk,
1651                           VM_Version::Cipher::_AES256_parmBlk_C,
1652                           VM_Version::Cipher::_AES256 + mode,
1653                           parmBlk, keylen, fCode, cv, key);
1654       if (VM_Version::has_Crypto_AES128() || VM_Version::has_Crypto_AES192()) {
1655         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1656       }
1657     }
1658 
1659     if (VM_Version::has_Crypto_AES192()) {
1660       __ bind(parmBlk_192);
1661       generate_push_Block(VM_Version::Cipher::_AES192_dataBlk,
1662                           VM_Version::Cipher::_AES192_parmBlk_C,
1663                           VM_Version::Cipher::_AES192 + mode,
1664                           parmBlk, keylen, fCode, cv, key);
1665       if (VM_Version::has_Crypto_AES128()) {
1666         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1667       }
1668     }
1669 
1670     if (VM_Version::has_Crypto_AES128()) {
1671       __ bind(parmBlk_128);
1672       generate_push_Block(VM_Version::Cipher::_AES128_dataBlk,
1673                           VM_Version::Cipher::_AES128_parmBlk_C,
1674                           VM_Version::Cipher::_AES128 + mode,
1675                           parmBlk, keylen, fCode, cv, key);
1676       // Fallthru
1677     }
1678 
1679     __ bind(parmBlk_set);
1680     BLOCK_COMMENT("} push parmBlk");
1681   }
1682 
1683   // Pop a parameter block from the stack. The chaining value portion of the parameter block
1684   // is copied back to the cv array as it is needed for subsequent cipher steps.
1685   // The keylen value as well as the original SP (before resizing) was pushed to the stack
1686   // when pushing the parameter block.
1687   void generate_pop_parmBlk(Register keylen, Register parmBlk, Register key, Register cv) {
1688 
1689     BLOCK_COMMENT("pop parmBlk {");
1690     bool identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk) &&
1691                                   (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1692     if (identical_dataBlk_len) {
1693       int cv_len = VM_Version::Cipher::_AES128_dataBlk;
1694       __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1695     } else {
1696       int cv_len;
1697       Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1698       __ z_lg(keylen, -8, parmBlk);  // restore keylen
1699       __ z_cghi(keylen, 52);
1700       if (VM_Version::has_Crypto_AES256()) __ z_brh(parmBlk_256);  // keyLen >  52: AES256
1701       if (VM_Version::has_Crypto_AES192()) __ z_bre(parmBlk_192);  // keyLen == 52: AES192
1702       // if (VM_Version::has_Crypto_AES128()) __ z_brl(parmBlk_128);  // keyLen <  52: AES128  // fallthru
1703 
      // Security net: none is needed here. If it were needed, we would already
      // have trapped when pushing the parameter block.
1706       if (VM_Version::has_Crypto_AES128()) {
1707         __ bind(parmBlk_128);
1708         cv_len = VM_Version::Cipher::_AES128_dataBlk;
1709         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1710         if (VM_Version::has_Crypto_AES192() || VM_Version::has_Crypto_AES256()) {
1711           __ z_bru(parmBlk_set);
1712         }
1713       }
1714 
1715       if (VM_Version::has_Crypto_AES192()) {
1716         __ bind(parmBlk_192);
1717         cv_len = VM_Version::Cipher::_AES192_dataBlk;
1718         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1719         if (VM_Version::has_Crypto_AES256()) {
1720           __ z_bru(parmBlk_set);
1721         }
1722       }
1723 
1724       if (VM_Version::has_Crypto_AES256()) {
1725         __ bind(parmBlk_256);
1726         cv_len = VM_Version::Cipher::_AES256_dataBlk;
1727         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1728         // __ z_bru(parmBlk_set);  // fallthru
1729       }
1730       __ bind(parmBlk_set);
1731     }
    __ z_lg(Z_SP, -16, parmBlk); // Restore the pre-resize Z_SP, saved by generate_push_Block.
1733     BLOCK_COMMENT("} pop parmBlk");
1734   }
1735 
1736   // Compute AES encrypt/decrypt function.
1737   void generate_AES_cipherBlock(bool is_decipher) {
1738     // Incoming arguments.
1739     Register       from    = Z_ARG1; // source byte array
1740     Register       to      = Z_ARG2; // destination byte array
1741     Register       key     = Z_ARG3; // expanded key array
1742 
1743     const Register keylen  = Z_R0;   // Temporarily (until fCode is set) holds the expanded key array length.
1744 
1745     // Register definitions as required by KM instruction.
1746     const Register fCode   = Z_R0;   // crypto function code
1747     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1748     const Register src     = Z_ARG1; // Must be even reg (KM requirement).
1749     const Register srclen  = Z_ARG2; // Must be odd reg and pair with src. Overwrites destination address.
1750     const Register dst     = Z_ARG3; // Must be even reg (KM requirement). Overwrites expanded key address.
1751 
1752     // Read key len of expanded key (in 4-byte words).
1753     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1754 
1755     // Copy arguments to registers as required by crypto instruction.
1756     __ z_lgr(parmBlk, key);          // crypto key (in T_INT array).
1757     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1758     __ z_lgr(dst, to);               // Copy dst address, even register required.
1759 
1760     // Construct function code into fCode(Z_R0), data block length into srclen(Z_ARG2).
1761     generate_load_AES_fCode(keylen, fCode, srclen, is_decipher);
1762 
1763     __ km(dst, src);                 // Cipher the message.
1764 
1765     __ z_br(Z_R14);
1766   }
1767 
1768   // Compute AES encrypt function.
1769   address generate_AES_encryptBlock(const char* name) {
1770     __ align(CodeEntryAlignment);
1771     StubCodeMark mark(this, "StubRoutines", name);
1772     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1773 
1774     generate_AES_cipherBlock(false);
1775 
1776     return __ addr_at(start_off);
1777   }
1778 
1779   // Compute AES decrypt function.
1780   address generate_AES_decryptBlock(const char* name) {
1781     __ align(CodeEntryAlignment);
1782     StubCodeMark mark(this, "StubRoutines", name);
1783     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1784 
1785     generate_AES_cipherBlock(true);
1786 
1787     return __ addr_at(start_off);
1788   }
1789 
1790   // These stubs receive the addresses of the cryptographic key and of the chaining value as two separate
1791   // arguments (registers "key" and "cv", respectively). The KMC instruction, on the other hand, requires
1792   // chaining value and key to be, in this sequence, adjacent in storage. Thus, we need to allocate some
1793   // thread-local working storage. Using heap memory incurs all the hassles of allocating/freeing.
  // Stack space, by contrast, is deallocated automatically when we return from the stub to the caller.
1795   // *** WARNING ***
1796   // Please note that we do not formally allocate stack space, nor do we
1797   // update the stack pointer. Therefore, no function calls are allowed
1798   // and nobody else must use the stack range where the parameter block
1799   // is located.
1800   // We align the parameter block to the next available octoword.
1801   //
  // Compute chained AES encrypt/decrypt function (is_decipher selects the direction).
1803   void generate_AES_cipherBlockChaining(bool is_decipher) {
1804 
1805     Register       from    = Z_ARG1; // source byte array (clear text)
1806     Register       to      = Z_ARG2; // destination byte array (ciphered)
1807     Register       key     = Z_ARG3; // expanded key array.
1808     Register       cv      = Z_ARG4; // chaining value
1809     const Register msglen  = Z_ARG5; // Total length of the msg to be encrypted. Value must be returned
1810                                      // in Z_RET upon completion of this stub. Is 32-bit integer.
1811 
1812     const Register keylen  = Z_R0;   // Expanded key length, as read from key array. Temp only.
1813     const Register fCode   = Z_R0;   // crypto function code
1814     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1815     const Register src     = Z_ARG1; // is Z_R2
1816     const Register srclen  = Z_ARG2; // Overwrites destination address.
1817     const Register dst     = Z_ARG3; // Overwrites key address.
1818 
1819     // Read key len of expanded key (in 4-byte words).
1820     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1821 
1822     // Construct parm block address in parmBlk (== Z_R1), copy cv and key to parm block.
1823     // Construct function code in fCode (Z_R0).
1824     generate_push_parmBlk(keylen, fCode, parmBlk, key, cv, is_decipher);
1825 
1826     // Prepare other registers for instruction.
1827     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1828     __ z_lgr(dst, to);
    __ z_llgfr(srclen, msglen);      // msglen is passed as int; zero-extend it to the 64-bit length KMC expects.
1830 
1831     __ kmc(dst, src);                // Cipher the message.
1832 
1833     generate_pop_parmBlk(keylen, parmBlk, key, cv);
1834 
    __ z_llgfr(Z_RET, msglen);       // Return the message length (passed as int), zero-extended, in Z_RET.
1836     __ z_br(Z_R14);
1837   }
1838 
1839   // Compute chained AES encrypt function.
1840   address generate_cipherBlockChaining_AES_encrypt(const char* name) {
1841     __ align(CodeEntryAlignment);
1842     StubCodeMark mark(this, "StubRoutines", name);
1843     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1844 
1845     generate_AES_cipherBlockChaining(false);
1846 
1847     return __ addr_at(start_off);
1848   }
1849 
  // Compute chained AES decrypt function.
1851   address generate_cipherBlockChaining_AES_decrypt(const char* name) {
1852     __ align(CodeEntryAlignment);
1853     StubCodeMark mark(this, "StubRoutines", name);
1854     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1855 
1856     generate_AES_cipherBlockChaining(true);
1857 
1858     return __ addr_at(start_off);
1859   }
1860 
1861 
1862   // Compute GHASH function.
1863   address generate_ghash_processBlocks() {
1864     __ align(CodeEntryAlignment);
1865     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
1866     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
1867 
1868     const Register state   = Z_ARG1;
1869     const Register subkeyH = Z_ARG2;
1870     const Register data    = Z_ARG3; // 1st of even-odd register pair.
1871     const Register blocks  = Z_ARG4;
1872     const Register len     = blocks; // 2nd of even-odd register pair.
1873 
1874     const int param_block_size = 4 * 8;
1875     const int frame_resize = param_block_size + 8; // Extra space for copy of fp.
1876 
1877     // Reserve stack space for parameter block (R1).
1878     __ z_lgr(Z_R1, Z_SP);
1879     __ resize_frame(-frame_resize, Z_R0, true);
1880     __ z_aghi(Z_R1, -param_block_size);
1881 
1882     // Fill parameter block.
1883     __ z_mvc(Address(Z_R1)    , Address(state)  , 16);
1884     __ z_mvc(Address(Z_R1, 16), Address(subkeyH), 16);
1885 
1886     // R4+5: data pointer + length
1887     __ z_llgfr(len, blocks);  // Cast to 64-bit.
1888 
1889     // R0: function code
1890     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_GHASH);
1891 
1892     // Compute.
1893     __ z_sllg(len, len, 4);  // In bytes.
1894     __ kimd(data);
1895 
1896     // Copy back result and free parameter block.
1897     __ z_mvc(Address(state), Address(Z_R1), 16);
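    // The XC below zeroes the parameter block (XC of a storage area with itself
    // clears it) before the frame space is released.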
1898     __ z_xc(Address(Z_R1), param_block_size, Address(Z_R1));
1899     __ z_aghi(Z_SP, frame_resize);
1900 
1901     __ z_br(Z_R14);
1902 
1903     return __ addr_at(start_off);
1904   }
1905 
1906 
1907   // Call interface for all SHA* stubs.
1908   //
1909   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
1910   //   Z_ARG2 - current SHA state. Ptr to state area. This area serves as
1911   //            parameter block as required by the crypto instruction.
1912   //   Z_ARG3 - current byte offset in source data block.
1913   //   Z_ARG4 - last byte offset in source data block.
1914   //            (Z_ARG4 - Z_ARG3) gives the #bytes remaining to be processed.
1915   //
1916   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
1917   //
1918   //   A few notes on the call interface:
1919   //    - All stubs, whether they are single-block or multi-block, are assumed to
1920   //      digest an integer multiple of the data block length of data. All data
1921   //      blocks are digested using the intermediate message digest (KIMD) instruction.
1922   //      Special end processing, as done by the KLMD instruction, seems to be
1923   //      emulated by the calling code.
1924   //
1925   //    - Z_ARG1 addresses the first byte of source data. The offset (Z_ARG3) is
1926   //      already accounted for.
1927   //
1928   //    - The current SHA state (the intermediate message digest value) is contained
1929   //      in an area addressed by Z_ARG2. The area size depends on the SHA variant
1930   //      and is accessible via the enum VM_Version::MsgDigest::_SHA<n>_parmBlk_I
1931   //
1932   //    - The single-block stub is expected to digest exactly one data block, starting
1933   //      at the address passed in Z_ARG1.
1934   //
1935   //    - The multi-block stub is expected to digest all data blocks which start in
1936   //      the offset interval [srcOff(Z_ARG3), srcLimit(Z_ARG4)). The exact difference
1937   //      (srcLimit-srcOff), rounded up to the next multiple of the data block length,
1938   //      gives the number of blocks to digest. It must be assumed that the calling code
1939   //      provides for a large enough source data buffer.
1940   //
1941   // Compute SHA-1 function.
1942   address generate_SHA1_stub(bool multiBlock, const char* name) {
1943     __ align(CodeEntryAlignment);
1944     StubCodeMark mark(this, "StubRoutines", name);
1945     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
1946 
1947     const Register srcBuff        = Z_ARG1; // Points to first block to process (offset already added).
1948     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter for kimd register pairs.
1949     const Register srcOff         = Z_ARG3; // int
1950     const Register srcLimit       = Z_ARG4; // Only passed in multiBlock case. int
1951 
1952     const Register SHAState_local = Z_R1;
1953     const Register SHAState_save  = Z_ARG3;
1954     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
1955     Label useKLMD, rtn;
1956 
1957     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA1);   // function code
1958     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
1959 
1960     if (multiBlock) {  // Process everything from offset to limit.
1961 
      // The following description is valid if we get a raw (unprocessed) source data buffer,
      // spanning the range [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
      // the calling convention for these stubs is different. We leave the description in
      // to inform the reader what must be happening, hidden in the calling code.
      //
      // The data block to be processed can have arbitrary length, i.e. its length does not
      // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
      // two different paths. If the length is an integer multiple, we use KIMD, which saves
      // us copying the SHA state back and forth. If the length is not a multiple of the
      // data block size, we copy the SHA state to the stack, execute a KLMD instruction
      // on it, and copy the result back to the caller's SHA state location.
1973 
1974       // Total #srcBuff blocks to process.
1975       if (VM_Version::has_DistinctOpnds()) {
1976         __ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
1977         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
1978         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
1979         __ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
1980         __ z_llgfr(srcBufLen, srcBufLen);      // Cast to 64-bit.
1981       } else {
1982         __ z_lgfr(srcBufLen, srcLimit);        // Exact difference. srcLimit passed as int.
        __ z_sgfr(srcBufLen, srcOff);          // srcOff passed as int, now properly cast to long.
1984         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
1985         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
1986         __ z_lgr(srcLimit, srcOff);            // SrcLimit temporarily holds return value.
1987         __ z_agr(srcLimit, srcBufLen);
1988       }
1989 
1990       // Integral #blocks to digest?
1991       // As a result of the calculations above, srcBufLen MUST be an integer
1992       // multiple of _SHA1_dataBlk, or else we are in big trouble.
1993       // We insert an asm_assert into the KLMD case to guard against that.
1994       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);
1995       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
1996 
1997       // Process all full blocks.
1998       __ kimd(srcBuff);
1999 
2000       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2001     } else {  // Process one data block only.
2002       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA1_dataBlk);   // #srcBuff bytes to process
2003       __ kimd(srcBuff);
2004       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA1_dataBlk, srcOff);            // Offset of first unprocessed byte in buffer. No 32 to 64 bit extension needed.
2005     }
2006 
2007     __ bind(rtn);
2008     __ z_br(Z_R14);
2009 
2010     if (multiBlock) {
2011       __ bind(useKLMD);
2012 
2013 #if 1
      // Security net: this stub is believed to be called for full-sized data blocks only.
      // NOTE: The following code is believed to be correct, but it is not tested.
      __ stop_static("SHA-1 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2017 #endif
2018     }
2019 
2020     return __ addr_at(start_off);
2021   }
2022 
2023   // Compute SHA-256 function.
2024   address generate_SHA256_stub(bool multiBlock, const char* name) {
2025     __ align(CodeEntryAlignment);
2026     StubCodeMark mark(this, "StubRoutines", name);
2027     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2028 
2029     const Register srcBuff        = Z_ARG1;
2030     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2031     const Register SHAState_local = Z_R1;
2032     const Register SHAState_save  = Z_ARG3;
2033     const Register srcOff         = Z_ARG3;
2034     const Register srcLimit       = Z_ARG4;
2035     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2036     Label useKLMD, rtn;
2037 
2038     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA256); // function code
2039     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2040 
2041     if (multiBlock) {  // Process everything from offset to limit.
      // The following description is valid if we get a raw (unprocessed) source data buffer,
      // spanning the range [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
      // the calling convention for these stubs is different. We leave the description in
      // to inform the reader what must be happening, hidden in the calling code.
      //
      // The data block to be processed can have arbitrary length, i.e. its length does not
      // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
      // two different paths. If the length is an integer multiple, we use KIMD, which saves
      // us copying the SHA state back and forth. If the length is not a multiple of the
      // data block size, we copy the SHA state to the stack, execute a KLMD instruction
      // on it, and copy the result back to the caller's SHA state location.
2053 
2054       // total #srcBuff blocks to process
2055       if (VM_Version::has_DistinctOpnds()) {
2056         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2057         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2058         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2059         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2060         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2061       } else {
2062         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2063         __ z_sgfr(srcBufLen, srcOff);
2064         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2065         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2066         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2067         __ z_agr(srcLimit, srcBufLen);
2068       }
2069 
2070       // Integral #blocks to digest?
2071       // As a result of the calculations above, srcBufLen MUST be an integer
      // multiple of _SHA256_dataBlk, or else we are in big trouble.
2073       // We insert an asm_assert into the KLMD case to guard against that.
2074       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1);
2075       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2076 
2077       // Process all full blocks.
2078       __ kimd(srcBuff);
2079 
2080       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2081     } else {  // Process one data block only.
2082       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA256_dataBlk); // #srcBuff bytes to process
2083       __ kimd(srcBuff);
2084       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA256_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2085     }
2086 
2087     __ bind(rtn);
2088     __ z_br(Z_R14);
2089 
2090     if (multiBlock) {
2091       __ bind(useKLMD);
2092 #if 1
      // Security net: this stub is believed to be called for full-sized data blocks only.
      // NOTE:
      //   The following code is believed to be correct, but it is not tested.
2096       __ stop_static("SHA256 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2097 #endif
2098     }
2099 
2100     return __ addr_at(start_off);
2101   }
2102 
2103   // Compute SHA-512 function.
2104   address generate_SHA512_stub(bool multiBlock, const char* name) {
2105     __ align(CodeEntryAlignment);
2106     StubCodeMark mark(this, "StubRoutines", name);
2107     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2108 
2109     const Register srcBuff        = Z_ARG1;
2110     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2111     const Register SHAState_local = Z_R1;
2112     const Register SHAState_save  = Z_ARG3;
2113     const Register srcOff         = Z_ARG3;
2114     const Register srcLimit       = Z_ARG4;
2115     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2116     Label useKLMD, rtn;
2117 
2118     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA512); // function code
2119     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2120 
2121     if (multiBlock) {  // Process everything from offset to limit.
      // The following description is valid if we get a raw (unprocessed) source data buffer,
      // spanning the range [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
      // the calling convention for these stubs is different. We leave the description in
      // to inform the reader what must be happening, hidden in the calling code.
      //
      // The data block to be processed can have arbitrary length, i.e. its length does not
      // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
      // two different paths. If the length is an integer multiple, we use KIMD, which saves
      // us copying the SHA state back and forth. If the length is not a multiple of the
      // data block size, we copy the SHA state to the stack, execute a KLMD instruction
      // on it, and copy the result back to the caller's SHA state location.
2133 
2134       // total #srcBuff blocks to process
2135       if (VM_Version::has_DistinctOpnds()) {
2136         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2137         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2138         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2139         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2140         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2141       } else {
2142         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2143         __ z_sgfr(srcBufLen, srcOff);
2144         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2145         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2146         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2147         __ z_agr(srcLimit, srcBufLen);
2148       }
2149 
      // Integral #blocks to digest?
      // As a result of the calculations above, srcBufLen MUST be an integer
      // multiple of _SHA512_dataBlk, or else we are in big trouble.
2153       // We insert an asm_assert into the KLMD case to guard against that.
2154       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1);
2155       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2156 
2157       // Process all full blocks.
2158       __ kimd(srcBuff);
2159 
2160       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2161     } else {  // Process one data block only.
2162       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA512_dataBlk); // #srcBuff bytes to process
2163       __ kimd(srcBuff);
2164       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA512_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2165     }
2166 
2167     __ bind(rtn);
2168     __ z_br(Z_R14);
2169 
2170     if (multiBlock) {
2171       __ bind(useKLMD);
2172 #if 1
      // Security net: this stub is believed to be called for full-sized data blocks only.
      // NOTE:
      //   The following code is believed to be correct, but it is not tested.
2176       __ stop_static("SHA512 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2177 #endif
2178     }
2179 
2180     return __ addr_at(start_off);
2181   }
2182 
2183 
2184   /**
2185    *  Arguments:
2186    *
2187    * Inputs:
2188    *   Z_ARG1    - int   crc
2189    *   Z_ARG2    - byte* buf
2190    *   Z_ARG3    - int   length (of buffer)
2191    *
2192    * Result:
2193    *   Z_RET     - int   crc result
2194    **/
2195   // Compute CRC function (generic, for all polynomials).
2196   void generate_CRC_updateBytes(const char* name, Register table, bool invertCRC) {
2197 
2198     // arguments to kernel_crc32:
2199     Register       crc     = Z_ARG1;  // Current checksum, preset by caller or result from previous call, int.
2200     Register       data    = Z_ARG2;  // source byte array
2201     Register       dataLen = Z_ARG3;  // #bytes to process, int
2202 //    Register       table   = Z_ARG4;  // crc table address. Preloaded and passed in by caller.
2203     const Register t0      = Z_R10;   // work reg for kernel* emitters
2204     const Register t1      = Z_R11;   // work reg for kernel* emitters
2205     const Register t2      = Z_R12;   // work reg for kernel* emitters
2206     const Register t3      = Z_R13;   // work reg for kernel* emitters
2207 
2208     assert_different_registers(crc, data, dataLen, table);
2209 
    // dataLen is passed as an int, not as the long required by the C calling convention;
    // zero-extend it here. crc is used as an int throughout.
2212     __ z_llgfr(dataLen, dataLen);
2213 
    __ resize_frame(-(6*8), Z_R0, true); // Resize frame to provide add'l space to spill registers Z_R10..Z_R13.
    __ z_stmg(Z_R10, Z_R13, 1*8, Z_SP);  // Spill regs Z_R10..Z_R13 to make them available as work registers.
    __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, invertCRC);
    __ z_lmg(Z_R10, Z_R13, 1*8, Z_SP);   // Restore regs Z_R10..Z_R13 from the stack.
    __ resize_frame(+(6*8), Z_R0, true); // Revert the frame resize.
2219 
2220     __ z_llgfr(Z_RET, crc);  // Updated crc is function result. No copying required, just zero upper 32 bits.
2221     __ z_br(Z_R14);          // Result already in Z_RET == Z_ARG1.
2222   }
2223 
2224 
2225   // Compute CRC32 function.
2226   address generate_CRC32_updateBytes(const char* name) {
2227     __ align(CodeEntryAlignment);
2228     StubCodeMark mark(this, "StubRoutines", name);
2229     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2230 
2231     assert(UseCRC32Intrinsics, "should not generate this stub (%s) with CRC32 intrinsics disabled", name);
2232 
2233     BLOCK_COMMENT("CRC32_updateBytes {");
2234     Register       table   = Z_ARG4;  // crc32 table address.
2235     StubRoutines::zarch::generate_load_crc_table_addr(_masm, table);
2236 
2237     generate_CRC_updateBytes(name, table, true);
2238     BLOCK_COMMENT("} CRC32_updateBytes");
2239 
2240     return __ addr_at(start_off);
2241   }
2242 
2243 
2244   // Compute CRC32C function.
2245   address generate_CRC32C_updateBytes(const char* name) {
2246     __ align(CodeEntryAlignment);
2247     StubCodeMark mark(this, "StubRoutines", name);
2248     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2249 
2250     assert(UseCRC32CIntrinsics, "should not generate this stub (%s) with CRC32C intrinsics disabled", name);
2251 
2252     BLOCK_COMMENT("CRC32C_updateBytes {");
2253     Register       table   = Z_ARG4;  // crc32c table address.
2254     StubRoutines::zarch::generate_load_crc32c_table_addr(_masm, table);
2255 
2256     generate_CRC_updateBytes(name, table, false);
2257     BLOCK_COMMENT("} CRC32C_updateBytes");
2258 
2259     return __ addr_at(start_off);
2260   }
2261 
2262 
2263   // Arguments:
2264   //   Z_ARG1    - x address
2265   //   Z_ARG2    - x length
2266   //   Z_ARG3    - y address
2267   //   Z_ARG4    - y length
2268   //   Z_ARG5    - z address
2269   //   160[Z_SP] - z length
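  //
  // Hedged Java-level view of the contract: x and y are big-endian jint magnitudes
  // of xlen/ylen words, and z receives the xlen+ylen word product, as in
  // BigInteger.multiplyToLen. A schoolbook sketch (assuming z is zero-initialized;
  // illustrative only, not the emitted algorithm):
  //
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     uint64_t carry = 0;
  //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
  //       uint64_t product = (uint64_t)x[i] * y[j] + z[k] + carry;
  //       z[k]  = (uint32_t)product;
  //       carry = product >> 32;
  //     }
  //     z[i] = (uint32_t)carry;
  //   }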
2270   address generate_multiplyToLen() {
2271     __ align(CodeEntryAlignment);
2272     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2273 
2274     address start = __ pc();
2275 
2276     const Register x    = Z_ARG1;
2277     const Register xlen = Z_ARG2;
2278     const Register y    = Z_ARG3;
2279     const Register ylen = Z_ARG4;
2280     const Register z    = Z_ARG5;
2281     // zlen is passed on the stack:
2282     // Address zlen(Z_SP, _z_abi(remaining_cargs));
2283 
2284     // Next registers will be saved on stack in multiply_to_len().
2285     const Register tmp1 = Z_tmp_1;
2286     const Register tmp2 = Z_tmp_2;
2287     const Register tmp3 = Z_tmp_3;
2288     const Register tmp4 = Z_tmp_4;
2289     const Register tmp5 = Z_R9;
2290 
2291     BLOCK_COMMENT("Entry:");
2292 
2293     __ z_llgfr(xlen, xlen);
2294     __ z_llgfr(ylen, ylen);
2295 
2296     __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5);
2297 
2298     __ z_br(Z_R14);  // Return to caller.
2299 
2300     return start;
2301   }
2302 
2303   void generate_initial() {
    // Generates the initial set of stubs and initializes their entry points.
2305 
2306     // Entry points that exist in all platforms.
2307     // Note: This is code that could be shared among different
2308     // platforms - however the benefit seems to be smaller than the
2309     // disadvantage of having a much more complicated generator
2310     // structure. See also comment in stubRoutines.hpp.
2311     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
2312 
2313     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
2314     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
2315 
2316     // Build this early so it's available for the interpreter.
2317     StubRoutines::_throw_StackOverflowError_entry          =
2318       generate_throw_exception("StackOverflowError throw_exception",
2319                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2320     StubRoutines::_throw_delayed_StackOverflowError_entry  =
2321       generate_throw_exception("delayed StackOverflowError throw_exception",
2322                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
2323 
2324     //----------------------------------------------------------------------
2325     // Entry points that are platform specific.
2326 
2327     if (UseCRC32Intrinsics) {
2328       StubRoutines::_crc_table_adr     = (address)StubRoutines::zarch::_crc_table;
2329       StubRoutines::_updateBytesCRC32  = generate_CRC32_updateBytes("CRC32_updateBytes");
2330     }
2331 
2332     if (UseCRC32CIntrinsics) {
2333       StubRoutines::_crc32c_table_addr = (address)StubRoutines::zarch::_crc32c_table;
2334       StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
2335     }
2336 
    // Compact string intrinsics: translate table for the string-inflate intrinsic. Used by the TROT instruction.
2338     StubRoutines::zarch::_trot_table_addr = (address)StubRoutines::zarch::_trot_table;
2339   }
2340 
2341 
2342   void generate_all() {
2343     // Generates all stubs and initializes the entry points.
2344 
2345     StubRoutines::zarch::_partial_subtype_check            = generate_partial_subtype_check();
2346 
2347     // These entry points require SharedInfo::stack0 to be set up in non-core builds.
2348     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
2349     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
2350     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2351 
2352     // Support for verify_oop (must happen after universe_init).
2353     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop_subroutine();
2354 
2355     // Arraycopy stubs used by compilers.
2356     generate_arraycopy_stubs();
2357 
2358     // safefetch stubs
2359     generate_safefetch("SafeFetch32", sizeof(int),      &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, &StubRoutines::_safefetch32_continuation_pc);
2360     generate_safefetch("SafeFetchN",  sizeof(intptr_t), &StubRoutines::_safefetchN_entry,  &StubRoutines::_safefetchN_fault_pc,  &StubRoutines::_safefetchN_continuation_pc);
2361 
2362     // Generate AES intrinsics code.
2363     if (UseAESIntrinsics) {
2364       StubRoutines::_aescrypt_encryptBlock = generate_AES_encryptBlock("AES_encryptBlock");
2365       StubRoutines::_aescrypt_decryptBlock = generate_AES_decryptBlock("AES_decryptBlock");
2366       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_AES_encrypt("AES_encryptBlock_chaining");
2367       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_AES_decrypt("AES_decryptBlock_chaining");
2368     }
2369 
2370     // Generate GHASH intrinsics code
2371     if (UseGHASHIntrinsics) {
2372       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
2373     }
2374 
2375     // Generate SHA1/SHA256/SHA512 intrinsics code.
2376     if (UseSHA1Intrinsics) {
2377       StubRoutines::_sha1_implCompress     = generate_SHA1_stub(false,   "SHA1_singleBlock");
2378       StubRoutines::_sha1_implCompressMB   = generate_SHA1_stub(true,    "SHA1_multiBlock");
2379     }
2380     if (UseSHA256Intrinsics) {
2381       StubRoutines::_sha256_implCompress   = generate_SHA256_stub(false, "SHA256_singleBlock");
2382       StubRoutines::_sha256_implCompressMB = generate_SHA256_stub(true,  "SHA256_multiBlock");
2383     }
2384     if (UseSHA512Intrinsics) {
2385       StubRoutines::_sha512_implCompress   = generate_SHA512_stub(false, "SHA512_singleBlock");
2386       StubRoutines::_sha512_implCompressMB = generate_SHA512_stub(true,  "SHA512_multiBlock");
2387     }
2388 
2389 #ifdef COMPILER2
2390     if (UseMultiplyToLenIntrinsic) {
2391       StubRoutines::_multiplyToLen = generate_multiplyToLen();
2392     }
2393     if (UseMontgomeryMultiplyIntrinsic) {
2394       StubRoutines::_montgomeryMultiply
2395         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
2396     }
2397     if (UseMontgomerySquareIntrinsic) {
2398       StubRoutines::_montgomerySquare
2399         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
2400     }
2401 #endif
2402   }
2403 
2404  public:
2405   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2406     // Replace the standard masm with a special one:
2407     _masm = new MacroAssembler(code);
2408 
2409     _stub_count = !all ? 0x100 : 0x200;
2410     if (all) {
2411       generate_all();
2412     } else {
2413       generate_initial();
2414     }
2415   }
2416 
2417  private:
2418   int _stub_count;
2419   void stub_prolog(StubCodeDesc* cdesc) {
2420 #ifdef ASSERT
2421     // Put extra information in the stub code, to make it more readable.
2422     // Write the high part of the address.
2423     // [RGV] Check if there is a dependency on the size of this prolog.
2424     __ emit_32((intptr_t)cdesc >> 32);
2425     __ emit_32((intptr_t)cdesc);
2426     __ emit_32(++_stub_count);
2427 #endif
2428     align(true);
2429   }
2430 
2431   void align(bool at_header = false) {
2432     // z/Architecture cache line size is 256 bytes.
2433     // There is no obvious benefit in aligning stub
2434     // code to cache lines. Use CodeEntryAlignment instead.
2435     const unsigned int icache_line_size      = CodeEntryAlignment;
2436     const unsigned int icache_half_line_size = MIN2<unsigned int>(32, CodeEntryAlignment);
2437 
2438     if (at_header) {
2439       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
2440         __ emit_16(0);
2441       }
2442     } else {
2443       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
2444         __ z_nop();
2445       }
2446     }
2447   }
2448 
2449 };
2450 
2451 void StubGenerator_generate(CodeBuffer* code, bool all) {
2452   StubGenerator g(code, all);
2453 }