1 /*
   2  * Copyright (c) 2016, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2016, 2017, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "registerSaver_s390.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "interpreter/interp_masm.hpp"
  33 #include "nativeInst_s390.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 
  45 // Declaration and definition of StubGenerator (no .hpp file).
  46 // For a more detailed description of the stub routine structure
  47 // see the comment in stubRoutines.hpp.
  48 
  49 #ifdef PRODUCT
  50 #define __ _masm->
  51 #else
  52 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
  53 #endif
  54 
  55 #define BLOCK_COMMENT(str) if (PrintAssembly) __ block_comment(str)
  56 #define BIND(label)        bind(label); BLOCK_COMMENT(#label ":")
  57 
  58 // -----------------------------------------------------------------------
  59 // Stub Code definitions
  60 
  61 class StubGenerator: public StubCodeGenerator {
  62  private:
  63 
  64   //----------------------------------------------------------------------
  65   // Call stubs are used to call Java from C.
  66 
  67   //
  68   // Arguments:
  69   //
  70   //   R2        - call wrapper address     : address
  71   //   R3        - result                   : intptr_t*
  72   //   R4        - result type              : BasicType
  73   //   R5        - method                   : Method*
  74   //   R6        - frame mgr entry point    : address
  75   //   [SP+160]  - parameter block          : intptr_t*
  76   //   [SP+172]  - parameter count in words : int
  77   //   [SP+176]  - thread                   : Thread*
  78   //
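       //   (Illustrative sketch, not authoritative.) On the C++ side this stub is
       //   reached via StubRoutines::call_stub(); its function type (CallStub, see
       //   stubRoutines.hpp) corresponds roughly to
       //
       //     void call_stub(address   call_wrapper,    // -> R2
       //                    intptr_t* result,          // -> R3
       //                    BasicType result_type,     // -> R4
       //                    Method*   method,          // -> R5
       //                    address   entry_point,     // -> R6
       //                    intptr_t* parameters,      // -> [SP+160]
       //                    int       parameter_words, // -> [SP+172]
       //                    Thread*   thread);         // -> [SP+176]
       //
       //   The first five arguments arrive in registers, the remaining ones in the
       //   caller's argument area, as required by the z/Architecture C calling
       //   convention.
       //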
  79   address generate_call_stub(address& return_address) {
  80     // Set up a new C frame, copy Java arguments, call frame manager
  81     // or native_entry, and process result.
  82 
  83     StubCodeMark mark(this, "StubRoutines", "call_stub");
  84     address start = __ pc();
  85 
  86     Register r_arg_call_wrapper_addr   = Z_ARG1;
  87     Register r_arg_result_addr         = Z_ARG2;
  88     Register r_arg_result_type         = Z_ARG3;
  89     Register r_arg_method              = Z_ARG4;
  90     Register r_arg_entry               = Z_ARG5;
  91 
  92     // Offsets relative to the entry frame pointer (= SP at entry).
  93     #define d_arg_thread 176
  94     #define d_arg_argument_addr 160
  95     #define d_arg_argument_count (168+4)
  96 
  97     Register r_entryframe_fp           = Z_tmp_1;
  98     Register r_top_of_arguments_addr   = Z_ARG4;
  99     Register r_new_arg_entry = Z_R14;
 100 
 101     // macros for frame offsets
 102     #define call_wrapper_address_offset \
 103                _z_entry_frame_locals_neg(call_wrapper_address)
 104     #define result_address_offset \
 105               _z_entry_frame_locals_neg(result_address)
 106     #define result_type_offset \
 107               _z_entry_frame_locals_neg(result_type)
 108     #define arguments_tos_address_offset \
 109               _z_entry_frame_locals_neg(arguments_tos_address)
 110 
 111     {
 112       //
 113       // STACK on entry to call_stub:
 114       //
 115       //     F1      [C_FRAME]
 116       //            ...
 117       //
 118 
 119       Register r_argument_addr              = Z_tmp_3;
 120       Register r_argumentcopy_addr          = Z_tmp_4;
 121       Register r_argument_size_in_bytes     = Z_ARG5;
 122       Register r_frame_size                 = Z_R1;
 123 
 124       Label arguments_copied;
 125 
 126       // Save non-volatile registers to ABI of caller frame.
 127       BLOCK_COMMENT("save registers, push frame {");
 128       __ z_stmg(Z_R6, Z_R14, 16, Z_SP);
 129       __ z_std(Z_F8, 96, Z_SP);
 130       __ z_std(Z_F9, 104, Z_SP);
 131       __ z_std(Z_F10, 112, Z_SP);
 132       __ z_std(Z_F11, 120, Z_SP);
 133       __ z_std(Z_F12, 128, Z_SP);
 134       __ z_std(Z_F13, 136, Z_SP);
 135       __ z_std(Z_F14, 144, Z_SP);
 136       __ z_std(Z_F15, 152, Z_SP);
 137 
 138       //
 139       // Push ENTRY_FRAME including arguments:
 140       //
 141       //     F0      [TOP_IJAVA_FRAME_ABI]
 142       //             [outgoing Java arguments]
 143       //             [ENTRY_FRAME_LOCALS]
 144       //     F1      [C_FRAME]
 145       //             ...
 146       //
 147 
 148       // Calculate new frame size and push frame.
 149       #define abi_plus_locals_size \
 150                 (frame::z_top_ijava_frame_abi_size + frame::z_entry_frame_locals_size)
 151       if (abi_plus_locals_size % BytesPerWord == 0) {
 152         // Preload constant part of frame size.
 153         __ load_const_optimized(r_frame_size, -abi_plus_locals_size/BytesPerWord);
 154         // Keep copy of our frame pointer (caller's SP).
 155         __ z_lgr(r_entryframe_fp, Z_SP);
 156         // Add space required by arguments to frame size.
 157         __ z_slgf(r_frame_size, d_arg_argument_count, Z_R0, Z_SP);
 158         // Move Z_ARG5 early, it will be used as a local.
 159         __ z_lgr(r_new_arg_entry, r_arg_entry);
 160         // Convert frame size from words to bytes.
 161         __ z_sllg(r_frame_size, r_frame_size, LogBytesPerWord);
 162         __ push_frame(r_frame_size, r_entryframe_fp,
 163                       false/*don't copy SP*/, true /*frame size sign inverted*/);
 164       } else {
 165         guarantee(false, "frame sizes should be multiples of word size (BytesPerWord)");
 166       }
 167       BLOCK_COMMENT("} save, push");
 168 
 169       // Load argument registers for call.
 170       BLOCK_COMMENT("prepare/copy arguments {");
 171       __ z_lgr(Z_method, r_arg_method);
 172       __ z_lg(Z_thread, d_arg_thread, r_entryframe_fp);
 173 
 174       // Calculate top_of_arguments_addr which will be tos (not prepushed) later.
 175       // Simply use SP + frame::z_top_ijava_frame_abi_size - BytesPerWord.
 176       __ add2reg(r_top_of_arguments_addr,
 177                  frame::z_top_ijava_frame_abi_size - BytesPerWord, Z_SP);
 178 
 179       // Initialize call_stub locals (step 1).
 180       if ((call_wrapper_address_offset + BytesPerWord == result_address_offset) &&
 181           (result_address_offset + BytesPerWord == result_type_offset)          &&
 182           (result_type_offset + BytesPerWord == arguments_tos_address_offset)) {
 183 
 184         __ z_stmg(r_arg_call_wrapper_addr, r_top_of_arguments_addr,
 185                   call_wrapper_address_offset, r_entryframe_fp);
 186       } else {
 187         __ z_stg(r_arg_call_wrapper_addr,
 188                  call_wrapper_address_offset, r_entryframe_fp);
 189         __ z_stg(r_arg_result_addr,
 190                  result_address_offset, r_entryframe_fp);
 191         __ z_stg(r_arg_result_type,
 192                  result_type_offset, r_entryframe_fp);
 193         __ z_stg(r_top_of_arguments_addr,
 194                  arguments_tos_address_offset, r_entryframe_fp);
 195       }
 196 
 197       // Copy Java arguments.
 198 
 199       // Any arguments to copy?
 200       __ load_and_test_int2long(Z_R1, Address(r_entryframe_fp, d_arg_argument_count));
 201       __ z_bre(arguments_copied);
 202 
 203       // Prepare loop and copy arguments in reverse order.
 204       {
 205         // Calculate argument size in bytes.
 206         __ z_sllg(r_argument_size_in_bytes, Z_R1, LogBytesPerWord);
 207 
 208         // Get addr of first incoming Java argument.
 209         __ z_lg(r_argument_addr, d_arg_argument_addr, r_entryframe_fp);
 210 
 211         // Let r_argumentcopy_addr point to last outgoing Java argument.
 212         __ add2reg(r_argumentcopy_addr, BytesPerWord, r_top_of_arguments_addr); // = Z_SP+160 effectively.
 213 
 214         // Let r_argument_addr point to last incoming Java argument.
 215         __ add2reg_with_index(r_argument_addr, -BytesPerWord,
 216                               r_argument_size_in_bytes, r_argument_addr);
 217 
 218         // Now loop while Z_R1 > 0 and copy arguments.
 219         {
 220           Label next_argument;
 221           __ bind(next_argument);
 222           // Mem-mem move.
 223           __ z_mvc(0, BytesPerWord-1, r_argumentcopy_addr, 0, r_argument_addr);
 224           __ add2reg(r_argument_addr,    -BytesPerWord);
 225           __ add2reg(r_argumentcopy_addr, BytesPerWord);
 226           __ z_brct(Z_R1, next_argument);
 227         }
 228       }  // End of argument copy loop.
 229 
 230       __ bind(arguments_copied);
 231     }
 232     BLOCK_COMMENT("} arguments");
 233 
 234     BLOCK_COMMENT("call {");
 235     {
 236       // Call frame manager or native entry.
 237 
 238       //
 239       // Register state on entry to frame manager / native entry:
 240       //
 241       //   Z_ARG1 = r_top_of_arguments_addr  - intptr_t *sender tos (prepushed)
 242       //                                       Lesp = (SP) + copied_arguments_offset - 8
 243       //   Z_method                          - method
 244       //   Z_thread                          - JavaThread*
 245       //
 246 
 247       // Here, the usual SP is the initial_caller_sp.
 248       __ z_lgr(Z_R10, Z_SP);
 249 
 250       // Z_esp points to the slot below the last argument.
 251       __ z_lgr(Z_esp, r_top_of_arguments_addr);
 252 
 253       //
 254       // Stack on entry to frame manager / native entry:
 255       //
 256       //     F0      [TOP_IJAVA_FRAME_ABI]
 257       //             [outgoing Java arguments]
 258       //             [ENTRY_FRAME_LOCALS]
 259       //     F1      [C_FRAME]
 260       //             ...
 261       //
 262 
 263       // Do a light-weight C-call here, r_new_arg_entry holds the address
 264       // of the interpreter entry point (frame manager or native entry)
 265       // and save runtime-value of return_pc in return_address
 266       // (call by reference argument).
 267       return_address = __ call_stub(r_new_arg_entry);
 268     }
 269     BLOCK_COMMENT("} call");
 270 
 271     {
 272       BLOCK_COMMENT("restore registers {");
 273       // Returned from frame manager or native entry.
 274       // Now pop frame, process result, and return to caller.
 275 
 276       //
 277       // Stack on exit from frame manager / native entry:
 278       //
 279       //     F0      [ABI]
 280       //             ...
 281       //             [ENTRY_FRAME_LOCALS]
 282       //     F1      [C_FRAME]
 283       //             ...
 284       //
 285       // Just pop the topmost frame ...
 286       //
 287 
 288       // Restore frame pointer.
 289       __ z_lg(r_entryframe_fp, _z_abi(callers_sp), Z_SP);
 290       // Pop frame. Done here to minimize stalls.
 291       __ pop_frame();
 292 
 293       // Reload some volatile registers which we've spilled before the call
 294       // to frame manager / native entry.
 295       // Access all locals via frame pointer, because we know nothing about
 296       // the topmost frame's size.
 297       __ z_lg(r_arg_result_addr, result_address_offset, r_entryframe_fp);
 298       __ z_lg(r_arg_result_type, result_type_offset, r_entryframe_fp);
 299 
 300       // Restore non-volatiles.
 301       __ z_lmg(Z_R6, Z_R14, 16, Z_SP);
 302       __ z_ld(Z_F8, 96, Z_SP);
 303       __ z_ld(Z_F9, 104, Z_SP);
 304       __ z_ld(Z_F10, 112, Z_SP);
 305       __ z_ld(Z_F11, 120, Z_SP);
 306       __ z_ld(Z_F12, 128, Z_SP);
 307       __ z_ld(Z_F13, 136, Z_SP);
 308       __ z_ld(Z_F14, 144, Z_SP);
 309       __ z_ld(Z_F15, 152, Z_SP);
 310       BLOCK_COMMENT("} restore");
 311 
 312       //
 313       // Stack on exit from call_stub:
 314       //
 315       //     0       [C_FRAME]
 316       //             ...
 317       //
 318       // No call_stub frames left.
 319       //
 320 
 321       // All non-volatiles have been restored at this point!!
 322 
 323       //------------------------------------------------------------------------
 324       // The following code makes some assumptions on the T_<type> enum values.
 325       // The enum is defined in globalDefinitions.hpp.
 326       // The validity of the assumptions is tested as far as possible.
 327       //   The assigned values should not be shuffled
 328       //   T_BOOLEAN==4    - lowest used enum value
 329       //   T_NARROWOOP==16 - largest used enum value
 330       //------------------------------------------------------------------------
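           // Dispatch sketch (illustrative only): the result handlers generated
           // below form a table of equally sized, 8-byte code blocks indexed by
           // result type:
           //
           //   handler_pc = firstHandler + (result_type - T_BOOLEAN) * 8
           //
           // e.g. for T_INT (== T_BOOLEAN + 6) the branch lands at
           // firstHandler + 48, which stores the 32-bit result and returns.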
 331       BLOCK_COMMENT("process result {");
 332       Label firstHandler;
 333       int   handlerLen = 8;
 334 #ifdef ASSERT
 335       char  assertMsg[] = "check BasicType definition in globalDefinitions.hpp";
 336       __ z_chi(r_arg_result_type, T_BOOLEAN);
 337       __ asm_assert_low(assertMsg, 0x0234);
 338       __ z_chi(r_arg_result_type, T_NARROWOOP);
 339       __ asm_assert_high(assertMsg, 0x0235);
 340 #endif
 341       __ add2reg(r_arg_result_type, -T_BOOLEAN);          // Remove offset.
 342       __ z_larl(Z_R1, firstHandler);                      // location of first handler
 343       __ z_sllg(r_arg_result_type, r_arg_result_type, 3); // Each handler is 8 bytes long.
 344       __ z_bc(MacroAssembler::bcondAlways, 0, r_arg_result_type, Z_R1);
 345 
 346       __ align(handlerLen);
 347       __ bind(firstHandler);
 348       // T_BOOLEAN:
 349         guarantee(T_BOOLEAN == 4, "check BasicType definition in globalDefinitions.hpp");
 350         __ z_st(Z_RET, 0, r_arg_result_addr);
 351         __ z_br(Z_R14); // Return to caller.
 352         __ align(handlerLen);
 353       // T_CHAR:
 354         guarantee(T_CHAR == T_BOOLEAN+1, "check BasicType definition in globalDefinitions.hpp");
 355         __ z_st(Z_RET, 0, r_arg_result_addr);
 356         __ z_br(Z_R14); // Return to caller.
 357         __ align(handlerLen);
 358       // T_FLOAT:
 359         guarantee(T_FLOAT == T_CHAR+1, "check BasicType definition in globalDefinitions.hpp");
 360         __ z_ste(Z_FRET, 0, r_arg_result_addr);
 361         __ z_br(Z_R14); // Return to caller.
 362         __ align(handlerLen);
 363       // T_DOUBLE:
 364         guarantee(T_DOUBLE == T_FLOAT+1, "check BasicType definition in globalDefinitions.hpp");
 365         __ z_std(Z_FRET, 0, r_arg_result_addr);
 366         __ z_br(Z_R14); // Return to caller.
 367         __ align(handlerLen);
 368       // T_BYTE:
 369         guarantee(T_BYTE == T_DOUBLE+1, "check BasicType definition in globalDefinitions.hpp");
 370         __ z_st(Z_RET, 0, r_arg_result_addr);
 371         __ z_br(Z_R14); // Return to caller.
 372         __ align(handlerLen);
 373       // T_SHORT:
 374         guarantee(T_SHORT == T_BYTE+1, "check BasicType definition in globalDefinitions.hpp");
 375         __ z_st(Z_RET, 0, r_arg_result_addr);
 376         __ z_br(Z_R14); // Return to caller.
 377         __ align(handlerLen);
 378       // T_INT:
 379         guarantee(T_INT == T_SHORT+1, "check BasicType definition in globalDefinitions.hpp");
 380         __ z_st(Z_RET, 0, r_arg_result_addr);
 381         __ z_br(Z_R14); // Return to caller.
 382         __ align(handlerLen);
 383       // T_LONG:
 384         guarantee(T_LONG == T_INT+1, "check BasicType definition in globalDefinitions.hpp");
 385         __ z_stg(Z_RET, 0, r_arg_result_addr);
 386         __ z_br(Z_R14); // Return to caller.
 387         __ align(handlerLen);
 388       // T_OBJECT:
 389         guarantee(T_OBJECT == T_LONG+1, "check BasicType definition in globalDefinitions.hpp");
 390         __ z_stg(Z_RET, 0, r_arg_result_addr);
 391         __ z_br(Z_R14); // Return to caller.
 392         __ align(handlerLen);
 393       // T_ARRAY:
 394         guarantee(T_ARRAY == T_OBJECT+1, "check BasicType definition in globalDefinitions.hpp");
 395         __ z_stg(Z_RET, 0, r_arg_result_addr);
 396         __ z_br(Z_R14); // Return to caller.
 397         __ align(handlerLen);
 398       // T_VOID:
 399         guarantee(T_VOID == T_ARRAY+1, "check BasicType definition in globalDefinitions.hpp");
 400         __ z_stg(Z_RET, 0, r_arg_result_addr);
 401         __ z_br(Z_R14); // Return to caller.
 402         __ align(handlerLen);
 403       // T_ADDRESS:
 404         guarantee(T_ADDRESS == T_VOID+1, "check BasicType definition in globalDefinitions.hpp");
 405         __ z_stg(Z_RET, 0, r_arg_result_addr);
 406         __ z_br(Z_R14); // Return to caller.
 407         __ align(handlerLen);
 408       // T_NARROWOOP:
 409         guarantee(T_NARROWOOP == T_ADDRESS+1, "check BasicType definition in globalDefinitions.hpp");
 410         __ z_st(Z_RET, 0, r_arg_result_addr);
 411         __ z_br(Z_R14); // Return to caller.
 412         __ align(handlerLen);
 413       BLOCK_COMMENT("} process result");
 414     }
 415     return start;
 416   }
 417 
 418   // Return point for a Java call if there's an exception thrown in
 419   // Java code. The exception is caught and transformed into a
 420   // pending exception stored in JavaThread that can be tested from
 421   // within the VM.
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424 
 425     address start = __ pc();
 426 
 427     //
 428     // Registers alive
 429     //
 430     //   Z_thread
 431     //   Z_ARG1 - address of pending exception
 432     //   Z_ARG2 - return address in call stub
 433     //
 434 
 435     const Register exception_file = Z_R0;
 436     const Register exception_line = Z_R1;
 437 
 438     __ load_const_optimized(exception_file, (void*)__FILE__);
 439     __ load_const_optimized(exception_line, (void*)__LINE__);
 440 
 441     __ z_stg(Z_ARG1, thread_(pending_exception));
 442     // Store into `char *'.
 443     __ z_stg(exception_file, thread_(exception_file));
 444     // Store into `int'.
 445     __ z_st(exception_line, thread_(exception_line));
 446 
 447     // Complete return to VM.
 448     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 449 
 450     // Continue in call stub.
 451     __ z_br(Z_ARG2);
 452 
 453     return start;
 454   }
 455 
 456   // Continuation point for runtime calls returning with a pending
 457   // exception. The pending exception check happened in the runtime
 458   // or native call stub. The pending exception in Thread is
 459   // converted into a Java-level exception.
 460   //
 461   // Read:
 462   //   Z_R14: pc the runtime library callee wants to return to.
 463   //   Since the exception occurred in the callee, the return pc
 464   //   from the point of view of Java is the exception pc.
 465   //
 466   // Invalidate:
 467   //   Volatile registers (except below).
 468   //
 469   // Update:
 470   //   Z_ARG1: exception
 471   //   (Z_R14 is unchanged and is live out).
 472   //
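       // Rough flow (an illustrative sketch of the code below):
       //
       //   handler = SharedRuntime::exception_handler_for_return_address(thread, Z_R14);
       //   Z_ARG1  = thread->pending_exception();   // exception oop
       //   Z_ARG2  = Z_R14;                         // exception pc
       //   thread->clear_pending_exception();
       //   goto handler;
       //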
 473   address generate_forward_exception() {
 474     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 475     address start = __ pc();
 476 
 477     #define pending_exception_offset in_bytes(Thread::pending_exception_offset())
 478 #ifdef ASSERT
 479     // Get pending exception oop.
 480     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 481 
 482     // Make sure that this code is only executed if there is a pending exception.
 483     {
 484       Label L;
 485       __ z_ltgr(Z_ARG1, Z_ARG1);
 486       __ z_brne(L);
 487       __ stop("StubRoutines::forward exception: no pending exception (1)");
 488       __ bind(L);
 489     }
 490 
 491     __ verify_oop(Z_ARG1, "StubRoutines::forward exception: not an oop");
 492 #endif
 493 
 494     __ z_lgr(Z_ARG2, Z_R14); // Copy exception pc into Z_ARG2.
 495     __ save_return_pc();
 496     __ push_frame_abi160(0);
 497     // Find exception handler.
 498     __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address),
 499                     Z_thread,
 500                     Z_ARG2);
 501     // Copy handler's address.
 502     __ z_lgr(Z_R1, Z_RET);
 503     __ pop_frame();
 504     __ restore_return_pc();
 505 
 506     // Set up the arguments for the exception handler:
 507     // - Z_ARG1: exception oop
 508     // - Z_ARG2: exception pc
 509 
 510     // Load pending exception oop.
 511     __ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
 512 
 513     // The exception pc is the return address in the caller;
 514     // load it into Z_ARG2.
 515     __ z_lgr(Z_ARG2, Z_R14);
 516 
 517 #ifdef ASSERT
 518     // Make sure exception is set.
 519     { Label L;
 520       __ z_ltgr(Z_ARG1, Z_ARG1);
 521       __ z_brne(L);
 522       __ stop("StubRoutines::forward exception: no pending exception (2)");
 523       __ bind(L);
 524     }
 525 #endif
 526     // Clear the pending exception.
 527     __ clear_mem(Address(Z_thread, pending_exception_offset), sizeof(void *));
 528     // Jump to exception handler
 529     __ z_br(Z_R1 /*handler address*/);
 530 
 531     return start;
 532 
 533     #undef pending_exception_offset
 534   }
 535 
 536   // Continuation point for throwing of implicit exceptions that are
 537   // not handled in the current activation. Fabricates an exception
 538   // oop and initiates normal exception dispatching in this
 539   // frame. Only callee-saved registers are preserved (through the
 540   // normal RegisterMap handling). If the compiler
 541   // needs all registers to be preserved between the fault point and
 542   // the exception handler then it must assume responsibility for that
 543   // in AbstractCompiler::continuation_for_implicit_null_exception or
 544   // continuation_for_implicit_division_by_zero_exception. All other
 545   // implicit exceptions (e.g., NullPointerException or
 546   // AbstractMethodError on entry) are either at call sites or
 547   // otherwise assume that stack unwinding will be initiated, so
 548   // caller saved registers were assumed volatile in the compiler.
 549 
 550   // Note that we generate only this stub into a RuntimeStub, because
 551   // it needs to be properly traversed and ignored during GC, so we
 552   // change the meaning of the "__" macro within this method.
 553 
 554   // Note: the routine set_pc_not_at_call_for_caller in
 555   // SharedRuntime.cpp requires that this code be generated into a
 556   // RuntimeStub.
 557 #undef __
 558 #define __ masm->
 559 
 560   address generate_throw_exception(const char* name, address runtime_entry,
 561                                    bool restore_saved_exception_pc,
 562                                    Register arg1 = noreg, Register arg2 = noreg) {
 563     assert_different_registers(arg1, Z_R0_scratch);  // would be destroyed by push_frame()
 564     assert_different_registers(arg2, Z_R0_scratch);  // would be destroyed by push_frame()
 565 
 566     int insts_size = 256;
 567     int locs_size  = 0;
 568     CodeBuffer      code(name, insts_size, locs_size);
 569     MacroAssembler* masm = new MacroAssembler(&code);
 570     int framesize_in_bytes;
 571     address start = __ pc();
 572 
 573     __ save_return_pc();
 574     framesize_in_bytes = __ push_frame_abi160(0);
 575 
 576     address frame_complete_pc = __ pc();
 577     if (restore_saved_exception_pc) {
 578       __ unimplemented("StubGenerator::throw_exception", 74);
 579     }
 580 
 581     // Note that we always have a runtime stub frame on the top of stack at this point.
 582     __ get_PC(Z_R1);
 583     __ set_last_Java_frame(/*sp*/Z_SP, /*pc*/Z_R1);
 584 
 585     // Do the call.
 586     BLOCK_COMMENT("call runtime_entry");
 587     __ call_VM_leaf(runtime_entry, Z_thread, arg1, arg2);
 588 
 589     __ reset_last_Java_frame();
 590 
 591 #ifdef ASSERT
 592     // Make sure that this code is only executed if there is a pending exception.
 593     { Label L;
 594       __ z_lg(Z_R0,
 595                 in_bytes(Thread::pending_exception_offset()),
 596                 Z_thread);
 597       __ z_ltgr(Z_R0, Z_R0);
 598       __ z_brne(L);
 599       __ stop("StubRoutines::throw_exception: no pending exception");
 600       __ bind(L);
 601     }
 602 #endif
 603 
 604     __ pop_frame();
 605     __ restore_return_pc();
 606 
 607     __ load_const_optimized(Z_R1, StubRoutines::forward_exception_entry());
 608     __ z_br(Z_R1);
 609 
 610     RuntimeStub* stub =
 611       RuntimeStub::new_runtime_stub(name, &code,
 612                                     frame_complete_pc - start,
 613                                     framesize_in_bytes/wordSize,
 614                                     NULL /*oop_maps*/, false);
 615 
 616     return stub->entry_point();
 617   }
 618 
 619 #undef __
 620 #ifdef PRODUCT
 621 #define __ _masm->
 622 #else
 623 #define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
 624 #endif
 625 
  626   // Support for uint StubRoutines::zarch::partial_subtype_check(Klass* sub,
  627   //                                                             Klass* super);
 628   //
 629   // Arguments:
 630   //   ret  : Z_RET, returned
 631   //   sub  : Z_ARG2, argument, not changed
 632   //   super: Z_ARG3, argument, not changed
 633   //
 634   //   raddr: Z_R14, blown by call
 635   //
 636   address generate_partial_subtype_check() {
 637     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
 638     Label miss;
 639 
 640     address start = __ pc();
 641 
 642     const Register Rsubklass   = Z_ARG2; // subklass
 643     const Register Rsuperklass = Z_ARG3; // superklass
 644 
 645     // No args, but tmp registers that are killed.
 646     const Register Rlength     = Z_ARG4; // cache array length
 647     const Register Rarray_ptr  = Z_ARG5; // Current value from cache array.
 648 
 649     if (UseCompressedOops) {
 650       assert(Universe::heap() != NULL, "java heap must be initialized to generate partial_subtype_check stub");
 651     }
 652 
 653     // Always take the slow path (see SPARC).
 654     __ check_klass_subtype_slow_path(Rsubklass, Rsuperklass,
 655                                      Rarray_ptr, Rlength, NULL, &miss);
 656 
 657     // Match falls through here.
 658     __ clear_reg(Z_RET);               // Zero indicates a match. Set EQ flag in CC.
 659     __ z_br(Z_R14);
 660 
 661     __ BIND(miss);
 662     __ load_const_optimized(Z_RET, 1); // One indicates a miss.
 663     __ z_ltgr(Z_RET, Z_RET);           // Set NE flag in CC.
 664     __ z_br(Z_R14);
 665 
 666     return start;
 667   }
 668 
 669   // Return address of code to be called from code generated by
 670   // MacroAssembler::verify_oop.
 671   //
 672   // Don't generate, rather use C++ code.
 673   address generate_verify_oop_subroutine() {
 674     // Don't generate a StubCodeMark, because no code is generated!
 675     // Generating the mark triggers notifying the oprofile jvmti agent
 676     // about the dynamic code generation, but the stub without
 677     // code (code_size == 0) confuses opjitconv
 678     // StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 679 
 680     address start = 0;
 681     return start;
 682   }
 683 
 684   // This is to test that the count register contains a positive int value.
 685   // Required because C2 does not respect int to long conversion for stub calls.
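       //  Example (illustrative): a properly zero-extended count such as 5 leaves
       //  zero in Z_R0 after the 64-bit arithmetic shift by 31, so the assert
       //  passes; a negative or not-zero-extended value (any of the upper 33 bits
       //  set) leaves a non-zero value and fires the assert.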
 686   void assert_positive_int(Register count) {
 687 #ifdef ASSERT
 688     __ z_srag(Z_R0, count, 31);  // Just leave the sign (must be zero) in Z_R0.
 689     __ asm_assert_eq("missing zero extend", 0xAFFE);
 690 #endif
 691   }
 692 
 693   //  Generate overlap test for array copy stubs.
 694   //  If no actual overlap is detected, control is transferred to the
 695   //  "normal" copy stub (entry address passed in disjoint_copy_target).
 696   //  Otherwise, execution continues with the code generated by the
 697   //  caller of array_overlap_test.
 698   //
 699   //  Input:
 700   //    Z_ARG1    - from
 701   //    Z_ARG2    - to
 702   //    Z_ARG3    - element count
 703   void array_overlap_test(address disjoint_copy_target, int log2_elem_size) {
 704     __ MacroAssembler::compare_and_branch_optimized(Z_ARG2, Z_ARG1, Assembler::bcondNotHigh,
 705                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 706 
 707     Register index = Z_ARG3;
 708     if (log2_elem_size > 0) {
 709       __ z_sllg(Z_R1, Z_ARG3, log2_elem_size);  // byte count
 710       index = Z_R1;
 711     }
 712     __ add2reg_with_index(Z_R1, 0, index, Z_ARG1);  // First byte after "from" range.
 713 
 714     __ MacroAssembler::compare_and_branch_optimized(Z_R1, Z_ARG2, Assembler::bcondNotHigh,
 715                                                     disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
 716 
 717     // Destructive overlap: let caller generate code for that.
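         //
         // In other words (an illustrative sketch of the test above), control
         // falls through to the caller's code exactly when
         //
         //   (to > from) && (to < from + (count << log2_elem_size))
         //
         // e.g. from == 1000, to == 1016, count == 8 ints (32 bytes):
         // 1000 < 1016 < 1032, so the destructive-overlap path is taken.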
 718   }
 719 
 720   //  Generate stub for disjoint array copy. If "aligned" is true, the
 721   //  "from" and "to" addresses are assumed to be heapword aligned.
 722   //
 723   //  Arguments for generated stub:
 724   //      from:  Z_ARG1
 725   //      to:    Z_ARG2
 726   //      count: Z_ARG3 treated as signed
 727   void generate_disjoint_copy(bool aligned, int element_size,
 728                               bool branchToEnd,
 729                               bool restoreArgs) {
 730     // This is the zarch specific stub generator for general array copy tasks.
 731     // It has the following prereqs and features:
 732     //
 733     // - No destructive overlap allowed (else unpredictable results).
 734     // - Destructive overlap does not exist if the leftmost byte of the target
 735     //   does not coincide with any of the source bytes (except the leftmost).
 736     //
 737     //   Register usage upon entry:
 738     //      Z_ARG1 == Z_R2 :   address of source array
 739     //      Z_ARG2 == Z_R3 :   address of target array
 740     //      Z_ARG3 == Z_R4 :   length of operands (# of elements on entry)
 741     //
 742     // Register usage within the generator:
 743     // - Z_R0 and Z_R1 are KILLed by the stub routine (target addr/len).
 744     //                 Used as pair register operand in complex moves, scratch registers anyway.
 745     // - Z_R5 is KILLed by the stub routine (source register pair addr/len) (even/odd reg).
 746     //                  Same as R0/R1, but no scratch register.
 747     // - Z_ARG1, Z_ARG2, Z_ARG3 are USEd but preserved by the stub routine,
 748     //                          but they might get temporarily overwritten.
 749 
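         // Strategy selection below, summarized (a sketch derived from the code;
         // thresholds are in bytes, i.e. scaled by the element size):
         //
         //   bytes <= 256         : one executed MVC (doMVC), or a jump into the
         //                          unrolled MVC block list (doMVCUnrolled) for
         //                          DW-sized elements
         //   256 < bytes <= 4096  : loop of 256-byte MVCs (doMVCLOOP), with the
         //                          trailing bytes handled by doMVCgeneral
         //   bytes > 4096         : MVCLE, with cache-bypassing store hints
         //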
 750     Register  save_reg    = Z_ARG4;   // (= Z_R5), holds original target operand address for restore.
 751 
 752     {
 753       Register   llen_reg = Z_R1;     // Holds left operand len (odd reg).
 754       Register  laddr_reg = Z_R0;     // Holds left operand addr (even reg), overlaps with data_reg.
 755       Register   rlen_reg = Z_R5;     // Holds right operand len (odd reg), overlaps with save_reg.
 756       Register  raddr_reg = Z_R4;     // Holds right operand addr (even reg), overlaps with len_reg.
 757 
 758       Register   data_reg = Z_R0;     // Holds copied data chunk in alignment process and copy loop.
 759       Register    len_reg = Z_ARG3;   // Holds operand len (#elements at entry, #bytes shortly after).
 760       Register    dst_reg = Z_ARG2;   // Holds left (target)  operand addr.
 761       Register    src_reg = Z_ARG1;   // Holds right (source) operand addr.
 762 
 763       Label     doMVCLOOP, doMVCLOOPcount, doMVCLOOPiterate;
 764       Label     doMVCUnrolled;
 765       NearLabel doMVC,  doMVCgeneral, done;
 766       Label     MVC_template;
 767       address   pcMVCblock_b, pcMVCblock_e;
 768 
 769       bool      usedMVCLE       = true;
 770       bool      usedMVCLOOP     = true;
 771       bool      usedMVCUnrolled = false;
 772       bool      usedMVC         = false;
 773       bool      usedMVCgeneral  = false;
 774 
 775       int       stride;
 776       Register  stride_reg;
 777       Register  ix_reg;
 778 
 779       assert((element_size<=256) && (256%element_size == 0), "element size must be <= 256, power of 2");
 780       unsigned int log2_size = exact_log2(element_size);
 781 
 782       switch (element_size) {
 783         case 1:  BLOCK_COMMENT("ARRAYCOPY DISJOINT byte  {"); break;
 784         case 2:  BLOCK_COMMENT("ARRAYCOPY DISJOINT short {"); break;
 785         case 4:  BLOCK_COMMENT("ARRAYCOPY DISJOINT int   {"); break;
 786         case 8:  BLOCK_COMMENT("ARRAYCOPY DISJOINT long  {"); break;
 787         default: BLOCK_COMMENT("ARRAYCOPY DISJOINT       {"); break;
 788       }
 789 
 790       assert_positive_int(len_reg);
 791 
 792       BLOCK_COMMENT("preparation {");
 793 
 794       // No copying if len <= 0.
 795       if (branchToEnd) {
 796         __ compare64_and_branch(len_reg, (intptr_t) 0, Assembler::bcondNotHigh, done);
 797       } else {
 798         if (VM_Version::has_CompareBranch()) {
 799           __ z_cgib(len_reg, 0, Assembler::bcondNotHigh, 0, Z_R14);
 800         } else {
 801           __ z_ltgr(len_reg, len_reg);
 802           __ z_bcr(Assembler::bcondNotPositive, Z_R14);
 803         }
 804       }
 805 
 806       // Prefetch just one cache line. Speculative opt for short arrays.
 807       // Do not use Z_R1 in prefetch. Is undefined here.
 808       if (VM_Version::has_Prefetch()) {
 809         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
 810         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
 811       }
 812 
 813       BLOCK_COMMENT("} preparation");
 814 
 815       // Save args only if really needed.
 816       // Keep len test local to branch. Is generated only once.
 817 
 818       BLOCK_COMMENT("mode selection {");
 819 
 820       // Special handling for arrays with only a few elements.
 821       // Nothing fancy: just an executed MVC.
 822       if (log2_size > 0) {
 823         __ z_sllg(Z_R1, len_reg, log2_size); // Remember #bytes in Z_R1.
 824       }
 825       if (element_size != 8) {
 826         __ z_cghi(len_reg, 256/element_size);
 827         __ z_brnh(doMVC);
 828         usedMVC = true;
 829       }
 830       if (element_size == 8) { // Long and oop arrays are always aligned.
 831         __ z_cghi(len_reg, 256/element_size);
 832         __ z_brnh(doMVCUnrolled);
 833         usedMVCUnrolled = true;
 834       }
 835 
 836       // Prefetch another cache line. We, for sure, have more than one line to copy.
 837       if (VM_Version::has_Prefetch()) {
 838         __ z_pfd(0x01, 256, Z_R0, src_reg); // Fetch access.
 839         __ z_pfd(0x02, 256, Z_R0, dst_reg); // Store access.
 840       }
 841 
 842       if (restoreArgs) {
 843         // Remember entry value of ARG2 to restore all arguments later from that knowledge.
 844         __ z_lgr(save_reg, dst_reg);
 845       }
 846 
 847       __ z_cghi(len_reg, 4096/element_size);
 848       if (log2_size == 0) {
 849         __ z_lgr(Z_R1, len_reg); // Init Z_R1 with #bytes
 850       }
 851       __ z_brnh(doMVCLOOP);
 852 
 853       // Fall through to MVCLE case.
 854 
 855       BLOCK_COMMENT("} mode selection");
 856 
 857       // MVCLE: for long arrays
 858       //   DW aligned: Best performance for sizes > 4kBytes.
 859       //   unaligned:  Least complex for sizes > 256 bytes.
 860       if (usedMVCLE) {
 861         BLOCK_COMMENT("mode MVCLE {");
 862 
 863         // Setup registers for mvcle.
 864         //__ z_lgr(llen_reg, len_reg);// r1 <- r4  #bytes already in Z_R1, aka llen_reg.
 865         __ z_lgr(laddr_reg, dst_reg); // r0 <- r3
 866         __ z_lgr(raddr_reg, src_reg); // r4 <- r2
 867         __ z_lgr(rlen_reg, llen_reg); // r5 <- r1
 868 
 869         __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb0);    // special: bypass cache
 870         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb8); // special: Hold data in cache.
 871         // __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0);
 872 
 873         if (restoreArgs) {
 874           // MVCLE updates the source (Z_R4,Z_R5) and target (Z_R0,Z_R1) register pairs.
 875           // Dst_reg (Z_ARG2) and src_reg (Z_ARG1) are left untouched. No restore required.
 876           // Len_reg (Z_ARG3) is destroyed and must be restored.
 877           __ z_slgr(laddr_reg, dst_reg);    // copied #bytes
 878           if (log2_size > 0) {
 879             __ z_srag(Z_ARG3, laddr_reg, log2_size); // Convert back to #elements.
 880           } else {
 881             __ z_lgr(Z_ARG3, laddr_reg);
 882           }
 883         }
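             // Illustrative check of the restore above: after copying, say, 100 longs,
             // MVCLE leaves laddr_reg == dst + 800; subtracting dst_reg yields the 800
             // copied bytes, and the shift by log2_size (3) restores the original
             // element count of 100 in Z_ARG3.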
 884         if (branchToEnd) {
 885           __ z_bru(done);
 886         } else {
 887           __ z_br(Z_R14);
 888         }
 889         BLOCK_COMMENT("} mode MVCLE");
 890       }
 891       // No fallthru possible here.
 892 
 893       //  MVCUnrolled: for short, aligned arrays.
 894 
 895       if (usedMVCUnrolled) {
 896         BLOCK_COMMENT("mode MVC unrolled {");
 897         stride = 8;
 898 
 899         // Generate unrolled MVC instructions.
 900         for (int ii = 32; ii > 1; ii--) {
 901           __ z_mvc(0, ii * stride-1, dst_reg, 0, src_reg); // ii*8 byte copy
 902           if (branchToEnd) {
 903             __ z_bru(done);
 904           } else {
 905             __ z_br(Z_R14);
 906           }
 907         }
 908 
 909         pcMVCblock_b = __ pc();
 910         __ z_mvc(0, 1 * stride-1, dst_reg, 0, src_reg); // 8 byte copy
 911         if (branchToEnd) {
 912           __ z_bru(done);
 913         } else {
 914           __ z_br(Z_R14);
 915         }
 916 
 917         pcMVCblock_e = __ pc();
 918         Label MVC_ListEnd;
 919         __ bind(MVC_ListEnd);
 920 
 921         // This is an absolute fast path:
 922         // - Array len in bytes must not be greater than 256.
 923         // - Array len in bytes must be an integer multiple of DW
 924         //   to save expensive handling of trailing bytes.
 925         // - Argument restore is not done,
 926         //   i.e. previous code must not alter arguments (this code doesn't either).
 927 
 928         __ bind(doMVCUnrolled);
 929 
 930         // Avoid mul, prefer shift where possible.
 931         // Combine shift right (for #DW) with shift left (for block size).
 932         // Set CC for zero test below (asm_assert).
 933         // Note: #bytes comes in Z_R1, #DW in len_reg.
 934         unsigned int MVCblocksize    = pcMVCblock_e - pcMVCblock_b;
 935         unsigned int logMVCblocksize = 0xffffffffU; // Pacify compiler ("used uninitialized" warning).
 936 
 937         if (log2_size > 0) { // Len was scaled into Z_R1.
 938           switch (MVCblocksize) {
 939 
 940             case  8: logMVCblocksize = 3;
 941                      __ z_ltgr(Z_R0, Z_R1); // #bytes is index
 942                      break;                 // reasonable size, use shift
 943 
 944             case 16: logMVCblocksize = 4;
 945                      __ z_slag(Z_R0, Z_R1, logMVCblocksize-log2_size);
 946                      break;                 // reasonable size, use shift
 947 
 948             default: logMVCblocksize = 0;
 949                      __ z_ltgr(Z_R0, len_reg); // #DW for mul
 950                      break;                 // all other sizes: use mul
 951           }
 952         } else {
 953           guarantee(log2_size, "doMVCUnrolled: only for DW entities");
 954         }
 955 
 956         // This test (and branch) is redundant. Previous code makes sure that
 957         //  - element count > 0
 958         //  - element size == 8.
 959         // Thus, len reg should never be zero here. We insert an asm_assert() here,
 960         // just to double-check and to be on the safe side.
 961         __ asm_assert(false, "zero len cannot occur", 99);
 962 
 963         __ z_larl(Z_R1, MVC_ListEnd);        // Get addr of last instr block.
 964         // Avoid mul, prefer shift where possible.
 965         if (logMVCblocksize == 0) {
 966           __ z_mghi(Z_R0, MVCblocksize);
 967         }
 968         __ z_slgr(Z_R1, Z_R0);
 969         __ z_br(Z_R1);
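             // Worked example (a sketch): with branchToEnd == false, each unrolled
             // block above is a 6-byte MVC plus a 2-byte BR, i.e. MVCblocksize == 8
             // and logMVCblocksize == 3. For a 3-DW copy Z_R0 holds 24 (the scaled
             // byte count), so the computed target is MVC_ListEnd - 24, which is
             // exactly the block that copies 24 bytes.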
 970         BLOCK_COMMENT("} mode MVC unrolled");
 971       }
 972       // No fallthru possible here.
 973 
 974       // MVC execute template
 975       // Must always generate. Usage may be switched on below.
 976       // There is no suitable place after here to put the template.
 977       __ bind(MVC_template);
 978       __ z_mvc(0,0,dst_reg,0,src_reg);      // Instr template, never exec directly!
 979 
 980 
 981       // MVC Loop: for medium-sized arrays
 982 
 983       // Only for DW aligned arrays (src and dst).
 984       // #bytes to copy must be at least 256!!!
 985       // Non-aligned cases handled separately.
 986       stride     = 256;
 987       stride_reg = Z_R1;   // Holds #bytes when control arrives here.
 988       ix_reg     = Z_ARG3; // Alias for len_reg.
 989 
 990 
 991       if (usedMVCLOOP) {
 992         BLOCK_COMMENT("mode MVC loop {");
 993         __ bind(doMVCLOOP);
 994 
 995         __ z_lcgr(ix_reg, Z_R1);         // Ix runs from -(n-2)*stride to 1*stride (inclusive).
 996         __ z_llill(stride_reg, stride);
 997         __ add2reg(ix_reg, 2*stride);    // Thus: increment ix by 2*stride.
 998 
 999         __ bind(doMVCLOOPiterate);
1000           __ z_mvc(0, stride-1, dst_reg, 0, src_reg);
1001           __ add2reg(dst_reg, stride);
1002           __ add2reg(src_reg, stride);
1003           __ bind(doMVCLOOPcount);
1004           __ z_brxlg(ix_reg, stride_reg, doMVCLOOPiterate);
1005 
1006         // Don't use add2reg() here, since we must set the condition code!
1007         __ z_aghi(ix_reg, -2*stride);       // Compensate incr from above: zero diff means "all copied".
1008 
1009         if (restoreArgs) {
1010           __ z_lcgr(Z_R1, ix_reg);          // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1011           __ z_brnz(doMVCgeneral);          // We're not done yet, ix_reg is not zero.
1012 
1013           // ARG1, ARG2, and ARG3 were altered by the code above, so restore them building on save_reg.
1014           __ z_slgr(dst_reg, save_reg);     // copied #bytes
1015           __ z_slgr(src_reg, dst_reg);      // = ARG1 (now restored)
1016           if (log2_size) {
1017             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3.
1018           } else {
1019             __ z_lgr(Z_ARG3, dst_reg);
1020           }
1021           __ z_lgr(Z_ARG2, save_reg);       // ARG2 now restored.
1022 
1023           if (branchToEnd) {
1024             __ z_bru(done);
1025           } else {
1026             __ z_br(Z_R14);
1027           }
1028 
1029         } else {
1030           if (branchToEnd) {
1031             __ z_brz(done);                        // CC set by aghi instr.
1032           } else {
1033             __ z_bcr(Assembler::bcondZero, Z_R14); // We're all done if zero.
1034           }
1035 
1036           __ z_lcgr(Z_R1, ix_reg);    // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
1037           // __ z_bru(doMVCgeneral);  // fallthru
1038         }
1039         usedMVCgeneral = true;
1040         BLOCK_COMMENT("} mode MVC loop");
1041       }
1042       // Fallthru to doMVCgeneral
1043 
1044       // MVCgeneral: for short, unaligned arrays, after other copy operations
1045 
1046       // Somewhat expensive due to use of EX instruction, but simple.
1047       if (usedMVCgeneral) {
1048         BLOCK_COMMENT("mode MVC general {");
1049         __ bind(doMVCgeneral);
1050 
1051         __ add2reg(len_reg, -1, Z_R1);             // Get #bytes-1 for EXECUTE.
1052         if (VM_Version::has_ExecuteExtensions()) {
1053           __ z_exrl(len_reg, MVC_template);        // Execute MVC with variable length.
1054         } else {
1055           __ z_larl(Z_R1, MVC_template);           // Get addr of instr template.
1056           __ z_ex(len_reg, 0, Z_R0, Z_R1);         // Execute MVC with variable length.
1057         }                                          // penalty: 9 ticks
1058 
1059         if (restoreArgs) {
1060           // ARG1, ARG2, and ARG3 were altered by code executed before, so restore them building on save_reg
1061           __ z_slgr(dst_reg, save_reg);            // Copied #bytes without the "doMVCgeneral" chunk
1062           __ z_slgr(src_reg, dst_reg);             // = ARG1 (now restored), was not advanced for "doMVCgeneral" chunk
1063           __ add2reg_with_index(dst_reg, 1, len_reg, dst_reg); // Len of executed MVC was not accounted for, yet.
1064           if (log2_size) {
1065             __ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3
1066           } else {
1067              __ z_lgr(Z_ARG3, dst_reg);
1068           }
1069           __ z_lgr(Z_ARG2, save_reg);              // ARG2 now restored.
1070         }
1071 
1072         if (usedMVC) {
1073           if (branchToEnd) {
1074             __ z_bru(done);
1075           } else {
1076             __ z_br(Z_R14);
1077           }
1078         } else {
1079           if (!branchToEnd) __ z_br(Z_R14);
1080         }
1081         BLOCK_COMMENT("} mode MVC general");
1082       }
1083       // Fallthru possible if following block not generated.
1084 
1085       // MVC: for short, unaligned arrays
1086 
1087       // Somewhat expensive due to use of EX instruction, but simple. penalty: 9 ticks.
1088       // Differs from doMVCgeneral in reconstruction of ARG2, ARG3, and ARG4.
1089       if (usedMVC) {
1090         BLOCK_COMMENT("mode MVC {");
1091         __ bind(doMVC);
1092 
1093         // get #bytes-1 for EXECUTE
1094         if (log2_size) {
1095           __ add2reg(Z_R1, -1);                // Length was scaled into Z_R1.
1096         } else {
1097           __ add2reg(Z_R1, -1, len_reg);       // Length was not scaled.
1098         }
1099 
1100         if (VM_Version::has_ExecuteExtensions()) {
1101           __ z_exrl(Z_R1, MVC_template);       // Execute MVC with variable length.
1102         } else {
1103           __ z_lgr(Z_R0, Z_R5);                // Save ARG4, may be unnecessary.
1104           __ z_larl(Z_R5, MVC_template);       // Get addr of instr template.
1105           __ z_ex(Z_R1, 0, Z_R0, Z_R5);        // Execute MVC with variable length.
1106           __ z_lgr(Z_R5, Z_R0);                // Restore ARG4, may be unnecessary.
1107         }
1108 
1109         if (!branchToEnd) {
1110           __ z_br(Z_R14);
1111         }
1112         BLOCK_COMMENT("} mode MVC");
1113       }
1114 
1115       __ bind(done);
1116 
1117       switch (element_size) {
1118         case 1:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT byte "); break;
1119         case 2:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT short"); break;
1120         case 4:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT int  "); break;
1121         case 8:  BLOCK_COMMENT("} ARRAYCOPY DISJOINT long "); break;
1122         default: BLOCK_COMMENT("} ARRAYCOPY DISJOINT      "); break;
1123       }
1124     }
1125   }
1126 
1127   // Generate stub for conjoint array copy. If "aligned" is true, the
1128   // "from" and "to" addresses are assumed to be heapword aligned.
1129   //
1130   // Arguments for generated stub:
1131   //   from:  Z_ARG1
1132   //   to:    Z_ARG2
1133   //   count: Z_ARG3 treated as signed
1134   void generate_conjoint_copy(bool aligned, int element_size, bool branchToEnd) {
1135 
1136     // This is the zarch specific stub generator for general array copy tasks.
1137     // It has the following prereqs and features:
1138     //
1139     // - Destructive overlap exists and is handled by reverse copy.
1140     // - Destructive overlap exists if the leftmost byte of the target
1141     //   does coincide with any of the source bytes (except the leftmost).
1142     // - Z_R0 and Z_R1 are KILLed by the stub routine (data and stride)
1143     // - Z_ARG1 and Z_ARG2 are USEd but preserved by the stub routine.
1144     // - Z_ARG3 is USEd but preserved by the stub routine.
1145     // - Z_ARG4 is used as index register and is thus KILLed.
1146     //
1147     {
1148       Register stride_reg = Z_R1;     // Stride & compare value in loop (negative element_size).
1149       Register   data_reg = Z_R0;     // Holds value of currently processed element.
1150       Register     ix_reg = Z_ARG4;   // Holds byte index of currently processed element.
1151       Register    len_reg = Z_ARG3;   // Holds length (in #elements) of arrays.
1152       Register    dst_reg = Z_ARG2;   // Holds left  operand addr.
1153       Register    src_reg = Z_ARG1;   // Holds right operand addr.
1154 
1155       assert(256%element_size == 0, "Element size must be power of 2.");
1156       assert(element_size     <= 8, "Can't handle more than DW units.");
1157 
1158       switch (element_size) {
1159         case 1:  BLOCK_COMMENT("ARRAYCOPY CONJOINT byte  {"); break;
1160         case 2:  BLOCK_COMMENT("ARRAYCOPY CONJOINT short {"); break;
1161         case 4:  BLOCK_COMMENT("ARRAYCOPY CONJOINT int   {"); break;
1162         case 8:  BLOCK_COMMENT("ARRAYCOPY CONJOINT long  {"); break;
1163         default: BLOCK_COMMENT("ARRAYCOPY CONJOINT       {"); break;
1164       }
1165 
1166       assert_positive_int(len_reg);
1167 
1168       if (VM_Version::has_Prefetch()) {
1169         __ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
1170         __ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
1171       }
1172 
1173       unsigned int log2_size = exact_log2(element_size);
1174       if (log2_size) {
1175         __ z_sllg(ix_reg, len_reg, log2_size);
1176       } else {
1177         __ z_lgr(ix_reg, len_reg);
1178       }
1179 
1180       // Optimize reverse copy loop.
1181       // Main loop copies DW units which may be unaligned. Unaligned access adds some penalty ticks.
1182       // Unaligned DW access (neither fetch nor store) is DW-atomic, but should be alignment-atomic.
1183       // Preceding the main loop, some bytes are copied to obtain a DW-multiple remaining length.
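           //
           // Worked example (a sketch) for a byte copy of 13 elements: ix starts at
           // 13; the leftover byte at offset 12 is copied first (ix -> 12), the HW
           // step is skipped (bit 2^1 of ix is clear), the word at offsets 8..11 is
           // copied (ix -> 8), and the DW loop then moves the remaining 8 bytes at
           // offsets 0..7.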
1184 
1185       Label countLoop1;
1186       Label copyLoop1;
1187       Label skipBY;
1188       Label skipHW;
1189       int   stride = -8;
1190 
1191       __ load_const_optimized(stride_reg, stride); // Prepare for DW copy loop.
1192 
1193       if (element_size == 8)    // Nothing to do here.
1194         __ z_bru(countLoop1);
1195       else {                    // Do not generate dead code.
1196         __ z_tmll(ix_reg, 7);   // Check the "odd" bits.
1197         __ z_bre(countLoop1);   // There are none, very good!
1198       }
1199 
1200       if (log2_size == 0) {     // Handle leftover Byte.
1201         __ z_tmll(ix_reg, 1);
1202         __ z_bre(skipBY);
1203         __ z_lb(data_reg,   -1, ix_reg, src_reg);
1204         __ z_stcy(data_reg, -1, ix_reg, dst_reg);
1205         __ add2reg(ix_reg, -1); // Decrement delayed to avoid AGI.
1206         __ bind(skipBY);
1207         // fallthru
1208       }
1209       if (log2_size <= 1) {     // Handle leftover HW.
1210         __ z_tmll(ix_reg, 2);
1211         __ z_bre(skipHW);
1212         __ z_lhy(data_reg,  -2, ix_reg, src_reg);
1213         __ z_sthy(data_reg, -2, ix_reg, dst_reg);
1214         __ add2reg(ix_reg, -2); // Decrement delayed to avoid AGI.
1215         __ bind(skipHW);
1216         __ z_tmll(ix_reg, 4);
1217         __ z_bre(countLoop1);
1218         // fallthru
1219       }
1220       if (log2_size <= 2) {     // There are just 4 bytes (left) that need to be copied.
1221         __ z_ly(data_reg,  -4, ix_reg, src_reg);
1222         __ z_sty(data_reg, -4, ix_reg, dst_reg);
1223         __ add2reg(ix_reg, -4); // Decrement delayed to avoid AGI.
1224         __ z_bru(countLoop1);
1225       }
1226 
1227       // Control can never get to here. Never! Never ever!
1228       __ z_illtrap(0x99);
1229       __ bind(copyLoop1);
1230       __ z_lg(data_reg,  0, ix_reg, src_reg);
1231       __ z_stg(data_reg, 0, ix_reg, dst_reg);
1232       __ bind(countLoop1);
1233       __ z_brxhg(ix_reg, stride_reg, copyLoop1);
1234 
1235       if (!branchToEnd)
1236         __ z_br(Z_R14);
1237 
1238       switch (element_size) {
1239         case 1:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT byte "); break;
1240         case 2:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT short"); break;
1241         case 4:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT int  "); break;
1242         case 8:  BLOCK_COMMENT("} ARRAYCOPY CONJOINT long "); break;
1243         default: BLOCK_COMMENT("} ARRAYCOPY CONJOINT      "); break;
1244       }
1245     }
1246   }
1247 
1248   // Generate stub for disjoint byte copy. If "aligned" is true, the
1249   // "from" and "to" addresses are assumed to be heapword aligned.
1250   address generate_disjoint_byte_copy(bool aligned, const char * name) {
1251     StubCodeMark mark(this, "StubRoutines", name);
1252 
1253     // This is the zarch specific stub generator for byte array copy.
1254     // Refer to generate_disjoint_copy for a list of prereqs and features:
1255     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1256     generate_disjoint_copy(aligned, 1, false, false);
1257     return __ addr_at(start_off);
1258   }
1259 
1260 
1261   address generate_disjoint_short_copy(bool aligned, const char * name) {
1262     StubCodeMark mark(this, "StubRoutines", name);
1263     // This is the zarch specific stub generator for short array copy.
1264     // Refer to generate_disjoint_copy for a list of prereqs and features:
1265     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1266     generate_disjoint_copy(aligned, 2, false, false);
1267     return __ addr_at(start_off);
1268   }
1269 
1270 
1271   address generate_disjoint_int_copy(bool aligned, const char * name) {
1272     StubCodeMark mark(this, "StubRoutines", name);
1273     // This is the zarch specific stub generator for int array copy.
1274     // Refer to generate_disjoint_copy for a list of prereqs and features:
1275     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1276     generate_disjoint_copy(aligned, 4, false, false);
1277     return __ addr_at(start_off);
1278   }
1279 
1280 
1281   address generate_disjoint_long_copy(bool aligned, const char * name) {
1282     StubCodeMark mark(this, "StubRoutines", name);
1283     // This is the zarch specific stub generator for long array copy.
1284     // Refer to generate_disjoint_copy for a list of prereqs and features:
1285     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1286     generate_disjoint_copy(aligned, 8, false, false);
1287     return __ addr_at(start_off);
1288   }
1289 
1290 
1291   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1292     StubCodeMark mark(this, "StubRoutines", name);
1293     // This is the zarch specific stub generator for oop array copy.
1294     // Refer to generate_disjoint_copy for a list of prereqs and features.
1295     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1296     unsigned int size      = UseCompressedOops ? 4 : 8;
1297 
1298     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1299     if (dest_uninitialized) {
1300       decorators |= IS_DEST_UNINITIALIZED;
1301     }
1302     if (aligned) {
1303       decorators |= ARRAYCOPY_ALIGNED;
1304     }
1305 
1306     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1307     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1308 
1309     generate_disjoint_copy(aligned, size, true, true);
1310 
1311     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1312 
1313     return __ addr_at(start_off);
1314   }
1315 
1316 
1317   address generate_conjoint_byte_copy(bool aligned, const char * name) {
1318     StubCodeMark mark(this, "StubRoutines", name);
1319     // This is the zarch specific stub generator for overlapping byte array copy.
1320     // Refer to generate_conjoint_copy for a list of prereqs and features:
1321     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1322     address nooverlap_target = aligned ? StubRoutines::arrayof_jbyte_disjoint_arraycopy()
1323                                        : StubRoutines::jbyte_disjoint_arraycopy();
1324 
1325     array_overlap_test(nooverlap_target, 0); // Branch away to nooverlap_target if disjoint.
1326     generate_conjoint_copy(aligned, 1, false);
1327 
1328     return __ addr_at(start_off);
1329   }
1330 
1331 
1332   address generate_conjoint_short_copy(bool aligned, const char * name) {
1333     StubCodeMark mark(this, "StubRoutines", name);
1334     // This is the zarch specific stub generator for overlapping short array copy.
1335     // Refer to generate_conjoint_copy for a list of prereqs and features:
1336     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1337     address nooverlap_target = aligned ? StubRoutines::arrayof_jshort_disjoint_arraycopy()
1338                                        : StubRoutines::jshort_disjoint_arraycopy();
1339 
1340     array_overlap_test(nooverlap_target, 1); // Branch away to nooverlap_target if disjoint.
1341     generate_conjoint_copy(aligned, 2, false);
1342 
1343     return __ addr_at(start_off);
1344   }
1345 
1346   address generate_conjoint_int_copy(bool aligned, const char * name) {
1347     StubCodeMark mark(this, "StubRoutines", name);
1348     // This is the zarch specific stub generator for overlapping int array copy.
1349     // Refer to generate_conjoint_copy for a list of prereqs and features:
1350 
1351     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1352     address nooverlap_target = aligned ? StubRoutines::arrayof_jint_disjoint_arraycopy()
1353                                        : StubRoutines::jint_disjoint_arraycopy();
1354 
1355     array_overlap_test(nooverlap_target, 2); // Branch away to nooverlap_target if disjoint.
1356     generate_conjoint_copy(aligned, 4, false);
1357 
1358     return __ addr_at(start_off);
1359   }
1360 
1361   address generate_conjoint_long_copy(bool aligned, const char * name) {
1362     StubCodeMark mark(this, "StubRoutines", name);
1363     // This is the zarch specific stub generator for overlapping long array copy.
1364     // Refer to generate_conjoint_copy for a list of prereqs and features:
1365 
1366     unsigned int start_off   = __ offset();  // Remember stub start address (is rtn value).
1367     address nooverlap_target = aligned ? StubRoutines::arrayof_jlong_disjoint_arraycopy()
1368                                        : StubRoutines::jlong_disjoint_arraycopy();
1369 
1370     array_overlap_test(nooverlap_target, 3); // Branch away to nooverlap_target if disjoint.
1371     generate_conjoint_copy(aligned, 8, false);
1372 
1373     return __ addr_at(start_off);
1374   }
1375 
1376   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1377     StubCodeMark mark(this, "StubRoutines", name);
1378     // This is the zarch specific stub generator for overlapping oop array copy.
1379     // Refer to generate_conjoint_copy for a list of prereqs and features.
1380     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1381     unsigned int size      = UseCompressedOops ? 4 : 8;
1382     unsigned int shift     = UseCompressedOops ? 2 : 3;
1383 
1384     address nooverlap_target = aligned ? StubRoutines::arrayof_oop_disjoint_arraycopy(dest_uninitialized)
1385                                        : StubRoutines::oop_disjoint_arraycopy(dest_uninitialized);
1386 
1387     // Branch to disjoint_copy (if applicable) before pre_barrier to avoid double pre_barrier.
1388     array_overlap_test(nooverlap_target, shift);  // Branch away to nooverlap_target if disjoint.
1389 
1390     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1391     if (dest_uninitialized) {
1392       decorators |= IS_DEST_UNINITIALIZED;
1393     }
1394     if (aligned) {
1395       decorators |= ARRAYCOPY_ALIGNED;
1396     }
1397 
1398     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1399     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
1400 
1401     generate_conjoint_copy(aligned, size, true);  // Must preserve ARG2, ARG3.
1402 
1403     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
1404 
1405     return __ addr_at(start_off);
1406   }
1407 
1408 
1409   void generate_arraycopy_stubs() {
1410 
    // Note: the disjoint stubs must be generated first, because some of
    // the conjoint stubs use them.
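    // Each conjoint stub fetches the address of its disjoint counterpart
    // (its nooverlap_target) and branches there, via array_overlap_test,
    // when source and destination do not overlap.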
1413     StubRoutines::_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy (false, "jbyte_disjoint_arraycopy");
1414     StubRoutines::_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
1415     StubRoutines::_jint_disjoint_arraycopy       = generate_disjoint_int_copy  (false, "jint_disjoint_arraycopy");
1416     StubRoutines::_jlong_disjoint_arraycopy      = generate_disjoint_long_copy (false, "jlong_disjoint_arraycopy");
1417     StubRoutines::_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (false, "oop_disjoint_arraycopy", false);
1418     StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (false, "oop_disjoint_arraycopy_uninit", true);
1419 
1420     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy (true, "arrayof_jbyte_disjoint_arraycopy");
1421     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
1422     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy  (true, "arrayof_jint_disjoint_arraycopy");
1423     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy (true, "arrayof_jlong_disjoint_arraycopy");
1424     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy  (true, "arrayof_oop_disjoint_arraycopy", false);
1425     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy  (true, "arrayof_oop_disjoint_arraycopy_uninit", true);
1426 
1427     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy (false, "jbyte_arraycopy");
1428     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, "jshort_arraycopy");
1429     StubRoutines::_jint_arraycopy            = generate_conjoint_int_copy  (false, "jint_arraycopy");
1430     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_copy (false, "jlong_arraycopy");
1431     StubRoutines::_oop_arraycopy             = generate_conjoint_oop_copy  (false, "oop_arraycopy", false);
1432     StubRoutines::_oop_arraycopy_uninit      = generate_conjoint_oop_copy  (false, "oop_arraycopy_uninit", true);
1433 
1434     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy (true, "arrayof_jbyte_arraycopy");
1435     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
1436     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy  (true, "arrayof_jint_arraycopy");
1437     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy (true, "arrayof_jlong_arraycopy");
1438     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy  (true, "arrayof_oop_arraycopy", false);
1439     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy  (true, "arrayof_oop_arraycopy_uninit", true);
1440   }
1441 
1442   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
1443 
1444     // safefetch signatures:
1445     //   int      SafeFetch32(int*      adr, int      errValue);
1446     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
1447     //
1448     // arguments:
1449     //   Z_ARG1 = adr
1450     //   Z_ARG2 = errValue
1451     //
1452     // result:
1453     //   Z_RET  = *adr or errValue
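    //
    // Usage sketch (illustration only): callers in the VM do something like
    //   int v = SafeFetch32((int*)addr, /*errValue*/ -1);
    // and compare the result against errValue to detect an inaccessible address.
    // If the load at *fault_pc faults, the signal handler resumes execution at
    // *continuation_pc; Z_ARG2 then still holds errValue, which is returned.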
1454 
1455     StubCodeMark mark(this, "StubRoutines", name);
1456 
1457     // entry point
1458     // Load *adr into Z_ARG2, may fault.
1459     *entry = *fault_pc = __ pc();
1460     switch (size) {
1461       case 4:
1462         // Sign extended int32_t.
1463         __ z_lgf(Z_ARG2, 0, Z_ARG1);
1464         break;
1465       case 8:
1466         // int64_t
1467         __ z_lg(Z_ARG2, 0, Z_ARG1);
1468         break;
1469       default:
1470         ShouldNotReachHere();
1471     }
1472 
1473     // Return errValue or *adr.
1474     *continuation_pc = __ pc();
1475     __ z_lgr(Z_RET, Z_ARG2);
1476     __ z_br(Z_R14);
1477 
1478   }
1479 
1480   // Call interface for AES_encryptBlock, AES_decryptBlock stubs.
1481   //
1482   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
1483   //   Z_ARG2 - destination data block. Ptr to leftmost byte to be stored.
1484   //            For in-place encryption/decryption, ARG1 and ARG2 can point
1485   //            to the same piece of storage.
1486   //   Z_ARG3 - Crypto key address (expanded key). The first n bits of
1487   //            the expanded key constitute the original AES-<n> key (see below).
1488   //
1489   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
1490   //
1491   // Some remarks:
1492   //   The crypto key, as passed from the caller to these encryption stubs,
1493   //   is a so-called expanded key. It is derived from the original key
1494   //   by the Rijndael key schedule, see http://en.wikipedia.org/wiki/Rijndael_key_schedule
1495   //   With the expanded key, the cipher/decipher task is decomposed in
1496   //   multiple, less complex steps, called rounds. Sun SPARC and Intel
1497   //   processors obviously implement support for those less complex steps.
1498   //   z/Architecture provides instructions for full cipher/decipher complexity.
1499   //   Therefore, we need the original, not the expanded key here.
1500   //   Luckily, the first n bits of an AES-<n> expanded key are formed
1501   //   by the original key itself. That takes us out of trouble. :-)
1502   //   The key length (in bytes) relation is as follows:
1503   //     original    expanded   rounds  key bit     keylen
1504   //    key bytes   key bytes            length   in words
1505   //           16         176       11      128         44
1506   //           24         208       13      192         52
1507   //           32         240       15      256         60
1508   //
1509   // The crypto instructions used in the AES* stubs have some specific register requirements.
1510   //   Z_R0   holds the crypto function code. Please refer to the KM/KMC instruction
1511   //          description in the "z/Architecture Principles of Operation" manual for details.
1512   //   Z_R1   holds the parameter block address. The parameter block contains the cryptographic key
1513   //          (KM instruction) and the chaining value (KMC instruction).
1514   //   dst    must designate an even-numbered register, holding the address of the output message.
  //   src    must designate an even/odd register pair, holding the address/length of the original message.
1516 
  // Helper function which generates code to
  //  - load the function code into register fCode (== Z_R0).
  //  - load the data block length (depends on the cipher function) into register srclen.
  //  - is_decipher selects between the cipher and decipher function codes.
1522   void generate_load_AES_fCode(Register keylen, Register fCode, Register srclen, bool is_decipher) {
1523 
1524     BLOCK_COMMENT("Set fCode {"); {
1525       Label fCode_set;
1526       int   mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1527       bool  identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk)
1528                                   && (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1529       // Expanded key length is 44/52/60 * 4 bytes for AES-128/AES-192/AES-256.
1530       __ z_cghi(keylen, 52); // Check only once at the beginning. keylen and fCode may share the same register.
1531 
1532       __ z_lghi(fCode, VM_Version::Cipher::_AES128 + mode);
1533       if (!identical_dataBlk_len) {
1534         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1535       }
1536       __ z_brl(fCode_set);  // keyLen <  52: AES128
1537 
1538       __ z_lghi(fCode, VM_Version::Cipher::_AES192 + mode);
1539       if (!identical_dataBlk_len) {
1540         __ z_lghi(srclen, VM_Version::Cipher::_AES192_dataBlk);
1541       }
1542       __ z_bre(fCode_set);  // keyLen == 52: AES192
1543 
1544       __ z_lghi(fCode, VM_Version::Cipher::_AES256 + mode);
1545       if (!identical_dataBlk_len) {
1546         __ z_lghi(srclen, VM_Version::Cipher::_AES256_dataBlk);
1547       }
      // __ z_brh(fCode_set);  // keyLen >  52: AES256           // fallthru
1549 
1550       __ bind(fCode_set);
1551       if (identical_dataBlk_len) {
1552         __ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
1553       }
1554     }
1555     BLOCK_COMMENT("} Set fCode");
1556   }
1557 
1558   // Push a parameter block for the cipher/decipher instruction on the stack.
1559   // Layout of the additional stack space allocated for AES_cipherBlockChaining:
1560   //
1561   //   |        |
1562   //   +--------+ <-- SP before expansion
1563   //   |        |
1564   //   :        :  alignment loss, 0..(AES_parmBlk_align-8) bytes
1565   //   |        |
1566   //   +--------+
1567   //   |        |
1568   //   :        :  space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_C
1569   //   |        |
1570   //   +--------+ <-- parmBlk, octoword-aligned, start of parameter block
1571   //   |        |
1572   //   :        :  additional stack space for spills etc., size AES_parmBlk_addspace, DW @ Z_SP not usable!!!
1573   //   |        |
1574   //   +--------+ <-- Z_SP after expansion
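  //
  // The doubleword at (parmBlk - 8) spills keylen across the KMC instruction; the
  // doubleword at (parmBlk - 16) holds the caller's Z_SP, so the frame resize can be
  // reverted with a single load (see generate_push_Block and generate_pop_parmBlk).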
1575 
1576   void generate_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
1577                            Register parmBlk, Register keylen, Register fCode, Register cv, Register key) {
1578     const int AES_parmBlk_align    = 32;  // octoword alignment.
1579     const int AES_parmBlk_addspace = 24;  // Must be sufficiently large to hold all spilled registers
1580                                           // (currently 2) PLUS 1 DW for the frame pointer.
1581 
1582     const int cv_len     = dataBlk_len;
1583     const int key_len    = parmBlk_len - cv_len;
1584     // This len must be known at JIT compile time. Only then are we able to recalc the SP before resize.
1585     // We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
1586     const int resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
1587 
1588     // Use parmBlk as temp reg here to hold the frame pointer.
1589     __ resize_frame(-resize_len, parmBlk, true);
1590 
1591     // calculate parmBlk address from updated (resized) SP.
1592     __ add2reg(parmBlk, resize_len - (cv_len + key_len), Z_SP);
1593     __ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
1594 
1595     // There is room for stuff in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
1596     __ z_stg(keylen,  -8, parmBlk);                        // Spill keylen for later use.
1597 
1598     // calculate (SP before resize) from updated SP.
1599     __ add2reg(keylen, resize_len, Z_SP);                  // keylen holds prev SP for now.
1600     __ z_stg(keylen, -16, parmBlk);                        // Spill prev SP for easy revert.
1601 
1602     __ z_mvc(0,      cv_len-1,  parmBlk, 0, cv);     // Copy cv.
1603     __ z_mvc(cv_len, key_len-1, parmBlk, 0, key);    // Copy key.
1604     __ z_lghi(fCode, crypto_fCode);
1605   }
1606 
1607   // NOTE:
1608   //   Before returning, the stub has to copy the chaining value from
1609   //   the parmBlk, where it was updated by the crypto instruction, back
1610   //   to the chaining value array the address of which was passed in the cv argument.
1611   //   As all the available registers are used and modified by KMC, we need to save
1612   //   the key length across the KMC instruction. We do so by spilling it to the stack,
1613   //   just preceding the parmBlk (at (parmBlk - 8)).
1614   void generate_push_parmBlk(Register keylen, Register fCode, Register parmBlk, Register key, Register cv, bool is_decipher) {
1615     int       mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
1616     Label     parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1617 
1618     BLOCK_COMMENT("push parmBlk {");
1619     if (VM_Version::has_Crypto_AES()   ) { __ z_cghi(keylen, 52); }
1620     if (VM_Version::has_Crypto_AES128()) { __ z_brl(parmBlk_128); }  // keyLen <  52: AES128
1621     if (VM_Version::has_Crypto_AES192()) { __ z_bre(parmBlk_192); }  // keyLen == 52: AES192
1622     if (VM_Version::has_Crypto_AES256()) { __ z_brh(parmBlk_256); }  // keyLen >  52: AES256
1623 
1624     // Security net: requested AES function not available on this CPU.
1625     // NOTE:
1626     //   As of now (March 2015), this safety net is not required. JCE policy files limit the
1627     //   cryptographic strength of the keys used to 128 bit. If we have AES hardware support
1628     //   at all, we have at least AES-128.
1629     __ stop_static("AES key strength not supported by CPU. Use -XX:-UseAES as remedy.", 0);
1630 
1631     if (VM_Version::has_Crypto_AES256()) {
1632       __ bind(parmBlk_256);
1633       generate_push_Block(VM_Version::Cipher::_AES256_dataBlk,
1634                           VM_Version::Cipher::_AES256_parmBlk_C,
1635                           VM_Version::Cipher::_AES256 + mode,
1636                           parmBlk, keylen, fCode, cv, key);
1637       if (VM_Version::has_Crypto_AES128() || VM_Version::has_Crypto_AES192()) {
1638         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1639       }
1640     }
1641 
1642     if (VM_Version::has_Crypto_AES192()) {
1643       __ bind(parmBlk_192);
1644       generate_push_Block(VM_Version::Cipher::_AES192_dataBlk,
1645                           VM_Version::Cipher::_AES192_parmBlk_C,
1646                           VM_Version::Cipher::_AES192 + mode,
1647                           parmBlk, keylen, fCode, cv, key);
1648       if (VM_Version::has_Crypto_AES128()) {
1649         __ z_bru(parmBlk_set);  // Fallthru otherwise.
1650       }
1651     }
1652 
1653     if (VM_Version::has_Crypto_AES128()) {
1654       __ bind(parmBlk_128);
1655       generate_push_Block(VM_Version::Cipher::_AES128_dataBlk,
1656                           VM_Version::Cipher::_AES128_parmBlk_C,
1657                           VM_Version::Cipher::_AES128 + mode,
1658                           parmBlk, keylen, fCode, cv, key);
1659       // Fallthru
1660     }
1661 
1662     __ bind(parmBlk_set);
1663     BLOCK_COMMENT("} push parmBlk");
1664   }
1665 
1666   // Pop a parameter block from the stack. The chaining value portion of the parameter block
1667   // is copied back to the cv array as it is needed for subsequent cipher steps.
1668   // The keylen value as well as the original SP (before resizing) was pushed to the stack
1669   // when pushing the parameter block.
1670   void generate_pop_parmBlk(Register keylen, Register parmBlk, Register key, Register cv) {
1671 
1672     BLOCK_COMMENT("pop parmBlk {");
1673     bool identical_dataBlk_len =  (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk) &&
1674                                   (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
1675     if (identical_dataBlk_len) {
1676       int cv_len = VM_Version::Cipher::_AES128_dataBlk;
1677       __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1678     } else {
1679       int cv_len;
1680       Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
1681       __ z_lg(keylen, -8, parmBlk);  // restore keylen
1682       __ z_cghi(keylen, 52);
1683       if (VM_Version::has_Crypto_AES256()) __ z_brh(parmBlk_256);  // keyLen >  52: AES256
1684       if (VM_Version::has_Crypto_AES192()) __ z_bre(parmBlk_192);  // keyLen == 52: AES192
1685       // if (VM_Version::has_Crypto_AES128()) __ z_brl(parmBlk_128);  // keyLen <  52: AES128  // fallthru
1686 
      // Security net: none needed here. If one were required, we would already
      // have hit the one in generate_push_parmBlk when pushing the parameter block.
1689       if (VM_Version::has_Crypto_AES128()) {
1690         __ bind(parmBlk_128);
1691         cv_len = VM_Version::Cipher::_AES128_dataBlk;
1692         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1693         if (VM_Version::has_Crypto_AES192() || VM_Version::has_Crypto_AES256()) {
1694           __ z_bru(parmBlk_set);
1695         }
1696       }
1697 
1698       if (VM_Version::has_Crypto_AES192()) {
1699         __ bind(parmBlk_192);
1700         cv_len = VM_Version::Cipher::_AES192_dataBlk;
1701         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1702         if (VM_Version::has_Crypto_AES256()) {
1703           __ z_bru(parmBlk_set);
1704         }
1705       }
1706 
1707       if (VM_Version::has_Crypto_AES256()) {
1708         __ bind(parmBlk_256);
1709         cv_len = VM_Version::Cipher::_AES256_dataBlk;
1710         __ z_mvc(0, cv_len-1, cv, 0, parmBlk);  // Copy cv.
1711         // __ z_bru(parmBlk_set);  // fallthru
1712       }
1713       __ bind(parmBlk_set);
1714     }
    __ z_lg(Z_SP, -16, parmBlk); // Revert the frame resize by reloading the Z_SP value saved when the parameter block was pushed.
1716     BLOCK_COMMENT("} pop parmBlk");
1717   }
1718 
1719   // Compute AES encrypt/decrypt function.
1720   void generate_AES_cipherBlock(bool is_decipher) {
1721     // Incoming arguments.
1722     Register       from    = Z_ARG1; // source byte array
1723     Register       to      = Z_ARG2; // destination byte array
1724     Register       key     = Z_ARG3; // expanded key array
1725 
1726     const Register keylen  = Z_R0;   // Temporarily (until fCode is set) holds the expanded key array length.
1727 
1728     // Register definitions as required by KM instruction.
1729     const Register fCode   = Z_R0;   // crypto function code
1730     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1731     const Register src     = Z_ARG1; // Must be even reg (KM requirement).
1732     const Register srclen  = Z_ARG2; // Must be odd reg and pair with src. Overwrites destination address.
1733     const Register dst     = Z_ARG3; // Must be even reg (KM requirement). Overwrites expanded key address.
1734 
1735     // Read key len of expanded key (in 4-byte words).
1736     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1737 
1738     // Copy arguments to registers as required by crypto instruction.
1739     __ z_lgr(parmBlk, key);          // crypto key (in T_INT array).
1740     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1741     __ z_lgr(dst, to);               // Copy dst address, even register required.
1742 
1743     // Construct function code into fCode(Z_R0), data block length into srclen(Z_ARG2).
1744     generate_load_AES_fCode(keylen, fCode, srclen, is_decipher);
1745 
1746     __ km(dst, src);                 // Cipher the message.
1747 
1748     __ z_br(Z_R14);
1749   }
1750 
1751   // Compute AES encrypt function.
1752   address generate_AES_encryptBlock(const char* name) {
1753     __ align(CodeEntryAlignment);
1754     StubCodeMark mark(this, "StubRoutines", name);
1755     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1756 
1757     generate_AES_cipherBlock(false);
1758 
1759     return __ addr_at(start_off);
1760   }
1761 
1762   // Compute AES decrypt function.
1763   address generate_AES_decryptBlock(const char* name) {
1764     __ align(CodeEntryAlignment);
1765     StubCodeMark mark(this, "StubRoutines", name);
1766     unsigned int start_off = __ offset();  // Remember stub start address (is rtn value).
1767 
1768     generate_AES_cipherBlock(true);
1769 
1770     return __ addr_at(start_off);
1771   }
1772 
1773   // These stubs receive the addresses of the cryptographic key and of the chaining value as two separate
1774   // arguments (registers "key" and "cv", respectively). The KMC instruction, on the other hand, requires
1775   // chaining value and key to be, in this sequence, adjacent in storage. Thus, we need to allocate some
1776   // thread-local working storage. Using heap memory incurs all the hassles of allocating/freeing.
1777   // Stack space, on the contrary, is deallocated automatically when we return from the stub to the caller.
1778   // *** WARNING ***
1779   // Please note that we do not formally allocate stack space, nor do we
1780   // update the stack pointer. Therefore, no function calls are allowed
1781   // and nobody else must use the stack range where the parameter block
1782   // is located.
1783   // We align the parameter block to the next available octoword.
1784   //
1785   // Compute chained AES encrypt function.
1786   void generate_AES_cipherBlockChaining(bool is_decipher) {
1787 
1788     Register       from    = Z_ARG1; // source byte array (clear text)
1789     Register       to      = Z_ARG2; // destination byte array (ciphered)
1790     Register       key     = Z_ARG3; // expanded key array.
1791     Register       cv      = Z_ARG4; // chaining value
    const Register msglen  = Z_ARG5; // Total length of the msg to be encrypted. Must be returned
                                     // in Z_RET upon completion of this stub. Passed as a 32-bit int.
1794 
1795     const Register keylen  = Z_R0;   // Expanded key length, as read from key array. Temp only.
1796     const Register fCode   = Z_R0;   // crypto function code
1797     const Register parmBlk = Z_R1;   // parameter block address (points to crypto key)
1798     const Register src     = Z_ARG1; // is Z_R2
1799     const Register srclen  = Z_ARG2; // Overwrites destination address.
1800     const Register dst     = Z_ARG3; // Overwrites key address.
1801 
1802     // Read key len of expanded key (in 4-byte words).
1803     __ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
1804 
1805     // Construct parm block address in parmBlk (== Z_R1), copy cv and key to parm block.
1806     // Construct function code in fCode (Z_R0).
1807     generate_push_parmBlk(keylen, fCode, parmBlk, key, cv, is_decipher);
1808 
1809     // Prepare other registers for instruction.
1810     __ lgr_if_needed(src, from);     // Copy src address. Will not emit, src/from are identical.
1811     __ z_lgr(dst, to);
    __ z_llgfr(srclen, msglen);      // msglen is passed as a 32-bit int; zero-extend to the 64 bits required by KMC.
1813 
1814     __ kmc(dst, src);                // Cipher the message.
1815 
1816     generate_pop_parmBlk(keylen, parmBlk, key, cv);
1817 
    __ z_llgfr(Z_RET, msglen);       // Return msglen (passed as a 32-bit int), zero-extended to 64 bits.
1819     __ z_br(Z_R14);
1820   }
1821 
1822   // Compute chained AES encrypt function.
1823   address generate_cipherBlockChaining_AES_encrypt(const char* name) {
1824     __ align(CodeEntryAlignment);
1825     StubCodeMark mark(this, "StubRoutines", name);
1826     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1827 
1828     generate_AES_cipherBlockChaining(false);
1829 
1830     return __ addr_at(start_off);
1831   }
1832 
1833   // Compute chained AES encrypt function.
1834   address generate_cipherBlockChaining_AES_decrypt(const char* name) {
1835     __ align(CodeEntryAlignment);
1836     StubCodeMark mark(this, "StubRoutines", name);
1837     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
1838 
1839     generate_AES_cipherBlockChaining(true);
1840 
1841     return __ addr_at(start_off);
1842   }
1843 
1844 
1845   // Call interface for all SHA* stubs.
1846   //
1847   //   Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
1848   //   Z_ARG2 - current SHA state. Ptr to state area. This area serves as
1849   //            parameter block as required by the crypto instruction.
1850   //   Z_ARG3 - current byte offset in source data block.
1851   //   Z_ARG4 - last byte offset in source data block.
1852   //            (Z_ARG4 - Z_ARG3) gives the #bytes remaining to be processed.
1853   //
1854   //   Z_RET  - return value. First unprocessed byte offset in src buffer.
1855   //
1856   //   A few notes on the call interface:
1857   //    - All stubs, whether they are single-block or multi-block, are assumed to
1858   //      digest an integer multiple of the data block length of data. All data
1859   //      blocks are digested using the intermediate message digest (KIMD) instruction.
1860   //      Special end processing, as done by the KLMD instruction, seems to be
1861   //      emulated by the calling code.
1862   //
1863   //    - Z_ARG1 addresses the first byte of source data. The offset (Z_ARG3) is
1864   //      already accounted for.
1865   //
1866   //    - The current SHA state (the intermediate message digest value) is contained
1867   //      in an area addressed by Z_ARG2. The area size depends on the SHA variant
1868   //      and is accessible via the enum VM_Version::MsgDigest::_SHA<n>_parmBlk_I
1869   //
1870   //    - The single-block stub is expected to digest exactly one data block, starting
1871   //      at the address passed in Z_ARG1.
1872   //
1873   //    - The multi-block stub is expected to digest all data blocks which start in
1874   //      the offset interval [srcOff(Z_ARG3), srcLimit(Z_ARG4)). The exact difference
1875   //      (srcLimit-srcOff), rounded up to the next multiple of the data block length,
1876   //      gives the number of blocks to digest. It must be assumed that the calling code
1877   //      provides for a large enough source data buffer.
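  //
  //    - For the multi-block stub, the value returned in Z_RET equals
  //      srcOff + roundUp(srcLimit - srcOff, dataBlk), i.e. the offset of the first
  //      byte after the blocks that were digested.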
1878   //
1879   // Compute SHA-1 function.
1880   address generate_SHA1_stub(bool multiBlock, const char* name) {
1881     __ align(CodeEntryAlignment);
1882     StubCodeMark mark(this, "StubRoutines", name);
1883     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
1884 
1885     const Register srcBuff        = Z_ARG1; // Points to first block to process (offset already added).
1886     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter for kimd register pairs.
1887     const Register srcOff         = Z_ARG3; // int
1888     const Register srcLimit       = Z_ARG4; // Only passed in multiBlock case. int
1889 
1890     const Register SHAState_local = Z_R1;
1891     const Register SHAState_save  = Z_ARG3;
1892     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
1893     Label useKLMD, rtn;
1894 
1895     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA1);   // function code
1896     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
1897 
1898     if (multiBlock) {  // Process everything from offset to limit.
1899 
      // The following description is valid if we get a raw (unmodified) source data buffer,
      // spanning the range [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
      // the calling convention for these stubs is different. We leave the description in
      // to inform the reader what must be happening hidden in the calling code.
      //
      // The data block to be processed can have arbitrary length, i.e. its length does not
      // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
      // two different paths. If the length is an integer multiple, we use KIMD, saving us
      // from copying the SHA state back and forth. Otherwise, we copy the SHA state
      // to the stack, execute a KLMD instruction on it, and copy the result back to the
      // caller's SHA state location.
1911 
1912       // Total #srcBuff blocks to process.
1913       if (VM_Version::has_DistinctOpnds()) {
1914         __ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
1915         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
1916         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
1917         __ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
1918         __ z_llgfr(srcBufLen, srcBufLen);      // Cast to 64-bit.
1919       } else {
1920         __ z_lgfr(srcBufLen, srcLimit);        // Exact difference. srcLimit passed as int.
1921         __ z_sgfr(srcBufLen, srcOff);          // SrcOff passed as int, now properly casted to long.
1922         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);   // round up
1923         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
1924         __ z_lgr(srcLimit, srcOff);            // SrcLimit temporarily holds return value.
1925         __ z_agr(srcLimit, srcBufLen);
1926       }
1927 
1928       // Integral #blocks to digest?
1929       // As a result of the calculations above, srcBufLen MUST be an integer
1930       // multiple of _SHA1_dataBlk, or else we are in big trouble.
1931       // We insert an asm_assert into the KLMD case to guard against that.
1932       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);
1933       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
1934 
1935       // Process all full blocks.
1936       __ kimd(srcBuff);
1937 
1938       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
1939     } else {  // Process one data block only.
1940       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA1_dataBlk);   // #srcBuff bytes to process
1941       __ kimd(srcBuff);
1942       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA1_dataBlk, srcOff);            // Offset of first unprocessed byte in buffer. No 32 to 64 bit extension needed.
1943     }
1944 
1945     __ bind(rtn);
1946     __ z_br(Z_R14);
1947 
1948     if (multiBlock) {
1949       __ bind(useKLMD);
1950 
1951 #if 1
      // Security net: this stub is believed to be called for full-sized data blocks only.
      // NOTE: The following code is believed to be correct, but it is not tested.
      __ stop_static("SHA-1 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
1955 #endif
1956     }
1957 
1958     return __ addr_at(start_off);
1959   }
1960 
1961   // Compute SHA-256 function.
1962   address generate_SHA256_stub(bool multiBlock, const char* name) {
1963     __ align(CodeEntryAlignment);
1964     StubCodeMark mark(this, "StubRoutines", name);
1965     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
1966 
1967     const Register srcBuff        = Z_ARG1;
1968     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
1969     const Register SHAState_local = Z_R1;
1970     const Register SHAState_save  = Z_ARG3;
1971     const Register srcOff         = Z_ARG3;
1972     const Register srcLimit       = Z_ARG4;
1973     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
1974     Label useKLMD, rtn;
1975 
1976     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA256); // function code
1977     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
1978 
1979     if (multiBlock) {  // Process everything from offset to limit.
      // The following description is valid if we get a raw (unmodified) source data buffer,
      // spanning the range [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
      // the calling convention for these stubs is different. We leave the description in
      // to inform the reader what must be happening hidden in the calling code.
      //
      // The data block to be processed can have arbitrary length, i.e. its length does not
      // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
      // two different paths. If the length is an integer multiple, we use KIMD, saving us
      // from copying the SHA state back and forth. Otherwise, we copy the SHA state
      // to the stack, execute a KLMD instruction on it, and copy the result back to the
      // caller's SHA state location.
1991 
1992       // total #srcBuff blocks to process
1993       if (VM_Version::has_DistinctOpnds()) {
1994         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
1995         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
1996         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
1997         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
1998         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
1999       } else {
2000         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2001         __ z_sgfr(srcBufLen, srcOff);
2002         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
2003         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
2004         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2005         __ z_agr(srcLimit, srcBufLen);
2006       }
2007 
2008       // Integral #blocks to digest?
2009       // As a result of the calculations above, srcBufLen MUST be an integer
      // multiple of _SHA256_dataBlk, or else we are in big trouble.
2011       // We insert an asm_assert into the KLMD case to guard against that.
2012       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1);
2013       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2014 
2015       // Process all full blocks.
2016       __ kimd(srcBuff);
2017 
2018       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2019     } else {  // Process one data block only.
2020       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA256_dataBlk); // #srcBuff bytes to process
2021       __ kimd(srcBuff);
2022       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA256_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2023     }
2024 
2025     __ bind(rtn);
2026     __ z_br(Z_R14);
2027 
2028     if (multiBlock) {
2029       __ bind(useKLMD);
2030 #if 1
2031       // Security net: this stub is believed to be called for full-sized data blocks only.
2032       // NOTE:
      //   The following code is believed to be correct, but it is not tested.
2034       __ stop_static("SHA256 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2035 #endif
2036     }
2037 
2038     return __ addr_at(start_off);
2039   }
2040 
2041   // Compute SHA-512 function.
2042   address generate_SHA512_stub(bool multiBlock, const char* name) {
2043     __ align(CodeEntryAlignment);
2044     StubCodeMark mark(this, "StubRoutines", name);
2045     unsigned int start_off = __ offset();   // Remember stub start address (is rtn value).
2046 
2047     const Register srcBuff        = Z_ARG1;
2048     const Register SHAState       = Z_ARG2; // Only on entry. Reused soon thereafter.
2049     const Register SHAState_local = Z_R1;
2050     const Register SHAState_save  = Z_ARG3;
2051     const Register srcOff         = Z_ARG3;
2052     const Register srcLimit       = Z_ARG4;
2053     const Register srcBufLen      = Z_ARG2; // Destroys state address, must be copied before.
2054     Label useKLMD, rtn;
2055 
2056     __ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA512); // function code
2057     __ z_lgr(SHAState_local, SHAState);                                 // SHAState == parameter block
2058 
2059     if (multiBlock) {  // Process everything from offset to limit.
      // The following description is valid if we get a raw (unmodified) source data buffer,
      // spanning the range [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailed above,
      // the calling convention for these stubs is different. We leave the description in
      // to inform the reader what must be happening hidden in the calling code.
      //
      // The data block to be processed can have arbitrary length, i.e. its length does not
      // need to be an integer multiple of SHA<n>_dataBlk. Therefore, we need to implement
      // two different paths. If the length is an integer multiple, we use KIMD, saving us
      // from copying the SHA state back and forth. Otherwise, we copy the SHA state
      // to the stack, execute a KLMD instruction on it, and copy the result back to the
      // caller's SHA state location.
2071 
2072       // total #srcBuff blocks to process
2073       if (VM_Version::has_DistinctOpnds()) {
2074         __ z_srk(srcBufLen, srcLimit, srcOff);   // exact difference
2075         __ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2076         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2077         __ z_ark(srcLimit, srcOff, srcBufLen);   // Srclimit temporarily holds return value.
2078         __ z_llgfr(srcBufLen, srcBufLen);        // Cast to 64-bit.
2079       } else {
2080         __ z_lgfr(srcBufLen, srcLimit);          // exact difference
2081         __ z_sgfr(srcBufLen, srcOff);
2082         __ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
2083         __ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
2084         __ z_lgr(srcLimit, srcOff);              // Srclimit temporarily holds return value.
2085         __ z_agr(srcLimit, srcBufLen);
2086       }
2087 
      // Integral #blocks to digest?
      // As a result of the calculations above, srcBufLen MUST be an integer
      // multiple of _SHA512_dataBlk, or else we are in big trouble.
2091       // We insert an asm_assert into the KLMD case to guard against that.
2092       __ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1);
2093       __ z_brc(Assembler::bcondNotAllZero, useKLMD);
2094 
2095       // Process all full blocks.
2096       __ kimd(srcBuff);
2097 
2098       __ z_lgr(Z_RET, srcLimit);  // Offset of first unprocessed byte in buffer.
2099     } else {  // Process one data block only.
2100       __ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA512_dataBlk); // #srcBuff bytes to process
2101       __ kimd(srcBuff);
2102       __ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA512_dataBlk, srcOff);          // Offset of first unprocessed byte in buffer.
2103     }
2104 
2105     __ bind(rtn);
2106     __ z_br(Z_R14);
2107 
2108     if (multiBlock) {
2109       __ bind(useKLMD);
2110 #if 1
      // Security net: this stub is believed to be called for full-sized data blocks only.
      // NOTE:
      //   The following code is believed to be correct, but it is not tested.
2114       __ stop_static("SHA512 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
2115 #endif
2116     }
2117 
2118     return __ addr_at(start_off);
2119   }
2120 
2121 
2122   /**
2123    *  Arguments:
2124    *
2125    * Inputs:
2126    *   Z_ARG1    - int   crc
2127    *   Z_ARG2    - byte* buf
2128    *   Z_ARG3    - int   length (of buffer)
2129    *
2130    * Result:
2131    *   Z_RET     - int   crc result
2132    **/
2133   // Compute CRC function (generic, for all polynomials).
2134   void generate_CRC_updateBytes(const char* name, Register table, bool invertCRC) {
2135 
2136     // arguments to kernel_crc32:
2137     Register       crc     = Z_ARG1;  // Current checksum, preset by caller or result from previous call, int.
2138     Register       data    = Z_ARG2;  // source byte array
2139     Register       dataLen = Z_ARG3;  // #bytes to process, int
2140 //    Register       table   = Z_ARG4;  // crc table address. Preloaded and passed in by caller.
2141     const Register t0      = Z_R10;   // work reg for kernel* emitters
2142     const Register t1      = Z_R11;   // work reg for kernel* emitters
2143     const Register t2      = Z_R12;   // work reg for kernel* emitters
2144     const Register t3      = Z_R13;   // work reg for kernel* emitters
2145 
2146     assert_different_registers(crc, data, dataLen, table);
2147 
    // dataLen is passed as a 32-bit int; zero-extend it to the 64 bits expected by the
    // C calling convention. crc is used as a 32-bit value throughout.
2150     __ z_llgfr(dataLen, dataLen);
2151 
    __ resize_frame(-(6*8), Z_R0, true); // Resize frame to provide additional spill space.
    __ z_stmg(Z_R10, Z_R13, 1*8, Z_SP);  // Spill regs 10..13 to make them available as work registers.
    __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, invertCRC);
    __ z_lmg(Z_R10, Z_R13, 1*8, Z_SP);   // Restore regs 10..13 from the stack.
    __ resize_frame(+(6*8), Z_R0, true); // Remove the additional spill space again.
2157 
2158     __ z_llgfr(Z_RET, crc);  // Updated crc is function result. No copying required, just zero upper 32 bits.
2159     __ z_br(Z_R14);          // Result already in Z_RET == Z_ARG1.
2160   }
2161 
2162 
2163   // Compute CRC32 function.
2164   address generate_CRC32_updateBytes(const char* name) {
2165     __ align(CodeEntryAlignment);
2166     StubCodeMark mark(this, "StubRoutines", name);
2167     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2168 
2169     assert(UseCRC32Intrinsics, "should not generate this stub (%s) with CRC32 intrinsics disabled", name);
2170 
2171     BLOCK_COMMENT("CRC32_updateBytes {");
2172     Register       table   = Z_ARG4;  // crc32 table address.
2173     StubRoutines::zarch::generate_load_crc_table_addr(_masm, table);
2174 
2175     generate_CRC_updateBytes(name, table, true);
2176     BLOCK_COMMENT("} CRC32_updateBytes");
2177 
2178     return __ addr_at(start_off);
2179   }
2180 
2181 
2182   // Compute CRC32C function.
2183   address generate_CRC32C_updateBytes(const char* name) {
2184     __ align(CodeEntryAlignment);
2185     StubCodeMark mark(this, "StubRoutines", name);
2186     unsigned int   start_off = __ offset();  // Remember stub start address (is rtn value).
2187 
2188     assert(UseCRC32CIntrinsics, "should not generate this stub (%s) with CRC32C intrinsics disabled", name);
2189 
2190     BLOCK_COMMENT("CRC32C_updateBytes {");
2191     Register       table   = Z_ARG4;  // crc32c table address.
2192     StubRoutines::zarch::generate_load_crc32c_table_addr(_masm, table);
2193 
2194     generate_CRC_updateBytes(name, table, false);
2195     BLOCK_COMMENT("} CRC32C_updateBytes");
2196 
2197     return __ addr_at(start_off);
2198   }
2199 
2200 
2201   // Arguments:
2202   //   Z_ARG1    - x address
2203   //   Z_ARG2    - x length
2204   //   Z_ARG3    - y address
2205   //   Z_ARG4    - y length
2206   //   Z_ARG5    - z address
2207   //   160[Z_SP] - z length
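  //   The z length at 160[Z_SP] is not read by this stub; multiply_to_len() is invoked
  //   with xlen and ylen only (see the commented-out Address below).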
2208   address generate_multiplyToLen() {
2209     __ align(CodeEntryAlignment);
2210     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2211 
2212     address start = __ pc();
2213 
2214     const Register x    = Z_ARG1;
2215     const Register xlen = Z_ARG2;
2216     const Register y    = Z_ARG3;
2217     const Register ylen = Z_ARG4;
2218     const Register z    = Z_ARG5;
2219     // zlen is passed on the stack:
2220     // Address zlen(Z_SP, _z_abi(remaining_cargs));
2221 
2222     // Next registers will be saved on stack in multiply_to_len().
2223     const Register tmp1 = Z_tmp_1;
2224     const Register tmp2 = Z_tmp_2;
2225     const Register tmp3 = Z_tmp_3;
2226     const Register tmp4 = Z_tmp_4;
2227     const Register tmp5 = Z_R9;
2228 
2229     BLOCK_COMMENT("Entry:");
2230 
2231     __ z_llgfr(xlen, xlen);
2232     __ z_llgfr(ylen, ylen);
2233 
2234     __ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5);
2235 
2236     __ z_br(Z_R14);  // Return to caller.
2237 
2238     return start;
2239   }
2240 
2241   void generate_initial() {
2242     // Generates all stubs and initializes the entry points.
2243 
2244     // Entry points that exist in all platforms.
2245     // Note: This is code that could be shared among different
2246     // platforms - however the benefit seems to be smaller than the
2247     // disadvantage of having a much more complicated generator
2248     // structure. See also comment in stubRoutines.hpp.
2249     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
2250 
2251     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
2252     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
2253 
2254     // Build this early so it's available for the interpreter.
2255     StubRoutines::_throw_StackOverflowError_entry          =
2256       generate_throw_exception("StackOverflowError throw_exception",
2257                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2258     StubRoutines::_throw_delayed_StackOverflowError_entry  =
2259       generate_throw_exception("delayed StackOverflowError throw_exception",
2260                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
2261 
2262     //----------------------------------------------------------------------
2263     // Entry points that are platform specific.
2264 
2265     if (UseCRC32Intrinsics) {
2266       StubRoutines::_crc_table_adr     = (address)StubRoutines::zarch::_crc_table;
2267       StubRoutines::_updateBytesCRC32  = generate_CRC32_updateBytes("CRC32_updateBytes");
2268     }
2269 
2270     if (UseCRC32CIntrinsics) {
2271       StubRoutines::_crc32c_table_addr = (address)StubRoutines::zarch::_crc32c_table;
2272       StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
2273     }
2274 
    // Compact string intrinsics: Translate table for the string inflate intrinsic. Used by the TROT instruction.
2276     StubRoutines::zarch::_trot_table_addr = (address)StubRoutines::zarch::_trot_table;
2277   }
2278 
2279 
2280   void generate_all() {
2281     // Generates all stubs and initializes the entry points.
2282 
2283     StubRoutines::zarch::_partial_subtype_check            = generate_partial_subtype_check();
2284 
2285     // These entry points require SharedInfo::stack0 to be set up in non-core builds.
2286     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
2287     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
2288     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2289 
2290     // Support for verify_oop (must happen after universe_init).
2291     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop_subroutine();
2292 
2293     // Arraycopy stubs used by compilers.
2294     generate_arraycopy_stubs();
2295 
2296     // safefetch stubs
2297     generate_safefetch("SafeFetch32", sizeof(int),      &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, &StubRoutines::_safefetch32_continuation_pc);
2298     generate_safefetch("SafeFetchN",  sizeof(intptr_t), &StubRoutines::_safefetchN_entry,  &StubRoutines::_safefetchN_fault_pc,  &StubRoutines::_safefetchN_continuation_pc);
2299 
2300     // Generate AES intrinsics code.
2301     if (UseAESIntrinsics) {
2302       StubRoutines::_aescrypt_encryptBlock = generate_AES_encryptBlock("AES_encryptBlock");
2303       StubRoutines::_aescrypt_decryptBlock = generate_AES_decryptBlock("AES_decryptBlock");
2304       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_AES_encrypt("AES_encryptBlock_chaining");
2305       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_AES_decrypt("AES_decryptBlock_chaining");
2306     }
2307 
2308     // Generate SHA1/SHA256/SHA512 intrinsics code.
2309     if (UseSHA1Intrinsics) {
2310       StubRoutines::_sha1_implCompress     = generate_SHA1_stub(false,   "SHA1_singleBlock");
2311       StubRoutines::_sha1_implCompressMB   = generate_SHA1_stub(true,    "SHA1_multiBlock");
2312     }
2313     if (UseSHA256Intrinsics) {
2314       StubRoutines::_sha256_implCompress   = generate_SHA256_stub(false, "SHA256_singleBlock");
2315       StubRoutines::_sha256_implCompressMB = generate_SHA256_stub(true,  "SHA256_multiBlock");
2316     }
2317     if (UseSHA512Intrinsics) {
2318       StubRoutines::_sha512_implCompress   = generate_SHA512_stub(false, "SHA512_singleBlock");
2319       StubRoutines::_sha512_implCompressMB = generate_SHA512_stub(true,  "SHA512_multiBlock");
2320     }
2321 
2322 #ifdef COMPILER2
2323     if (UseMultiplyToLenIntrinsic) {
2324       StubRoutines::_multiplyToLen = generate_multiplyToLen();
2325     }
2326     if (UseMontgomeryMultiplyIntrinsic) {
2327       StubRoutines::_montgomeryMultiply
2328         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
2329     }
2330     if (UseMontgomerySquareIntrinsic) {
2331       StubRoutines::_montgomerySquare
2332         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
2333     }
2334 #endif
2335   }
2336 
2337  public:
2338   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2339     // Replace the standard masm with a special one:
2340     _masm = new MacroAssembler(code);
2341 
2342     _stub_count = !all ? 0x100 : 0x200;
2343     if (all) {
2344       generate_all();
2345     } else {
2346       generate_initial();
2347     }
2348   }
2349 
2350  private:
2351   int _stub_count;
2352   void stub_prolog(StubCodeDesc* cdesc) {
2353 #ifdef ASSERT
2354     // Put extra information in the stub code, to make it more readable.
2355     // Write the high part of the address.
2356     // [RGV] Check if there is a dependency on the size of this prolog.
2357     __ emit_32((intptr_t)cdesc >> 32);
2358     __ emit_32((intptr_t)cdesc);
2359     __ emit_32(++_stub_count);
2360 #endif
2361     align(true);
2362   }
2363 
2364   void align(bool at_header = false) {
2365     // z/Architecture cache line size is 256 bytes.
2366     // There is no obvious benefit in aligning stub
2367     // code to cache lines. Use CodeEntryAlignment instead.
2368     const unsigned int icache_line_size      = CodeEntryAlignment;
2369     const unsigned int icache_half_line_size = MIN2<unsigned int>(32, CodeEntryAlignment);
2370 
2371     if (at_header) {
2372       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
2373         __ emit_16(0);
2374       }
2375     } else {
2376       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
2377         __ z_nop();
2378       }
2379     }
2380   }
2381 
2382 };
2383 
2384 void StubGenerator_generate(CodeBuffer* code, bool all) {
2385   StubGenerator g(code, all);
2386 }