1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_x86.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #ifdef COMPILER2
  45 #include "opto/runtime.hpp"
  46 #endif
  47 #if INCLUDE_ZGC
  48 #include "gc/z/zThreadLocalData.hpp"
  49 #endif
  50 
  51 #ifdef __VECTOR_API_MATH_INTRINSICS_COMMON
  52 // Vector API SVML routines written in assembly
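// A note on naming (an informal reading, not authoritative): the symbols
// appear to encode <op><'f' for float><lane count>_ha_<variant>, where "_ha"
// is the high-accuracy SVML flavor and the trailing _ex/_e9/_l9/_z0 suffix
// selects a CPU-specific implementation (the _z0 variants being the widest,
// 16-float/8-double ones). The scalar parameter types below do not reflect
// the real vector signatures; the declarations presumably exist only so the
// symbol addresses can be taken when wiring up the stubs.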
  53 extern "C"
  54 {
  55    float __svml_expf4_ha_ex(float a);
  56    double __svml_exp1_ha_ex(double a); 
  57    double __svml_exp2_ha_ex(double a);     
  58    float __svml_expf4_ha_l9(float a);
  59    float __svml_expf8_ha_l9(float a);
  60    float __svml_expf4_ha_e9(float a);
  61    float __svml_expf8_ha_e9(float a);
  62    float __svml_expf16_ha_z0(float a);
  63    double __svml_exp1_ha_l9(double a);    
  64    double __svml_exp2_ha_l9(double a);
  65    double __svml_exp4_ha_l9(double a);
  66    double __svml_exp1_ha_e9(double a);    
  67    double __svml_exp2_ha_e9(double a);
  68    double __svml_exp4_ha_e9(double a);
  69    double __svml_exp8_ha_z0(double a); 
  70    float  __svml_expm1f4_ha_ex(float a);
  71    double __svml_expm11_ha_ex(double a);
  72    double __svml_expm12_ha_ex(double a);
  73    float  __svml_expm1f4_ha_l9(float a);
  74    float  __svml_expm1f8_ha_l9(float a);
  75    float  __svml_expm1f4_ha_e9(float a);
  76    float  __svml_expm1f8_ha_e9(float a);
  77    float __svml_expm1f16_ha_z0(float a);
  78    double __svml_expm11_ha_l9(double a); 
  79    double __svml_expm12_ha_l9(double a);
  80    double __svml_expm14_ha_l9(double a);
  81    double __svml_expm11_ha_e9(double a); 
  82    double __svml_expm12_ha_e9(double a);
  83    double __svml_expm14_ha_e9(double a);
  84    double __svml_expm18_ha_z0(double a); 
  85    float __svml_log1pf4_ha_l9(float a);
  86    float __svml_log1pf8_ha_l9(float a);
  87    float __svml_log1pf4_ha_e9(float a);
  88    float __svml_log1pf8_ha_e9(float a);
  89    float __svml_log1pf16_ha_z0(float a);
  90    double __svml_log1p1_ha_l9(double a);
  91    double __svml_log1p2_ha_l9(double a);
  92    double __svml_log1p4_ha_l9(double a);
  93    double __svml_log1p1_ha_e9(double a);
  94    double __svml_log1p2_ha_e9(double a);
  95    double __svml_log1p4_ha_e9(double a);
  96    double __svml_log1p8_ha_z0(double a);
  97    float __svml_logf4_ha_l9(float a);
  98    float __svml_logf8_ha_l9(float a);
  99    float __svml_logf4_ha_e9(float a);
 100    float __svml_logf8_ha_e9(float a);
 101    float __svml_logf16_ha_z0(float a);
 102    double __svml_log1_ha_l9(double a);
 103    double __svml_log2_ha_l9(double a);
 104    double __svml_log4_ha_l9(double a);
 105    double __svml_log1_ha_e9(double a);
 106    double __svml_log2_ha_e9(double a);
 107    double __svml_log4_ha_e9(double a);
 108    double __svml_log8_ha_z0(double a);
 109    float __svml_log10f4_ha_l9(float a);
 110    float __svml_log10f8_ha_l9(float a);
 111    float __svml_log10f4_ha_e9(float a);
 112    float __svml_log10f8_ha_e9(float a);
 113    float __svml_log10f16_ha_z0(float a);
 114    double __svml_log101_ha_l9(double a);
 115    double __svml_log102_ha_l9(double a);
 116    double __svml_log104_ha_l9(double a); 
 117    double __svml_log101_ha_e9(double a);
 118    double __svml_log102_ha_e9(double a);
 119    double __svml_log104_ha_e9(double a);
 120    double __svml_log108_ha_z0(double a);
 121    float __svml_sinf4_ha_l9(float a);
 122    float __svml_sinf8_ha_l9(float a);
 123    float __svml_sinf4_ha_e9(float a);
 124    float __svml_sinf8_ha_e9(float a);
 125    float __svml_sinf16_ha_z0(float a);
 126    double __svml_sin1_ha_l9(double a); 
 127    double __svml_sin2_ha_l9(double a);
 128    double __svml_sin4_ha_l9(double a); 
 129    double __svml_sin1_ha_e9(double a); 
 130    double __svml_sin2_ha_e9(double a);
 131    double __svml_sin4_ha_e9(double a); 
 132    double __svml_sin8_ha_z0(double a);
 133    float __svml_cosf4_ha_l9(float a);
 134    float __svml_cosf8_ha_l9(float a);
 135    float __svml_cosf4_ha_e9(float a);
 136    float __svml_cosf8_ha_e9(float a);
 137    float __svml_cosf16_ha_z0(float a);
 138    double  __svml_cos1_ha_l9(double a);
 139    double  __svml_cos2_ha_l9(double a);
 140    double __svml_cos4_ha_l9(double a);
 141    double  __svml_cos1_ha_e9(double a);
 142    double  __svml_cos2_ha_e9(double a);
 143    double __svml_cos4_ha_e9(double a);
 144    double  __svml_cos8_ha_z0(double a);
 145    float __svml_tanf4_ha_l9(float a);
 146    float __svml_tanf8_ha_l9(float a);
 147    float __svml_tanf4_ha_e9(float a);
 148    float __svml_tanf8_ha_e9(float a);
 149    float __svml_tanf16_ha_z0(float a);
 150    double __svml_tan1_ha_l9(double a);
 151    double __svml_tan2_ha_l9(double a);
 152    double __svml_tan4_ha_l9(double a);
 153    double __svml_tan1_ha_e9(double a);
 154    double __svml_tan2_ha_e9(double a);
 155    double __svml_tan4_ha_e9(double a);
 156    double __svml_tan8_ha_z0(double a);
 157    double __svml_sinh1_ha_l9(double a);
 158    double __svml_sinh2_ha_l9(double a);
 159    double __svml_sinh4_ha_l9(double a);
 160    double __svml_sinh1_ha_e9(double a);
 161    double __svml_sinh2_ha_e9(double a);
 162    double __svml_sinh4_ha_e9(double a);
 163    double __svml_sinh8_ha_z0(double a);
 164    float __svml_sinhf4_ha_l9(float a);
 165    float __svml_sinhf8_ha_l9(float a);
 166    float __svml_sinhf4_ha_e9(float a);
 167    float __svml_sinhf8_ha_e9(float a);
 168    float __svml_sinhf16_ha_z0(float a);
 169    double __svml_cosh1_ha_l9(double a);
 170    double __svml_cosh2_ha_l9(double a);
 171    double __svml_cosh4_ha_l9(double a);
 172    double __svml_cosh1_ha_e9(double a);
 173    double __svml_cosh2_ha_e9(double a);
 174    double __svml_cosh4_ha_e9(double a);
 175    double __svml_cosh8_ha_z0(double a);
 176    float __svml_coshf4_ha_l9(float a);
 177    float __svml_coshf8_ha_l9(float a);
 178    float __svml_coshf4_ha_e9(float a);
 179    float __svml_coshf8_ha_e9(float a);
 180    float __svml_coshf16_ha_z0(float a); 
 181    double __svml_tanh1_ha_l9(double a);
 182    double __svml_tanh2_ha_l9(double a);
 183    double __svml_tanh4_ha_l9(double a);
 184    double __svml_tanh1_ha_e9(double a);
 185    double __svml_tanh2_ha_e9(double a);
 186    double __svml_tanh4_ha_e9(double a);
 187    double __svml_tanh8_ha_z0(double a);
 188    float __svml_tanhf4_ha_l9(float a);
 189    float __svml_tanhf8_ha_l9(float a);
 190    float __svml_tanhf4_ha_e9(float a);
 191    float __svml_tanhf8_ha_e9(float a);
 192    float __svml_tanhf16_ha_z0(float a);
 193    float __svml_acosf4_ha_ex(float a);
 194    float __svml_acosf4_ha_l9(float a);
 195    float __svml_acosf8_ha_l9(float a);
 196    float __svml_acosf4_ha_e9(float a);
 197    float __svml_acosf8_ha_e9(float a);
 198    float __svml_acosf16_ha_z0(float a);
 199    double __svml_acos1_ha_ex(double a);
 200    double __svml_acos2_ha_ex(double a);
 201    double __svml_acos1_ha_l9(double a);
 202    double __svml_acos2_ha_l9(double a);
 203    double __svml_acos4_ha_l9(double a);
 204    double __svml_acos1_ha_e9(double a);
 205    double __svml_acos2_ha_e9(double a);
 206    double __svml_acos4_ha_e9(double a);
 207    double __svml_acos8_ha_z0(double a);
 208    float __svml_asinf4_ha_ex(float a);
 209    double __svml_asin1_ha_ex(double a);
 210    double __svml_asin2_ha_ex(double a);
 211    double __svml_asin1_ha_l9(double a);
 212    double __svml_asin2_ha_l9(double a);
 213    double __svml_asin4_ha_l9(double a);
 214    double __svml_asin1_ha_e9(double a);
 215    double __svml_asin2_ha_e9(double a);
 216    double __svml_asin4_ha_e9(double a);
 217    double __svml_asin8_ha_z0(double a);
 218    float __svml_asinf4_ha_l9(float a);
 219    float __svml_asinf8_ha_l9(float a);
 220    float __svml_asinf4_ha_e9(float a);
 221    float __svml_asinf8_ha_e9(float a);
 222    float __svml_asinf16_ha_z0(float a);
 223    float __svml_atanf4_ha_ex(float a);
 224    double __svml_atan1_ha_ex(double a);
 225    double __svml_atan2_ha_ex(double a);
 226    double __svml_atan1_ha_l9(double a);
 227    double __svml_atan2_ha_l9(double a);
 228    double __svml_atan4_ha_l9(double a);
 229    double __svml_atan1_ha_e9(double a);
 230    double __svml_atan2_ha_e9(double a);
 231    double __svml_atan4_ha_e9(double a);
 232    double __svml_atan8_ha_z0(double a);
 233    float __svml_atanf4_ha_l9(float a);
 234    float __svml_atanf8_ha_l9(float a);
 235    float __svml_atanf4_ha_e9(float a);
 236    float __svml_atanf8_ha_e9(float a);
 237    float __svml_atanf16_ha_z0(float a);
 238    float __svml_powf4_ha_l9(float a, float b);
 239    float __svml_powf8_ha_l9(float a, float b);
 240    float __svml_powf4_ha_e9(float a, float b);
 241    float __svml_powf8_ha_e9(float a, float b);
 242    float __svml_powf16_ha_z0(float a, float b);
 243    double __svml_pow1_ha_l9(double a, double b);
 244    double __svml_pow2_ha_l9(double a, double b);
 245    double __svml_pow4_ha_l9(double a, double b);
 246    double __svml_pow1_ha_e9(double a, double b);
 247    double __svml_pow2_ha_e9(double a, double b);
 248    double __svml_pow4_ha_e9(double a, double b);
 249    double __svml_pow8_ha_z0(double a, double b);
 250    float __svml_hypotf4_ha_l9(float a, float b);
 251    float __svml_hypotf8_ha_l9(float a, float b);
 252    float __svml_hypotf4_ha_e9(float a, float b);
 253    float __svml_hypotf8_ha_e9(float a, float b);
 254    float __svml_hypotf16_ha_z0(float a, float b);
 255    double __svml_hypot1_ha_l9(double a, double b);
 256    double __svml_hypot2_ha_l9(double a, double b);
 257    double __svml_hypot4_ha_l9(double a, double b);
 258    double __svml_hypot1_ha_e9(double a, double b);
 259    double __svml_hypot2_ha_e9(double a, double b);
 260    double __svml_hypot4_ha_e9(double a, double b);
 261    double __svml_hypot8_ha_z0(double a, double b);
 262    float __svml_cbrtf4_ha_l9(float a);
 263    float __svml_cbrtf8_ha_l9(float a);
 264    float __svml_cbrtf4_ha_e9(float a);
 265    float __svml_cbrtf8_ha_e9(float a);
 266    float __svml_cbrtf16_ha_z0(float a);
 267    double __svml_cbrt1_ha_l9(double a);
 268    double __svml_cbrt2_ha_l9(double a);
 269    double __svml_cbrt4_ha_l9(double a);
 270    double __svml_cbrt1_ha_e9(double a);
 271    double __svml_cbrt2_ha_e9(double a);
 272    double __svml_cbrt4_ha_e9(double a);
 273    double __svml_cbrt8_ha_z0(double a);
 274    float __svml_atan2f4_ha_l9(float a, float b);
 275    float __svml_atan2f8_ha_l9(float a, float b);
 276    float __svml_atan2f4_ha_e9(float a, float b);
 277    float __svml_atan2f8_ha_e9(float a, float b);
 278    float __svml_atan2f16_ha_z0(float a, float b);
 279    double __svml_atan21_ha_l9(double a, double b);
 280    double __svml_atan22_ha_l9(double a, double b);
 281    double __svml_atan24_ha_l9(double a, double b);
 282    double __svml_atan28_ha_z0(double a, double b);
 283    double __svml_atan21_ha_e9(double a, double b);
 284    double __svml_atan22_ha_e9(double a, double b);
 285    double __svml_atan24_ha_e9(double a, double b);
 286    float __svml_sinf4_ha_ex(float a);
 287    double __svml_sin1_ha_ex(double a);
 288    double __svml_sin2_ha_ex(double a);
 289    float __svml_cosf4_ha_ex(float a);
 290    double __svml_cos1_ha_ex(double a);
 291    double __svml_cos2_ha_ex(double a);
 292    float __svml_tanf4_ha_ex(float a);
 293    double __svml_tan1_ha_ex(double a);
 294    double __svml_tan2_ha_ex(double a);
 295    float __svml_sinhf4_ha_ex(float a);
 296    double __svml_sinh1_ha_ex(double a);
 297    double __svml_sinh2_ha_ex(double a);
 298    float __svml_coshf4_ha_ex(float a);
 299    double __svml_cosh1_ha_ex(double a);
 300    double __svml_cosh2_ha_ex(double a);
 301    float __svml_tanhf4_ha_ex(float a);
 302    double __svml_tanh1_ha_ex(double a);
 303    double __svml_tanh2_ha_ex(double a);
 304    double __svml_log1_ha_ex(double a);
 305    double __svml_log2_ha_ex(double a);
 306    double __svml_log1p1_ha_ex(double a);
 307    double __svml_log1p2_ha_ex(double a);
 308    double __svml_log101_ha_ex(double a);
 309    double __svml_log102_ha_ex(double a);
 310    float __svml_logf4_ha_ex(float a);
 311    float __svml_log1pf4_ha_ex(float a);
 312    float __svml_log10f4_ha_ex(float a);
   double __svml_atan21_ha_ex(double a, double b);
   double __svml_atan22_ha_ex(double a, double b);
   float __svml_atan2f4_ha_ex(float a, float b);
   float __svml_hypotf4_ha_ex(float a, float b);
   double __svml_hypot1_ha_ex(double a, double b);
   double __svml_hypot2_ha_ex(double a, double b);
   double __svml_pow1_ha_ex(double a, double b);
   double __svml_pow2_ha_ex(double a, double b);
   float __svml_powf4_ha_ex(float a, float b);
 322    double __svml_cbrt1_ha_ex(double a);
 323    double __svml_cbrt2_ha_ex(double a);
 324    float __svml_cbrtf4_ha_ex(float a);
 325 }
 326 #endif
 327 
 328 // Declaration and definition of StubGenerator (no .hpp file).
 329 // For a more detailed description of the stub routine structure
 330 // see the comment in stubRoutines.hpp
 331 
 332 #define __ _masm->
 333 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
 334 #define a__ ((Assembler*)_masm)->
 335 
 336 #ifdef PRODUCT
 337 #define BLOCK_COMMENT(str) /* nothing */
 338 #else
 339 #define BLOCK_COMMENT(str) __ block_comment(str)
 340 #endif
 341 
 342 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
 343 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
 344 
 345 // Stub Code definitions
 346 
 347 class StubGenerator: public StubCodeGenerator {
 348  private:
 349 
 350 #ifdef PRODUCT
 351 #define inc_counter_np(counter) ((void)0)
 352 #else
 353   void inc_counter_np_(int& counter) {
 354     // This can destroy rscratch1 if counter is far from the code cache
 355     __ incrementl(ExternalAddress((address)&counter));
 356   }
 357 #define inc_counter_np(counter) \
 358   BLOCK_COMMENT("inc_counter " #counter); \
 359   inc_counter_np_(counter);
 360 #endif
 361 
 362   // Call stubs are used to call Java from C
 363   //
 364   // Linux Arguments:
 365   //    c_rarg0:   call wrapper address                   address
 366   //    c_rarg1:   result                                 address
 367   //    c_rarg2:   result type                            BasicType
 368   //    c_rarg3:   method                                 Method*
 369   //    c_rarg4:   (interpreter) entry point              address
 370   //    c_rarg5:   parameters                             intptr_t*
 371   //    16(rbp): parameter size (in words)              int
 372   //    24(rbp): thread                                 Thread*
 373   //
 374   //     [ return_from_Java     ] <--- rsp
 375   //     [ argument word n      ]
 376   //      ...
 377   // -12 [ argument word 1      ]
 378   // -11 [ saved r15            ] <--- rsp_after_call
 379   // -10 [ saved r14            ]
 380   //  -9 [ saved r13            ]
 381   //  -8 [ saved r12            ]
 382   //  -7 [ saved rbx            ]
 383   //  -6 [ call wrapper         ]
 384   //  -5 [ result               ]
 385   //  -4 [ result type          ]
 386   //  -3 [ method               ]
 387   //  -2 [ entry point          ]
 388   //  -1 [ parameters           ]
 389   //   0 [ saved rbp            ] <--- rbp
 390   //   1 [ return address       ]
 391   //   2 [ parameter size       ]
 392   //   3 [ thread               ]
 393   //
 394   // Windows Arguments:
 395   //    c_rarg0:   call wrapper address                   address
 396   //    c_rarg1:   result                                 address
 397   //    c_rarg2:   result type                            BasicType
 398   //    c_rarg3:   method                                 Method*
 399   //    48(rbp): (interpreter) entry point              address
 400   //    56(rbp): parameters                             intptr_t*
 401   //    64(rbp): parameter size (in words)              int
 402   //    72(rbp): thread                                 Thread*
 403   //
 404   //     [ return_from_Java     ] <--- rsp
 405   //     [ argument word n      ]
 406   //      ...
 407   // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp_after_call
 409   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 410   // -27 [ saved xmm15          ]
 411   //     [ saved xmm7-xmm14     ]
 412   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 413   //  -7 [ saved r15            ]
 414   //  -6 [ saved r14            ]
 415   //  -5 [ saved r13            ]
 416   //  -4 [ saved r12            ]
 417   //  -3 [ saved rdi            ]
 418   //  -2 [ saved rsi            ]
 419   //  -1 [ saved rbx            ]
 420   //   0 [ saved rbp            ] <--- rbp
 421   //   1 [ return address       ]
 422   //   2 [ call wrapper         ]
 423   //   3 [ result               ]
 424   //   4 [ result type          ]
 425   //   5 [ method               ]
 426   //   6 [ entry point          ]
 427   //   7 [ parameters           ]
 428   //   8 [ parameter size       ]
 429   //   9 [ thread               ]
 430   //
  //    Windows reserves the caller's stack space for arguments 1-4.
 432   //    We spill c_rarg0-c_rarg3 to this space.
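  //
  //    For orientation, a sketch of the C++ side of this contract: the stub
  //    is reached through a function pointer shaped roughly like the CallStub
  //    typedef in stubRoutines.hpp (shown here as an illustrative sketch, not
  //    a verbatim copy):
  //
  //      typedef void (*CallStub)(address   link,          // call wrapper
  //                               intptr_t* result,
  //                               BasicType result_type,
  //                               Method*   method,
  //                               address   entry_point,
  //                               intptr_t* parameters,
  //                               int       size_of_parameters,
  //                               TRAPS);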
 433 
 434   // Call stub stack layout word offsets from rbp
 435   enum call_stub_layout {
 436 #ifdef _WIN64
 437     xmm_save_first     = 6,  // save from xmm6
 438     xmm_save_last      = 31, // to xmm31
 439     xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
 441     r15_off            = -7,
 442     r14_off            = -6,
 443     r13_off            = -5,
 444     r12_off            = -4,
 445     rdi_off            = -3,
 446     rsi_off            = -2,
 447     rbx_off            = -1,
 448     rbp_off            =  0,
 449     retaddr_off        =  1,
 450     call_wrapper_off   =  2,
 451     result_off         =  3,
 452     result_type_off    =  4,
 453     method_off         =  5,
 454     entry_point_off    =  6,
 455     parameters_off     =  7,
 456     parameter_size_off =  8,
 457     thread_off         =  9
 458 #else
 459     rsp_after_call_off = -12,
 460     mxcsr_off          = rsp_after_call_off,
 461     r15_off            = -11,
 462     r14_off            = -10,
 463     r13_off            = -9,
 464     r12_off            = -8,
 465     rbx_off            = -7,
 466     call_wrapper_off   = -6,
 467     result_off         = -5,
 468     result_type_off    = -4,
 469     method_off         = -3,
 470     entry_point_off    = -2,
 471     parameters_off     = -1,
 472     rbp_off            =  0,
 473     retaddr_off        =  1,
 474     parameter_size_off =  2,
 475     thread_off         =  3
 476 #endif
 477   };
 478 
 479 #ifdef _WIN64
 480   Address xmm_save(int reg) {
 481     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 482     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 483   }
 484 #endif
 485 
 486   address generate_call_stub(address& return_address) {
 487     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 488            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 489            "adjust this code");
 490     StubCodeMark mark(this, "StubRoutines", "call_stub");
 491     address start = __ pc();
 492 
 493     // same as in generate_catch_exception()!
 494     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 495 
 496     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 497     const Address result        (rbp, result_off         * wordSize);
 498     const Address result_type   (rbp, result_type_off    * wordSize);
 499     const Address method        (rbp, method_off         * wordSize);
 500     const Address entry_point   (rbp, entry_point_off    * wordSize);
 501     const Address parameters    (rbp, parameters_off     * wordSize);
 502     const Address parameter_size(rbp, parameter_size_off * wordSize);
 503 
 504     // same as in generate_catch_exception()!
 505     const Address thread        (rbp, thread_off         * wordSize);
 506 
 507     const Address r15_save(rbp, r15_off * wordSize);
 508     const Address r14_save(rbp, r14_off * wordSize);
 509     const Address r13_save(rbp, r13_off * wordSize);
 510     const Address r12_save(rbp, r12_off * wordSize);
 511     const Address rbx_save(rbp, rbx_off * wordSize);
 512 
 513     // stub code
 514     __ enter();
 515     __ subptr(rsp, -rsp_after_call_off * wordSize);
 516 
 517     // save register parameters
 518 #ifndef _WIN64
 519     __ movptr(parameters,   c_rarg5); // parameters
 520     __ movptr(entry_point,  c_rarg4); // entry_point
 521 #endif
 522 
 523     __ movptr(method,       c_rarg3); // method
 524     __ movl(result_type,  c_rarg2);   // result type
 525     __ movptr(result,       c_rarg1); // result
 526     __ movptr(call_wrapper, c_rarg0); // call wrapper
 527 
 528     // save regs belonging to calling function
 529     __ movptr(rbx_save, rbx);
 530     __ movptr(r12_save, r12);
 531     __ movptr(r13_save, r13);
 532     __ movptr(r14_save, r14);
 533     __ movptr(r15_save, r15);
 534     if (UseAVX > 2) {
 535       __ movl(rbx, 0xffff);
 536       __ kmovwl(k1, rbx);
 537     }
 538 #ifdef _WIN64
 539     int last_reg = 15;
 540     if (UseAVX > 2) {
 541       last_reg = 31;
 542     }
 543     if (VM_Version::supports_evex()) {
 544       for (int i = xmm_save_first; i <= last_reg; i++) {
 545         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 546       }
 547     } else {
 548       for (int i = xmm_save_first; i <= last_reg; i++) {
 549         __ movdqu(xmm_save(i), as_XMMRegister(i));
 550       }
 551     }
 552 
 553     const Address rdi_save(rbp, rdi_off * wordSize);
 554     const Address rsi_save(rbp, rsi_off * wordSize);
 555 
 556     __ movptr(rsi_save, rsi);
 557     __ movptr(rdi_save, rdi);
 558 #else
 559     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 560     {
 561       Label skip_ldmx;
 562       __ stmxcsr(mxcsr_save);
 563       __ movl(rax, mxcsr_save);
 564       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 565       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 566       __ cmp32(rax, mxcsr_std);
 567       __ jcc(Assembler::equal, skip_ldmx);
 568       __ ldmxcsr(mxcsr_std);
 569       __ bind(skip_ldmx);
 570     }
 571 #endif
 572 
 573     // Load up thread register
 574     __ movptr(r15_thread, thread);
 575     __ reinit_heapbase();
 576 
 577 #ifdef ASSERT
 578     // make sure we have no pending exceptions
 579     {
 580       Label L;
 581       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 582       __ jcc(Assembler::equal, L);
 583       __ stop("StubRoutines::call_stub: entered with pending exception");
 584       __ bind(L);
 585     }
 586 #endif
 587 
 588     // pass parameters if any
 589     BLOCK_COMMENT("pass parameters if any");
 590     Label parameters_done;
 591     __ movl(c_rarg3, parameter_size);
 592     __ testl(c_rarg3, c_rarg3);
 593     __ jcc(Assembler::zero, parameters_done);
 594 
 595     Label loop;
 596     __ movptr(c_rarg2, parameters);       // parameter pointer
 597     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 598     __ BIND(loop);
 599     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 600     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 601     __ decrementl(c_rarg1);             // decrement counter
 602     __ push(rax);                       // pass parameter
 603     __ jcc(Assembler::notZero, loop);
 604 
 605     // call Java function
 606     __ BIND(parameters_done);
 607     __ movptr(rbx, method);             // get Method*
 608     __ movptr(c_rarg1, entry_point);    // get entry_point
 609     __ mov(r13, rsp);                   // set sender sp
 610     BLOCK_COMMENT("call Java function");
 611     __ call(c_rarg1);
 612 
 613     BLOCK_COMMENT("call_stub_return_address:");
 614     return_address = __ pc();
 615 
 616     // store result depending on type (everything that is not
 617     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 618     __ movptr(c_rarg0, result);
 619     Label is_long, is_float, is_double, exit;
 620     __ movl(c_rarg1, result_type);
 621     __ cmpl(c_rarg1, T_OBJECT);
 622     __ jcc(Assembler::equal, is_long);
 623     __ cmpl(c_rarg1, T_LONG);
 624     __ jcc(Assembler::equal, is_long);
 625     __ cmpl(c_rarg1, T_FLOAT);
 626     __ jcc(Assembler::equal, is_float);
 627     __ cmpl(c_rarg1, T_DOUBLE);
 628     __ jcc(Assembler::equal, is_double);
 629 
 630     // handle T_INT case
 631     __ movl(Address(c_rarg0, 0), rax);
 632 
 633     __ BIND(exit);
 634 
 635     // pop parameters
 636     __ lea(rsp, rsp_after_call);
 637 
 638 #ifdef ASSERT
 639     // verify that threads correspond
 640     {
      Label L1, L2, L3;
 642       __ cmpptr(r15_thread, thread);
 643       __ jcc(Assembler::equal, L1);
 644       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 645       __ bind(L1);
 646       __ get_thread(rbx);
 647       __ cmpptr(r15_thread, thread);
 648       __ jcc(Assembler::equal, L2);
 649       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 650       __ bind(L2);
 651       __ cmpptr(r15_thread, rbx);
 652       __ jcc(Assembler::equal, L3);
 653       __ stop("StubRoutines::call_stub: threads must correspond");
 654       __ bind(L3);
 655     }
 656 #endif
 657 
 658     // restore regs belonging to calling function
 659 #ifdef _WIN64
 660     // emit the restores for xmm regs
 661     if (VM_Version::supports_evex()) {
 662       for (int i = xmm_save_first; i <= last_reg; i++) {
 663         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 664       }
 665     } else {
 666       for (int i = xmm_save_first; i <= last_reg; i++) {
 667         __ movdqu(as_XMMRegister(i), xmm_save(i));
 668       }
 669     }
 670 #endif
 671     __ movptr(r15, r15_save);
 672     __ movptr(r14, r14_save);
 673     __ movptr(r13, r13_save);
 674     __ movptr(r12, r12_save);
 675     __ movptr(rbx, rbx_save);
 676 
 677 #ifdef _WIN64
 678     __ movptr(rdi, rdi_save);
 679     __ movptr(rsi, rsi_save);
 680 #else
 681     __ ldmxcsr(mxcsr_save);
 682 #endif
 683 
 684     // restore rsp
 685     __ addptr(rsp, -rsp_after_call_off * wordSize);
 686 
 687     // return
 688     __ vzeroupper();
 689     __ pop(rbp);
 690     __ ret(0);
 691 
 692     // handle return types different from T_INT
 693     __ BIND(is_long);
 694     __ movq(Address(c_rarg0, 0), rax);
 695     __ jmp(exit);
 696 
 697     __ BIND(is_float);
 698     __ movflt(Address(c_rarg0, 0), xmm0);
 699     __ jmp(exit);
 700 
 701     __ BIND(is_double);
 702     __ movdbl(Address(c_rarg0, 0), xmm0);
 703     __ jmp(exit);
 704 
 705     return start;
 706   }
 707 
 708   // Return point for a Java call if there's an exception thrown in
 709   // Java code.  The exception is caught and transformed into a
 710   // pending exception stored in JavaThread that can be tested from
 711   // within the VM.
 712   //
 713   // Note: Usually the parameters are removed by the callee. In case
 714   // of an exception crossing an activation frame boundary, that is
 715   // not the case if the callee is compiled code => need to setup the
 716   // rsp.
 717   //
 718   // rax: exception oop
 719 
 720   address generate_catch_exception() {
 721     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 722     address start = __ pc();
 723 
 724     // same as in generate_call_stub():
 725     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 726     const Address thread        (rbp, thread_off         * wordSize);
 727 
 728 #ifdef ASSERT
 729     // verify that threads correspond
 730     {
 731       Label L1, L2, L3;
 732       __ cmpptr(r15_thread, thread);
 733       __ jcc(Assembler::equal, L1);
 734       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 735       __ bind(L1);
 736       __ get_thread(rbx);
 737       __ cmpptr(r15_thread, thread);
 738       __ jcc(Assembler::equal, L2);
 739       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 740       __ bind(L2);
 741       __ cmpptr(r15_thread, rbx);
 742       __ jcc(Assembler::equal, L3);
 743       __ stop("StubRoutines::catch_exception: threads must correspond");
 744       __ bind(L3);
 745     }
 746 #endif
 747 
 748     // set pending exception
 749     __ verify_oop(rax);
 750 
 751     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 752     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 753     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 754     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 755 
 756     // complete return to VM
 757     assert(StubRoutines::_call_stub_return_address != NULL,
 758            "_call_stub_return_address must have been generated before");
 759     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 760 
 761     return start;
 762   }
 763 
 764   // Continuation point for runtime calls returning with a pending
 765   // exception.  The pending exception check happened in the runtime
 766   // or native call stub.  The pending exception in Thread is
 767   // converted into a Java-level exception.
 768   //
 769   // Contract with Java-level exception handlers:
 770   // rax: exception
 771   // rdx: throwing pc
 772   //
 773   // NOTE: At entry of this stub, exception-pc must be on stack !!
 774 
 775   address generate_forward_exception() {
 776     StubCodeMark mark(this, "StubRoutines", "forward exception");
 777     address start = __ pc();
 778 
 779     // Upon entry, the sp points to the return address returning into
 780     // Java (interpreted or compiled) code; i.e., the return address
 781     // becomes the throwing pc.
 782     //
 783     // Arguments pushed before the runtime call are still on the stack
 784     // but the exception handler will reset the stack pointer ->
 785     // ignore them.  A potential result in registers can be ignored as
 786     // well.
 787 
 788 #ifdef ASSERT
 789     // make sure this code is only executed if there is a pending exception
 790     {
 791       Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 793       __ jcc(Assembler::notEqual, L);
 794       __ stop("StubRoutines::forward exception: no pending exception (1)");
 795       __ bind(L);
 796     }
 797 #endif
 798 
 799     // compute exception handler into rbx
 800     __ movptr(c_rarg0, Address(rsp, 0));
 801     BLOCK_COMMENT("call exception_handler_for_return_address");
 802     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 803                          SharedRuntime::exception_handler_for_return_address),
 804                     r15_thread, c_rarg0);
 805     __ mov(rbx, rax);
 806 
 807     // setup rax & rdx, remove return address & clear pending exception
 808     __ pop(rdx);
 809     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 810     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 811 
 812 #ifdef ASSERT
 813     // make sure exception is set
 814     {
 815       Label L;
 816       __ testptr(rax, rax);
 817       __ jcc(Assembler::notEqual, L);
 818       __ stop("StubRoutines::forward exception: no pending exception (2)");
 819       __ bind(L);
 820     }
 821 #endif
 822 
 823     // continue at exception handler (return address removed)
 824     // rax: exception
 825     // rbx: exception handler
 826     // rdx: throwing pc
 827     __ verify_oop(rax);
 828     __ jmp(rbx);
 829 
 830     return start;
 831   }
 832 
 833   // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
 834   //
 835   // Arguments :
 836   //    c_rarg0: exchange_value
  //    c_rarg1: dest
 838   //
 839   // Result:
 840   //    *dest <- ex, return (orig *dest)
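  //
  // Illustrative C++ equivalent (a sketch only, not the VM's definition):
  //
  //   jint atomic_xchg(jint exchange_value, volatile jint* dest) {
  //     jint old = *dest;         // both steps are one atomic XCHG below;
  //     *dest = exchange_value;   // XCHG with a memory operand implies LOCK
  //     return old;
  //   }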
 841   address generate_atomic_xchg() {
 842     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 843     address start = __ pc();
 844 
    __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow
 846     __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
 847     __ ret(0);
 848 
 849     return start;
 850   }
 851 
 852   // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
 853   //
 854   // Arguments :
 855   //    c_rarg0: exchange_value
 856   //    c_rarg1: dest
 857   //
 858   // Result:
 859   //    *dest <- ex, return (orig *dest)
 860   address generate_atomic_xchg_long() {
 861     StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
 862     address start = __ pc();
 863 
    __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
 865     __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
 866     __ ret(0);
 867 
 868     return start;
 869   }
 870 
 871   // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
 872   //                                         jint compare_value)
 873   //
 874   // Arguments :
 875   //    c_rarg0: exchange_value
 876   //    c_rarg1: dest
 877   //    c_rarg2: compare_value
 878   //
 879   // Result:
 880   //    if ( compare_value == *dest ) {
 881   //       *dest = exchange_value
 882   //       return compare_value;
  //    } else
 884   //       return *dest;
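  //
  // Note on the implementation below (sketch): x86 CMPXCHG implicitly
  // compares against, and leaves the old memory value in, rax/eax. That is
  // why compare_value is loaded into rax first; whatever ends up in rax
  // after the locked cmpxchg is exactly the documented return value.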
 885   address generate_atomic_cmpxchg() {
 886     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 887     address start = __ pc();
 888 
 889     __ movl(rax, c_rarg2);
    if (os::is_MP()) __ lock();
 891     __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
 892     __ ret(0);
 893 
 894     return start;
 895   }
 896 
 897   // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
 898   //                                           int8_t compare_value)
 899   //
 900   // Arguments :
 901   //    c_rarg0: exchange_value
 902   //    c_rarg1: dest
 903   //    c_rarg2: compare_value
 904   //
 905   // Result:
 906   //    if ( compare_value == *dest ) {
 907   //       *dest = exchange_value
 908   //       return compare_value;
  //    } else
 910   //       return *dest;
 911   address generate_atomic_cmpxchg_byte() {
 912     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
 913     address start = __ pc();
 914 
 915     __ movsbq(rax, c_rarg2);
    if (os::is_MP()) __ lock();
 917     __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
 918     __ ret(0);
 919 
 920     return start;
 921   }
 922 
 923   // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
 924   //                                            volatile int64_t* dest,
 925   //                                            int64_t compare_value)
 926   // Arguments :
 927   //    c_rarg0: exchange_value
 928   //    c_rarg1: dest
 929   //    c_rarg2: compare_value
 930   //
 931   // Result:
 932   //    if ( compare_value == *dest ) {
 933   //       *dest = exchange_value
 934   //       return compare_value;
  //    } else
 936   //       return *dest;
 937   address generate_atomic_cmpxchg_long() {
 938     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 939     address start = __ pc();
 940 
 941     __ movq(rax, c_rarg2);
    if (os::is_MP()) __ lock();
 943     __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
 944     __ ret(0);
 945 
 946     return start;
 947   }
 948 
 949   // Support for jint atomic::add(jint add_value, volatile jint* dest)
 950   //
 951   // Arguments :
 952   //    c_rarg0: add_value
 953   //    c_rarg1: dest
 954   //
 955   // Result:
 956   //    *dest += add_value
 957   //    return *dest;
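  //
  // Illustrative C++ equivalent (a sketch): LOCK XADD leaves the *old* value
  // in the source register, so the stub adds add_value back to return the
  // new value:
  //
  //   jint atomic_add(jint add_value, volatile jint* dest) {
  //     jint old = *dest;          // fetch-and-add, done atomically (XADD)
  //     *dest = old + add_value;
  //     return old + add_value;    // new value
  //   }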
 958   address generate_atomic_add() {
 959     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 960     address start = __ pc();
 961 
 962     __ movl(rax, c_rarg0);
    if (os::is_MP()) __ lock();
 964     __ xaddl(Address(c_rarg1, 0), c_rarg0);
 965     __ addl(rax, c_rarg0);
 966     __ ret(0);
 967 
 968     return start;
 969   }
 970 
 971   // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
 972   //
 973   // Arguments :
 974   //    c_rarg0: add_value
 975   //    c_rarg1: dest
 976   //
 977   // Result:
 978   //    *dest += add_value
 979   //    return *dest;
 980   address generate_atomic_add_long() {
 981     StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
 982     address start = __ pc();
 983 
    __ movptr(rax, c_rarg0); // Copy to eax; we need a return value anyhow
    if (os::is_MP()) __ lock();
 986     __ xaddptr(Address(c_rarg1, 0), c_rarg0);
 987     __ addptr(rax, c_rarg0);
 988     __ ret(0);
 989 
 990     return start;
 991   }
 992 
 993   // Support for intptr_t OrderAccess::fence()
 994   //
 995   // Arguments :
 996   //
 997   // Result:
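  //
  // Note (sketch): StoreLoad is the only ordering that needs an explicit
  // barrier on x86; membar(StoreLoad) below is typically emitted as a locked
  // add of zero to a stack location rather than MFENCE, since the locked
  // form usually provides the same ordering more cheaply.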
 998   address generate_orderaccess_fence() {
 999     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
1000     address start = __ pc();
1001     __ membar(Assembler::StoreLoad);
1002     __ ret(0);
1003 
1004     return start;
1005   }
1006 
1007   // Support for intptr_t get_previous_fp()
1008   //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
1013   address generate_get_previous_fp() {
1014     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
1015     const Address old_fp(rbp, 0);
1016     const Address older_fp(rax, 0);
1017     address start = __ pc();
1018 
1019     __ enter();
1020     __ movptr(rax, old_fp); // callers fp
1021     __ movptr(rax, older_fp); // the frame for ps()
1022     __ pop(rbp);
1023     __ ret(0);
1024 
1025     return start;
1026   }
1027 
1028   // Support for intptr_t get_previous_sp()
1029   //
1030   // This routine is used to find the previous stack pointer for the
1031   // caller.
1032   address generate_get_previous_sp() {
1033     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
1034     address start = __ pc();
1035 
1036     __ movptr(rax, rsp);
1037     __ addptr(rax, 8); // return address is at the top of the stack.
1038     __ ret(0);
1039 
1040     return start;
1041   }
1042 
1043   //----------------------------------------------------------------------------------------------------
1044   // Support for void verify_mxcsr()
1045   //
1046   // This routine is used with -Xcheck:jni to verify that native
1047   // JNI code does not return to Java code without restoring the
1048   // MXCSR register to our expected state.
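  //
  // Usage sketch: the check is active only when CheckJNICalls is set, which
  // is what the -Xcheck:jni command line option turns on, e.g.
  //
  //   java -Xcheck:jni MyApp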
1049 
1050   address generate_verify_mxcsr() {
1051     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
1052     address start = __ pc();
1053 
1054     const Address mxcsr_save(rsp, 0);
1055 
1056     if (CheckJNICalls) {
1057       Label ok_ret;
1058       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
1059       __ push(rax);
1060       __ subptr(rsp, wordSize);      // allocate a temp location
1061       __ stmxcsr(mxcsr_save);
1062       __ movl(rax, mxcsr_save);
1063       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
1064       __ cmp32(rax, mxcsr_std);
1065       __ jcc(Assembler::equal, ok_ret);
1066 
1067       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
1068 
1069       __ ldmxcsr(mxcsr_std);
1070 
1071       __ bind(ok_ret);
1072       __ addptr(rsp, wordSize);
1073       __ pop(rax);
1074     }
1075 
1076     __ ret(0);
1077 
1078     return start;
1079   }
1080 
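  // The *_fixup stubs below patch up the result of CVTTSS2SI/CVTTSD2SI when
  // the hardware produced the "integer indefinite" value (0x80000000 or
  // 0x8000000000000000) for a NaN or out-of-range input, restoring the Java
  // conversion semantics. Roughly, for f2i (the other variants are
  // analogous; this is a sketch, not the stub's literal control flow):
  //
  //   if (isNaN(x))      return 0;
  //   else if (x > 0)    return Integer.MAX_VALUE;
  //   else               return Integer.MIN_VALUE;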
1081   address generate_f2i_fixup() {
1082     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
1083     Address inout(rsp, 5 * wordSize); // return address + 4 saves
1084 
1085     address start = __ pc();
1086 
1087     Label L;
1088 
1089     __ push(rax);
1090     __ push(c_rarg3);
1091     __ push(c_rarg2);
1092     __ push(c_rarg1);
1093 
1094     __ movl(rax, 0x7f800000);
1095     __ xorl(c_rarg3, c_rarg3);
1096     __ movl(c_rarg2, inout);
1097     __ movl(c_rarg1, c_rarg2);
1098     __ andl(c_rarg1, 0x7fffffff);
1099     __ cmpl(rax, c_rarg1); // NaN? -> 0
1100     __ jcc(Assembler::negative, L);
1101     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
1102     __ movl(c_rarg3, 0x80000000);
1103     __ movl(rax, 0x7fffffff);
1104     __ cmovl(Assembler::positive, c_rarg3, rax);
1105 
1106     __ bind(L);
1107     __ movptr(inout, c_rarg3);
1108 
1109     __ pop(c_rarg1);
1110     __ pop(c_rarg2);
1111     __ pop(c_rarg3);
1112     __ pop(rax);
1113 
1114     __ ret(0);
1115 
1116     return start;
1117   }
1118 
1119   address generate_f2l_fixup() {
1120     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
1121     Address inout(rsp, 5 * wordSize); // return address + 4 saves
1122     address start = __ pc();
1123 
1124     Label L;
1125 
1126     __ push(rax);
1127     __ push(c_rarg3);
1128     __ push(c_rarg2);
1129     __ push(c_rarg1);
1130 
1131     __ movl(rax, 0x7f800000);
1132     __ xorl(c_rarg3, c_rarg3);
1133     __ movl(c_rarg2, inout);
1134     __ movl(c_rarg1, c_rarg2);
1135     __ andl(c_rarg1, 0x7fffffff);
1136     __ cmpl(rax, c_rarg1); // NaN? -> 0
1137     __ jcc(Assembler::negative, L);
1138     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
1139     __ mov64(c_rarg3, 0x8000000000000000);
1140     __ mov64(rax, 0x7fffffffffffffff);
1141     __ cmov(Assembler::positive, c_rarg3, rax);
1142 
1143     __ bind(L);
1144     __ movptr(inout, c_rarg3);
1145 
1146     __ pop(c_rarg1);
1147     __ pop(c_rarg2);
1148     __ pop(c_rarg3);
1149     __ pop(rax);
1150 
1151     __ ret(0);
1152 
1153     return start;
1154   }
1155 
1156   address generate_d2i_fixup() {
1157     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
1158     Address inout(rsp, 6 * wordSize); // return address + 5 saves
1159 
1160     address start = __ pc();
1161 
1162     Label L;
1163 
1164     __ push(rax);
1165     __ push(c_rarg3);
1166     __ push(c_rarg2);
1167     __ push(c_rarg1);
1168     __ push(c_rarg0);
1169 
1170     __ movl(rax, 0x7ff00000);
1171     __ movq(c_rarg2, inout);
1172     __ movl(c_rarg3, c_rarg2);
1173     __ mov(c_rarg1, c_rarg2);
1174     __ mov(c_rarg0, c_rarg2);
1175     __ negl(c_rarg3);
1176     __ shrptr(c_rarg1, 0x20);
1177     __ orl(c_rarg3, c_rarg2);
1178     __ andl(c_rarg1, 0x7fffffff);
1179     __ xorl(c_rarg2, c_rarg2);
1180     __ shrl(c_rarg3, 0x1f);
1181     __ orl(c_rarg1, c_rarg3);
1182     __ cmpl(rax, c_rarg1);
1183     __ jcc(Assembler::negative, L); // NaN -> 0
1184     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
1185     __ movl(c_rarg2, 0x80000000);
1186     __ movl(rax, 0x7fffffff);
1187     __ cmov(Assembler::positive, c_rarg2, rax);
1188 
1189     __ bind(L);
1190     __ movptr(inout, c_rarg2);
1191 
1192     __ pop(c_rarg0);
1193     __ pop(c_rarg1);
1194     __ pop(c_rarg2);
1195     __ pop(c_rarg3);
1196     __ pop(rax);
1197 
1198     __ ret(0);
1199 
1200     return start;
1201   }
1202 
1203   address generate_d2l_fixup() {
1204     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
1205     Address inout(rsp, 6 * wordSize); // return address + 5 saves
1206 
1207     address start = __ pc();
1208 
1209     Label L;
1210 
1211     __ push(rax);
1212     __ push(c_rarg3);
1213     __ push(c_rarg2);
1214     __ push(c_rarg1);
1215     __ push(c_rarg0);
1216 
1217     __ movl(rax, 0x7ff00000);
1218     __ movq(c_rarg2, inout);
1219     __ movl(c_rarg3, c_rarg2);
1220     __ mov(c_rarg1, c_rarg2);
1221     __ mov(c_rarg0, c_rarg2);
1222     __ negl(c_rarg3);
1223     __ shrptr(c_rarg1, 0x20);
1224     __ orl(c_rarg3, c_rarg2);
1225     __ andl(c_rarg1, 0x7fffffff);
1226     __ xorl(c_rarg2, c_rarg2);
1227     __ shrl(c_rarg3, 0x1f);
1228     __ orl(c_rarg1, c_rarg3);
1229     __ cmpl(rax, c_rarg1);
1230     __ jcc(Assembler::negative, L); // NaN -> 0
1231     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
1232     __ mov64(c_rarg2, 0x8000000000000000);
1233     __ mov64(rax, 0x7fffffffffffffff);
1234     __ cmovq(Assembler::positive, c_rarg2, rax);
1235 
1236     __ bind(L);
1237     __ movq(inout, c_rarg2);
1238 
1239     __ pop(c_rarg0);
1240     __ pop(c_rarg1);
1241     __ pop(c_rarg2);
1242     __ pop(c_rarg3);
1243     __ pop(rax);
1244 
1245     __ ret(0);
1246 
1247     return start;
1248   }
1249 
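  // The two helpers below emit aligned constant data rather than code: a
  // 16-byte mask (generate_fp_mask) and a 64-byte mask (generate_vector_fp_mask).
  // Typical use (an assumption stated for readability): such masks feed
  // bitwise SSE/AVX instructions, e.g. clearing the sign bit with ANDPS/ANDPD
  // for abs(), or flipping it with XORPS/XORPD for negation.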
1250   address generate_fp_mask(const char *stub_name, int64_t mask) {
1251     __ align(CodeEntryAlignment);
1252     StubCodeMark mark(this, "StubRoutines", stub_name);
1253     address start = __ pc();
1254 
1255     __ emit_data64( mask, relocInfo::none );
1256     __ emit_data64( mask, relocInfo::none );
1257 
1258     return start;
1259   }
1260 
1261   address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
1262     __ align(CodeEntryAlignment);
1263     StubCodeMark mark(this, "StubRoutines", stub_name);
1264     address start = __ pc();
1265 
1266     __ emit_data64(mask, relocInfo::none);
1267     __ emit_data64(mask, relocInfo::none);
1268     __ emit_data64(mask, relocInfo::none);
1269     __ emit_data64(mask, relocInfo::none);
1270     __ emit_data64(mask, relocInfo::none);
1271     __ emit_data64(mask, relocInfo::none);
1272     __ emit_data64(mask, relocInfo::none);
1273     __ emit_data64(mask, relocInfo::none);
1274 
1275     return start;
1276   }
1277 
1278   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
1279                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
1280                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
1281                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
1282                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
1283     __ align(CodeEntryAlignment);
1284     StubCodeMark mark(this, "StubRoutines", stub_name);
1285     address start = __ pc();
1286 
1287     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
1288     __ emit_data(val0, relocInfo::none, 0);
1289     __ emit_data(val1, relocInfo::none, 0);
1290     __ emit_data(val2, relocInfo::none, 0);
1291     __ emit_data(val3, relocInfo::none, 0);
1292     if (len >= Assembler::AVX_256bit) {
1293       __ emit_data(val4, relocInfo::none, 0);
1294       __ emit_data(val5, relocInfo::none, 0);
1295       __ emit_data(val6, relocInfo::none, 0);
1296       __ emit_data(val7, relocInfo::none, 0);
1297       if (len >= Assembler::AVX_512bit) {
1298         __ emit_data(val8, relocInfo::none, 0);
1299         __ emit_data(val9, relocInfo::none, 0);
1300         __ emit_data(val10, relocInfo::none, 0);
1301         __ emit_data(val11, relocInfo::none, 0);
1302         __ emit_data(val12, relocInfo::none, 0);
1303         __ emit_data(val13, relocInfo::none, 0);
1304         __ emit_data(val14, relocInfo::none, 0);
1305         __ emit_data(val15, relocInfo::none, 0);
1306       }
1307     }
1308 
1309     return start;
1310   }
1311 
1312   // Non-destructive plausibility checks for oops
1313   //
1314   // Arguments:
1315   //    all args on stack!
1316   //
1317   // Stack after saving c_rarg3:
1318   //    [tos + 0]: saved c_rarg3
1319   //    [tos + 1]: saved c_rarg2
1320   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
1321   //    [tos + 3]: saved flags
1322   //    [tos + 4]: return address
1323   //  * [tos + 5]: error message (char*)
1324   //  * [tos + 6]: object to verify (oop)
1325   //  * [tos + 7]: saved rax - saved by caller and bashed
1326   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
1327   //  * = popped on exit
1328   address generate_verify_oop() {
1329     StubCodeMark mark(this, "StubRoutines", "verify_oop");
1330     address start = __ pc();
1331 
1332     Label exit, error;
1333 
1334     __ pushf();
1335     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
1336 
1337     __ push(r12);
1338 
1339     // save c_rarg2 and c_rarg3
1340     __ push(c_rarg2);
1341     __ push(c_rarg3);
1342 
1343     enum {
1344            // After previous pushes.
1345            oop_to_verify = 6 * wordSize,
1346            saved_rax     = 7 * wordSize,
1347            saved_r10     = 8 * wordSize,
1348 
1349            // Before the call to MacroAssembler::debug(), see below.
1350            return_addr   = 16 * wordSize,
1351            error_msg     = 17 * wordSize
1352     };
1353 
1354     // get object
1355     __ movptr(rax, Address(rsp, oop_to_verify));
1356 
1357     // make sure object is 'reasonable'
1358     __ testptr(rax, rax);
1359     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1360 
1361 #if INCLUDE_ZGC
1362     if (UseZGC) {
1363       // Check if metadata bits indicate a bad oop
1364       __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
1365       __ jcc(Assembler::notZero, error);
1366     }
1367 #endif
1368 
1369     // Check if the oop is in the right area of memory
1370     __ movptr(c_rarg2, rax);
1371     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1372     __ andptr(c_rarg2, c_rarg3);
1373     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1374     __ cmpptr(c_rarg2, c_rarg3);
1375     __ jcc(Assembler::notZero, error);
1376 
1377     // set r12 to heapbase for load_klass()
1378     __ reinit_heapbase();
1379 
1380     // make sure klass is 'reasonable', which is not zero.
1381     __ load_klass(rax, rax);  // get klass
1382     __ testptr(rax, rax);
1383     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1384 
1385     // return if everything seems ok
1386     __ bind(exit);
1387     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1388     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1389     __ pop(c_rarg3);                             // restore c_rarg3
1390     __ pop(c_rarg2);                             // restore c_rarg2
1391     __ pop(r12);                                 // restore r12
1392     __ popf();                                   // restore flags
1393     __ ret(4 * wordSize);                        // pop caller saved stuff
1394 
1395     // handle errors
1396     __ bind(error);
1397     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1398     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1399     __ pop(c_rarg3);                             // get saved c_rarg3 back
1400     __ pop(c_rarg2);                             // get saved c_rarg2 back
1401     __ pop(r12);                                 // get saved r12 back
1402     __ popf();                                   // get saved flags off stack --
1403                                                  // will be ignored
1404 
1405     __ pusha();                                  // push registers
                                                 // (rip is already pushed)
1408     // debug(char* msg, int64_t pc, int64_t regs[])
1409     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1410     // pushed all the registers, so now the stack looks like:
1411     //     [tos +  0] 16 saved registers
1412     //     [tos + 16] return address
1413     //   * [tos + 17] error message (char*)
1414     //   * [tos + 18] object to verify (oop)
1415     //   * [tos + 19] saved rax - saved by caller and bashed
1416     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1417     //   * = popped on exit
1418 
1419     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1420     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1421     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1422     __ mov(r12, rsp);                               // remember rsp
1423     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1424     __ andptr(rsp, -16);                            // align stack as required by ABI
1425     BLOCK_COMMENT("call MacroAssembler::debug");
1426     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1427     __ mov(rsp, r12);                               // restore rsp
1428     __ popa();                                      // pop registers (includes r12)
1429     __ ret(4 * wordSize);                           // pop caller saved stuff
1430 
1431     return start;
1432   }
1433 
1434   //
1435   // Verify that a register contains clean 32-bits positive value
1436   // (high 32-bits are 0) so it could be used in 64-bits shifts.
1437   //
1438   //  Input:
1439   //    Rint  -  32-bits value
1440   //    Rtmp  -  scratch
1441   //
1442   void assert_clean_int(Register Rint, Register Rtmp) {
1443 #ifdef ASSERT
1444     Label L;
1445     assert_different_registers(Rtmp, Rint);
1446     __ movslq(Rtmp, Rint);
1447     __ cmpq(Rtmp, Rint);
1448     __ jcc(Assembler::equal, L);
1449     __ stop("high 32-bits of int value are not 0");
1450     __ bind(L);
1451 #endif
1452   }
1453 
1454   //  Generate overlap test for array copy stubs
1455   //
1456   //  Input:
1457   //     c_rarg0 - from
1458   //     c_rarg1 - to
1459   //     c_rarg2 - element count
1460   //
1461   //  Output:
  //     rax   - &from[element count]
1463   //
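  //  In effect the test implements (sketch):
  //
  //    if (to <= from || to >= from + count * element_size)
  //      goto no_overlap;   // disjoint, or 'to' below 'from': forward copy is safe
  //    // otherwise fall through to the conjoint (backward-copying) code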
1464   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1465     assert(no_overlap_target != NULL, "must be generated");
1466     array_overlap_test(no_overlap_target, NULL, sf);
1467   }
1468   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1469     array_overlap_test(NULL, &L_no_overlap, sf);
1470   }
1471   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1472     const Register from     = c_rarg0;
1473     const Register to       = c_rarg1;
1474     const Register count    = c_rarg2;
1475     const Register end_from = rax;
1476 
1477     __ cmpptr(to, from);
1478     __ lea(end_from, Address(from, count, sf, 0));
1479     if (NOLp == NULL) {
1480       ExternalAddress no_overlap(no_overlap_target);
1481       __ jump_cc(Assembler::belowEqual, no_overlap);
1482       __ cmpptr(to, end_from);
1483       __ jump_cc(Assembler::aboveEqual, no_overlap);
1484     } else {
1485       __ jcc(Assembler::belowEqual, (*NOLp));
1486       __ cmpptr(to, end_from);
1487       __ jcc(Assembler::aboveEqual, (*NOLp));
1488     }
1489   }
1490 
1491   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1492   //
1493   // Outputs:
1494   //    rdi <- rcx (c_rarg0)
1495   //    rsi <- rdx (c_rarg1)
1496   //    rdx <- r8  (c_rarg2)
1497   //    rcx <- r9  (c_rarg3)
1498   //
1499   // On Windows, rdi and rsi are non-volatile, so they are saved in r9 and r10;
1500   // the caller must therefore not use r9 or r10.
1501   //
1502   void setup_arg_regs(int nargs = 3) {
1503     const Register saved_rdi = r9;
1504     const Register saved_rsi = r10;
1505     assert(nargs == 3 || nargs == 4, "else fix");
1506 #ifdef _WIN64
1507     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1508            "unexpected argument registers");
1509     if (nargs >= 4)
1510       __ mov(rax, r9);  // r9 is also saved_rdi
1511     __ movptr(saved_rdi, rdi);
1512     __ movptr(saved_rsi, rsi);
1513     __ mov(rdi, rcx); // c_rarg0
1514     __ mov(rsi, rdx); // c_rarg1
1515     __ mov(rdx, r8);  // c_rarg2
1516     if (nargs >= 4)
1517       __ mov(rcx, rax); // c_rarg3 (via rax)
1518 #else
1519     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1520            "unexpected argument registers");
1521 #endif
1522   }
1523 
1524   void restore_arg_regs() {
1525     const Register saved_rdi = r9;
1526     const Register saved_rsi = r10;
1527 #ifdef _WIN64
1528     __ movptr(rdi, saved_rdi);
1529     __ movptr(rsi, saved_rsi);
1530 #endif
1531   }
1532 
1533 
1534   // Copy big chunks forward
1535   //
1536   // Inputs:
1537   //   end_from     - source array end address
1538   //   end_to       - destination array end address
1539   //   qword_count  - 64-bit element count, negative
1540   //   to           - scratch
1541   //   L_copy_bytes - entry label
1542   //   L_copy_8_bytes  - exit label
1543   //
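  // Rough shape of the generated loop (a sketch only, not emitted code; the
  // real loop moves 32 or 64 bytes per iteration with SSE/AVX when available):
  //   // qword_count is negative; end_from/end_to point at the last qword
  //   while ((qword_count += qwords_per_chunk) <= 0) {
  //     copy one chunk ending at end_from[qword_count * 8];
  //   }
  //   // fewer than one chunk remains; the caller's L_copy_8_bytes loop
  //   // finishes the trailing qwords
  //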
1544   void copy_bytes_forward(Register end_from, Register end_to,
1545                              Register qword_count, Register to,
1546                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1547     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1548     Label L_loop;
1549     __ align(OptoLoopAlignment);
1550     if (UseUnalignedLoadStores) {
1551       Label L_end;
1552       if (UseAVX > 2) {
1553         __ movl(to, 0xffff);
1554         __ kmovwl(k1, to);
1555       }
1556       // Copy 64-bytes per iteration
1557       __ BIND(L_loop);
1558       if (UseAVX > 2) {
1559         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
1560         __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
1561       } else if (UseAVX == 2) {
1562         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1563         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1564         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1565         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1566       } else {
1567         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1568         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1569         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1570         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1571         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1572         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1573         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1574         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1575       }
1576       __ BIND(L_copy_bytes);
1577       __ addptr(qword_count, 8);
1578       __ jcc(Assembler::lessEqual, L_loop);
1579       __ subptr(qword_count, 4);  // sub(8) and add(4)
1580       __ jccb(Assembler::greater, L_end);
1581       // Copy trailing 32 bytes
1582       if (UseAVX >= 2) {
1583         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1584         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1585       } else {
1586         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1587         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1588         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1589         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1590       }
1591       __ addptr(qword_count, 4);
1592       __ BIND(L_end);
1593       if (UseAVX >= 2) {
1594         // clean upper bits of YMM registers
1595         __ vpxor(xmm0, xmm0);
1596         __ vpxor(xmm1, xmm1);
1597       }
1598     } else {
1599       // Copy 32-bytes per iteration
1600       __ BIND(L_loop);
1601       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1602       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1603       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1604       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1605       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1606       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1607       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1608       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1609 
1610       __ BIND(L_copy_bytes);
1611       __ addptr(qword_count, 4);
1612       __ jcc(Assembler::lessEqual, L_loop);
1613     }
1614     __ subptr(qword_count, 4);
1615     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1616   }
1617 
1618   // Copy big chunks backward
1619   //
1620   // Inputs:
1621   //   from         - source array address
1622   //   dest         - destination array address
1623   //   qword_count  - 64-bit element count
1624   //   to           - scratch
1625   //   L_copy_bytes - entry label
1626   //   L_copy_8_bytes  - exit label
1627   //
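  // Mirror image of copy_bytes_forward: qword_count starts at the element
  // count and is stepped down toward zero, so chunks are copied from the high
  // addresses downward (a sketch of the shape only):
  //   while ((qword_count -= qwords_per_chunk) >= 0) {
  //     copy one chunk starting at from[qword_count * 8];
  //   }
  //   // fewer than one chunk remains; the caller's L_copy_8_bytes loop
  //   // finishes the remaining qwords
  //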
1628   void copy_bytes_backward(Register from, Register dest,
1629                               Register qword_count, Register to,
1630                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1631     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1632     Label L_loop;
1633     __ align(OptoLoopAlignment);
1634     if (UseUnalignedLoadStores) {
1635       Label L_end;
1636       if (UseAVX > 2) {
1637         __ movl(to, 0xffff);
1638         __ kmovwl(k1, to);
1639       }
1640       // Copy 64-bytes per iteration
1641       __ BIND(L_loop);
1642       if (UseAVX > 2) {
1643         __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
1644         __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
1645       } else if (UseAVX == 2) {
1646         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1647         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1648         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1649         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1650       } else {
1651         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1652         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1653         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1654         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1655         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1656         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1657         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1658         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1659       }
1660       __ BIND(L_copy_bytes);
1661       __ subptr(qword_count, 8);
1662       __ jcc(Assembler::greaterEqual, L_loop);
1663 
1664       __ addptr(qword_count, 4);  // add(8) and sub(4)
1665       __ jccb(Assembler::less, L_end);
1666       // Copy trailing 32 bytes
1667       if (UseAVX >= 2) {
1668         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1669         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1670       } else {
1671         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1672         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1673         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1674         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1675       }
1676       __ subptr(qword_count, 4);
1677       __ BIND(L_end);
1678       if (UseAVX >= 2) {
1679         // clean upper bits of YMM registers
1680         __ vpxor(xmm0, xmm0);
1681         __ vpxor(xmm1, xmm1);
1682       }
1683     } else {
1684       // Copy 32-bytes per iteration
1685       __ BIND(L_loop);
1686       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1687       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1688       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1689       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1690       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1691       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1692       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1693       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1694 
1695       __ BIND(L_copy_bytes);
1696       __ subptr(qword_count, 4);
1697       __ jcc(Assembler::greaterEqual, L_loop);
1698     }
1699     __ addptr(qword_count, 4);
1700     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1701   }
1702 
1703 
1704   // Arguments:
1705   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1706   //             ignored
1707   //   name    - stub name string
1708   //
1709   // Inputs:
1710   //   c_rarg0   - source array address
1711   //   c_rarg1   - destination array address
1712   //   c_rarg2   - element count, treated as ssize_t, can be zero
1713   //
1714   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1715   // we let the hardware handle it.  The one to eight bytes within words,
1716   // dwords or qwords that span cache line boundaries will still be loaded
1717   // and stored atomically.
1718   //
1719   // Side Effects:
1720   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1721   //   used by generate_conjoint_byte_copy().
1722   //
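  // Overall shape of the stub (informational sketch only):
  //   the byte count is split into whole qwords plus a 0..7 byte remainder;
  //   the bulk is moved by copy_bytes_forward() and the trailing-qword loop,
  //   and the remainder is finished with at most one dword, one word and one
  //   byte move, selected by the low three bits of the original byte count.
  //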
1723   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1724     __ align(CodeEntryAlignment);
1725     StubCodeMark mark(this, "StubRoutines", name);
1726     address start = __ pc();
1727 
1728     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1729     Label L_copy_byte, L_exit;
1730     const Register from        = rdi;  // source array address
1731     const Register to          = rsi;  // destination array address
1732     const Register count       = rdx;  // elements count
1733     const Register byte_count  = rcx;
1734     const Register qword_count = count;
1735     const Register end_from    = from; // source array end address
1736     const Register end_to      = to;   // destination array end address
1737     // End pointers are inclusive, and if count is not zero they point
1738     // to the last unit copied:  end_to[0] := end_from[0]
1739 
1740     __ enter(); // required for proper stackwalking of RuntimeStub frame
1741     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1742 
1743     if (entry != NULL) {
1744       *entry = __ pc();
1745        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1746       BLOCK_COMMENT("Entry:");
1747     }
1748 
1749     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1750                       // r9 and r10 may be used to save non-volatile registers
1751 
1752     // 'from', 'to' and 'count' are now valid
1753     __ movptr(byte_count, count);
1754     __ shrptr(count, 3); // count => qword_count
1755 
1756     // Copy from low to high addresses.  Use 'to' as scratch.
1757     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1758     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1759     __ negptr(qword_count); // make the count negative
1760     __ jmp(L_copy_bytes);
1761 
1762     // Copy trailing qwords
1763   __ BIND(L_copy_8_bytes);
1764     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1765     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1766     __ increment(qword_count);
1767     __ jcc(Assembler::notZero, L_copy_8_bytes);
1768 
1769     // Check for and copy trailing dword
1770   __ BIND(L_copy_4_bytes);
1771     __ testl(byte_count, 4);
1772     __ jccb(Assembler::zero, L_copy_2_bytes);
1773     __ movl(rax, Address(end_from, 8));
1774     __ movl(Address(end_to, 8), rax);
1775 
1776     __ addptr(end_from, 4);
1777     __ addptr(end_to, 4);
1778 
1779     // Check for and copy trailing word
1780   __ BIND(L_copy_2_bytes);
1781     __ testl(byte_count, 2);
1782     __ jccb(Assembler::zero, L_copy_byte);
1783     __ movw(rax, Address(end_from, 8));
1784     __ movw(Address(end_to, 8), rax);
1785 
1786     __ addptr(end_from, 2);
1787     __ addptr(end_to, 2);
1788 
1789     // Check for and copy trailing byte
1790   __ BIND(L_copy_byte);
1791     __ testl(byte_count, 1);
1792     __ jccb(Assembler::zero, L_exit);
1793     __ movb(rax, Address(end_from, 8));
1794     __ movb(Address(end_to, 8), rax);
1795 
1796   __ BIND(L_exit);
1797     restore_arg_regs();
1798     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1799     __ xorptr(rax, rax); // return 0
1800     __ vzeroupper();
1801     __ leave(); // required for proper stackwalking of RuntimeStub frame
1802     __ ret(0);
1803 
1804     // Copy in multi-byte chunks
1805     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1806     __ jmp(L_copy_4_bytes);
1807 
1808     return start;
1809   }
1810 
1811   // Arguments:
1812   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1813   //             ignored
1814   //   name    - stub name string
1815   //
1816   // Inputs:
1817   //   c_rarg0   - source array address
1818   //   c_rarg1   - destination array address
1819   //   c_rarg2   - element count, treated as ssize_t, can be zero
1820   //
1821   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1822   // we let the hardware handle it.  The one to eight bytes within words,
1823   // dwords or qwords that span cache line boundaries will still be loaded
1824   // and stored atomically.
1825   //
1826   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1827                                       address* entry, const char *name) {
1828     __ align(CodeEntryAlignment);
1829     StubCodeMark mark(this, "StubRoutines", name);
1830     address start = __ pc();
1831 
1832     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1833     const Register from        = rdi;  // source array address
1834     const Register to          = rsi;  // destination array address
1835     const Register count       = rdx;  // elements count
1836     const Register byte_count  = rcx;
1837     const Register qword_count = count;
1838 
1839     __ enter(); // required for proper stackwalking of RuntimeStub frame
1840     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1841 
1842     if (entry != NULL) {
1843       *entry = __ pc();
1844       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1845       BLOCK_COMMENT("Entry:");
1846     }
1847 
1848     array_overlap_test(nooverlap_target, Address::times_1);
1849     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1850                       // r9 and r10 may be used to save non-volatile registers
1851 
1852     // 'from', 'to' and 'count' are now valid
1853     __ movptr(byte_count, count);
1854     __ shrptr(count, 3);   // count => qword_count
1855 
1856     // Copy from high to low addresses.
1857 
1858     // Check for and copy trailing byte
1859     __ testl(byte_count, 1);
1860     __ jcc(Assembler::zero, L_copy_2_bytes);
1861     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1862     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1863     __ decrement(byte_count); // Adjust for possible trailing word
1864 
1865     // Check for and copy trailing word
1866   __ BIND(L_copy_2_bytes);
1867     __ testl(byte_count, 2);
1868     __ jcc(Assembler::zero, L_copy_4_bytes);
1869     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1870     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1871 
1872     // Check for and copy trailing dword
1873   __ BIND(L_copy_4_bytes);
1874     __ testl(byte_count, 4);
1875     __ jcc(Assembler::zero, L_copy_bytes);
1876     __ movl(rax, Address(from, qword_count, Address::times_8));
1877     __ movl(Address(to, qword_count, Address::times_8), rax);
1878     __ jmp(L_copy_bytes);
1879 
1880     // Copy trailing qwords
1881   __ BIND(L_copy_8_bytes);
1882     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1883     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1884     __ decrement(qword_count);
1885     __ jcc(Assembler::notZero, L_copy_8_bytes);
1886 
1887     restore_arg_regs();
1888     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1889     __ xorptr(rax, rax); // return 0
1890     __ vzeroupper();
1891     __ leave(); // required for proper stackwalking of RuntimeStub frame
1892     __ ret(0);
1893 
1894     // Copy in multi-byte chunks
1895     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1896 
1897     restore_arg_regs();
1898     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1899     __ xorptr(rax, rax); // return 0
1900     __ vzeroupper();
1901     __ leave(); // required for proper stackwalking of RuntimeStub frame
1902     __ ret(0);
1903 
1904     return start;
1905   }
1906 
1907   // Arguments:
1908   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1909   //             ignored
1910   //   name    - stub name string
1911   //
1912   // Inputs:
1913   //   c_rarg0   - source array address
1914   //   c_rarg1   - destination array address
1915   //   c_rarg2   - element count, treated as ssize_t, can be zero
1916   //
1917   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1918   // let the hardware handle it.  The two or four words within dwords
1919   // or qwords that span cache line boundaries will still be loaded
1920   // and stored atomically.
1921   //
1922   // Side Effects:
1923   //   disjoint_short_copy_entry is set to the no-overlap entry point
1924   //   used by generate_conjoint_short_copy().
1925   //
1926   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1927     __ align(CodeEntryAlignment);
1928     StubCodeMark mark(this, "StubRoutines", name);
1929     address start = __ pc();
1930 
1931     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1932     const Register from        = rdi;  // source array address
1933     const Register to          = rsi;  // destination array address
1934     const Register count       = rdx;  // elements count
1935     const Register word_count  = rcx;
1936     const Register qword_count = count;
1937     const Register end_from    = from; // source array end address
1938     const Register end_to      = to;   // destination array end address
1939     // End pointers are inclusive, and if count is not zero they point
1940     // to the last unit copied:  end_to[0] := end_from[0]
1941 
1942     __ enter(); // required for proper stackwalking of RuntimeStub frame
1943     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1944 
1945     if (entry != NULL) {
1946       *entry = __ pc();
1947       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1948       BLOCK_COMMENT("Entry:");
1949     }
1950 
1951     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1952                       // r9 and r10 may be used to save non-volatile registers
1953 
1954     // 'from', 'to' and 'count' are now valid
1955     __ movptr(word_count, count);
1956     __ shrptr(count, 2); // count => qword_count
1957 
1958     // Copy from low to high addresses.  Use 'to' as scratch.
1959     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1960     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1961     __ negptr(qword_count);
1962     __ jmp(L_copy_bytes);
1963 
1964     // Copy trailing qwords
1965   __ BIND(L_copy_8_bytes);
1966     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1967     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1968     __ increment(qword_count);
1969     __ jcc(Assembler::notZero, L_copy_8_bytes);
1970 
1971     // Original 'dest' is trashed, so we can't use it as a
1972     // base register for a possible trailing word copy
1973 
1974     // Check for and copy trailing dword
1975   __ BIND(L_copy_4_bytes);
1976     __ testl(word_count, 2);
1977     __ jccb(Assembler::zero, L_copy_2_bytes);
1978     __ movl(rax, Address(end_from, 8));
1979     __ movl(Address(end_to, 8), rax);
1980 
1981     __ addptr(end_from, 4);
1982     __ addptr(end_to, 4);
1983 
1984     // Check for and copy trailing word
1985   __ BIND(L_copy_2_bytes);
1986     __ testl(word_count, 1);
1987     __ jccb(Assembler::zero, L_exit);
1988     __ movw(rax, Address(end_from, 8));
1989     __ movw(Address(end_to, 8), rax);
1990 
1991   __ BIND(L_exit);
1992     restore_arg_regs();
1993     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1994     __ xorptr(rax, rax); // return 0
1995     __ vzeroupper();
1996     __ leave(); // required for proper stackwalking of RuntimeStub frame
1997     __ ret(0);
1998 
1999     // Copy in multi-byte chunks
2000     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2001     __ jmp(L_copy_4_bytes);
2002 
2003     return start;
2004   }
2005 
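  // Fill an array with a given value.
  //
  // Inputs (summary of the register assignments below):
  //   c_rarg0   - destination array address
  //   c_rarg1   - fill value
  //   c_rarg2   - element count
  //
  // The actual fill loop is emitted by MacroAssembler::generate_fill().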
2006   address generate_fill(BasicType t, bool aligned, const char *name) {
2007     __ align(CodeEntryAlignment);
2008     StubCodeMark mark(this, "StubRoutines", name);
2009     address start = __ pc();
2010 
2011     BLOCK_COMMENT("Entry:");
2012 
2013     const Register to       = c_rarg0;  // destination array address
2014     const Register value    = c_rarg1;  // value
2015     const Register count    = c_rarg2;  // elements count
2016 
2017     __ enter(); // required for proper stackwalking of RuntimeStub frame
2018 
2019     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
2020 
2021     __ vzeroupper();
2022     __ leave(); // required for proper stackwalking of RuntimeStub frame
2023     __ ret(0);
2024     return start;
2025   }
2026 
2027   // Arguments:
2028   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2029   //             ignored
2030   //   name    - stub name string
2031   //
2032   // Inputs:
2033   //   c_rarg0   - source array address
2034   //   c_rarg1   - destination array address
2035   //   c_rarg2   - element count, treated as ssize_t, can be zero
2036   //
2037   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2038   // let the hardware handle it.  The two or four words within dwords
2039   // or qwords that span cache line boundaries will still be loaded
2040   // and stored atomically.
2041   //
2042   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2043                                        address *entry, const char *name) {
2044     __ align(CodeEntryAlignment);
2045     StubCodeMark mark(this, "StubRoutines", name);
2046     address start = __ pc();
2047 
2048     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2049     const Register from        = rdi;  // source array address
2050     const Register to          = rsi;  // destination array address
2051     const Register count       = rdx;  // elements count
2052     const Register word_count  = rcx;
2053     const Register qword_count = count;
2054 
2055     __ enter(); // required for proper stackwalking of RuntimeStub frame
2056     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2057 
2058     if (entry != NULL) {
2059       *entry = __ pc();
2060       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2061       BLOCK_COMMENT("Entry:");
2062     }
2063 
2064     array_overlap_test(nooverlap_target, Address::times_2);
2065     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2066                       // r9 and r10 may be used to save non-volatile registers
2067 
2068     // 'from', 'to' and 'count' are now valid
2069     __ movptr(word_count, count);
2070     __ shrptr(count, 2); // count => qword_count
2071 
2072     // Copy from high to low addresses.  Use 'to' as scratch.
2073 
2074     // Check for and copy trailing word
2075     __ testl(word_count, 1);
2076     __ jccb(Assembler::zero, L_copy_4_bytes);
2077     __ movw(rax, Address(from, word_count, Address::times_2, -2));
2078     __ movw(Address(to, word_count, Address::times_2, -2), rax);
2079 
2080     // Check for and copy trailing dword
2081   __ BIND(L_copy_4_bytes);
2082     __ testl(word_count, 2);
2083     __ jcc(Assembler::zero, L_copy_bytes);
2084     __ movl(rax, Address(from, qword_count, Address::times_8));
2085     __ movl(Address(to, qword_count, Address::times_8), rax);
2086     __ jmp(L_copy_bytes);
2087 
2088     // Copy trailing qwords
2089   __ BIND(L_copy_8_bytes);
2090     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2091     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2092     __ decrement(qword_count);
2093     __ jcc(Assembler::notZero, L_copy_8_bytes);
2094 
2095     restore_arg_regs();
2096     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2097     __ xorptr(rax, rax); // return 0
2098     __ vzeroupper();
2099     __ leave(); // required for proper stackwalking of RuntimeStub frame
2100     __ ret(0);
2101 
2102     // Copy in multi-byte chunks
2103     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2104 
2105     restore_arg_regs();
2106     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2107     __ xorptr(rax, rax); // return 0
2108     __ vzeroupper();
2109     __ leave(); // required for proper stackwalking of RuntimeStub frame
2110     __ ret(0);
2111 
2112     return start;
2113   }
2114 
2115   // Arguments:
2116   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2117   //             ignored
2118   //   is_oop  - true => oop array, so generate store check code
2119   //   name    - stub name string
2120   //
2121   // Inputs:
2122   //   c_rarg0   - source array address
2123   //   c_rarg1   - destination array address
2124   //   c_rarg2   - element count, treated as ssize_t, can be zero
2125   //
2126   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2127   // the hardware handle it.  The two dwords within qwords that span
2128   // cache line boundaries will still be loaded and stored atomically.
2129   //
2130   // Side Effects:
2131   //   disjoint_int_copy_entry is set to the no-overlap entry point
2132   //   used by generate_conjoint_int_oop_copy().
2133   //
2134   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2135                                          const char *name, bool dest_uninitialized = false) {
2136     __ align(CodeEntryAlignment);
2137     StubCodeMark mark(this, "StubRoutines", name);
2138     address start = __ pc();
2139 
2140     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2141     const Register from        = rdi;  // source array address
2142     const Register to          = rsi;  // destination array address
2143     const Register count       = rdx;  // elements count
2144     const Register dword_count = rcx;
2145     const Register qword_count = count;
2146     const Register end_from    = from; // source array end address
2147     const Register end_to      = to;   // destination array end address
2148     // End pointers are inclusive, and if count is not zero they point
2149     // to the last unit copied:  end_to[0] := end_from[0]
2150 
2151     __ enter(); // required for proper stackwalking of RuntimeStub frame
2152     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2153 
2154     if (entry != NULL) {
2155       *entry = __ pc();
2156       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2157       BLOCK_COMMENT("Entry:");
2158     }
2159 
2160     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2161                       // r9 and r10 may be used to save non-volatile registers
2162 
2163     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2164     if (dest_uninitialized) {
2165       decorators |= IS_DEST_UNINITIALIZED;
2166     }
2167     if (aligned) {
2168       decorators |= ARRAYCOPY_ALIGNED;
2169     }
2170 
2171     BasicType type = is_oop ? T_OBJECT : T_INT;
2172     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2173     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2174 
2175     // 'from', 'to' and 'count' are now valid
2176     __ movptr(dword_count, count);
2177     __ shrptr(count, 1); // count => qword_count
2178 
2179     // Copy from low to high addresses.  Use 'to' as scratch.
2180     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2181     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2182     __ negptr(qword_count);
2183     __ jmp(L_copy_bytes);
2184 
2185     // Copy trailing qwords
2186   __ BIND(L_copy_8_bytes);
2187     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2188     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2189     __ increment(qword_count);
2190     __ jcc(Assembler::notZero, L_copy_8_bytes);
2191 
2192     // Check for and copy trailing dword
2193   __ BIND(L_copy_4_bytes);
2194     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2195     __ jccb(Assembler::zero, L_exit);
2196     __ movl(rax, Address(end_from, 8));
2197     __ movl(Address(end_to, 8), rax);
2198 
2199   __ BIND(L_exit);
2200     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2201     restore_arg_regs();
2202     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2203     __ vzeroupper();
2204     __ xorptr(rax, rax); // return 0
2205     __ leave(); // required for proper stackwalking of RuntimeStub frame
2206     __ ret(0);
2207 
2208     // Copy in multi-byte chunks
2209     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2210     __ jmp(L_copy_4_bytes);
2211 
2212     return start;
2213   }
2214 
2215   // Arguments:
2216   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2217   //             ignored
2218   //   is_oop  - true => oop array, so generate store check code
2219   //   name    - stub name string
2220   //
2221   // Inputs:
2222   //   c_rarg0   - source array address
2223   //   c_rarg1   - destination array address
2224   //   c_rarg2   - element count, treated as ssize_t, can be zero
2225   //
2226   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2227   // the hardware handle it.  The two dwords within qwords that span
2228   // cache line boundaries will still be loaded and stored atomically.
2229   //
2230   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2231                                          address *entry, const char *name,
2232                                          bool dest_uninitialized = false) {
2233     __ align(CodeEntryAlignment);
2234     StubCodeMark mark(this, "StubRoutines", name);
2235     address start = __ pc();
2236 
2237     Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
2238     const Register from        = rdi;  // source array address
2239     const Register to          = rsi;  // destination array address
2240     const Register count       = rdx;  // elements count
2241     const Register dword_count = rcx;
2242     const Register qword_count = count;
2243 
2244     __ enter(); // required for proper stackwalking of RuntimeStub frame
2245     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2246 
2247     if (entry != NULL) {
2248       *entry = __ pc();
2249        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2250       BLOCK_COMMENT("Entry:");
2251     }
2252 
2253     array_overlap_test(nooverlap_target, Address::times_4);
2254     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2255                       // r9 and r10 may be used to save non-volatile registers
2256 
2257     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2258     if (dest_uninitialized) {
2259       decorators |= IS_DEST_UNINITIALIZED;
2260     }
2261     if (aligned) {
2262       decorators |= ARRAYCOPY_ALIGNED;
2263     }
2264 
2265     BasicType type = is_oop ? T_OBJECT : T_INT;
2266     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2267     // no registers are destroyed by this call
2268     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2269 
2270     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2271     // 'from', 'to' and 'count' are now valid
2272     __ movptr(dword_count, count);
2273     __ shrptr(count, 1); // count => qword_count
2274 
2275     // Copy from high to low addresses.  Use 'to' as scratch.
2276 
2277     // Check for and copy trailing dword
2278     __ testl(dword_count, 1);
2279     __ jcc(Assembler::zero, L_copy_bytes);
2280     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2281     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2282     __ jmp(L_copy_bytes);
2283 
2284     // Copy trailing qwords
2285   __ BIND(L_copy_8_bytes);
2286     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2287     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2288     __ decrement(qword_count);
2289     __ jcc(Assembler::notZero, L_copy_8_bytes);
2290 
2291     if (is_oop) {
2292       __ jmp(L_exit);
2293     }
2294     restore_arg_regs();
2295     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2296     __ xorptr(rax, rax); // return 0
2297     __ vzeroupper();
2298     __ leave(); // required for proper stackwalking of RuntimeStub frame
2299     __ ret(0);
2300 
2301     // Copy in multi-byte chunks
2302     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2303 
2304   __ BIND(L_exit);
2305     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2306     restore_arg_regs();
2307     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2308     __ xorptr(rax, rax); // return 0
2309     __ vzeroupper();
2310     __ leave(); // required for proper stackwalking of RuntimeStub frame
2311     __ ret(0);
2312 
2313     return start;
2314   }
2315 
2316   // Arguments:
2317   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2318   //             ignored
2319   //   is_oop  - true => oop array, so generate store check code
2320   //   name    - stub name string
2321   //
2322   // Inputs:
2323   //   c_rarg0   - source array address
2324   //   c_rarg1   - destination array address
2325   //   c_rarg2   - element count, treated as ssize_t, can be zero
2326   //
2327   // Side Effects:
2328   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2329   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2330   //
2331   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2332                                           const char *name, bool dest_uninitialized = false) {
2333     __ align(CodeEntryAlignment);
2334     StubCodeMark mark(this, "StubRoutines", name);
2335     address start = __ pc();
2336 
2337     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2338     const Register from        = rdi;  // source array address
2339     const Register to          = rsi;  // destination array address
2340     const Register qword_count = rdx;  // elements count
2341     const Register end_from    = from; // source array end address
2342     const Register end_to      = rcx;  // destination array end address
2343     const Register saved_count = r11;
2344     // End pointers are inclusive, and if count is not zero they point
2345     // to the last unit copied:  end_to[0] := end_from[0]
2346 
2347     __ enter(); // required for proper stackwalking of RuntimeStub frame
2348     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2349     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2350 
2351     if (entry != NULL) {
2352       *entry = __ pc();
2353       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2354       BLOCK_COMMENT("Entry:");
2355     }
2356 
2357     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2358                       // r9 and r10 may be used to save non-volatile registers
2359     // 'from', 'to' and 'qword_count' are now valid
2360 
2361     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2362     if (dest_uninitialized) {
2363       decorators |= IS_DEST_UNINITIALIZED;
2364     }
2365     if (aligned) {
2366       decorators |= ARRAYCOPY_ALIGNED;
2367     }
2368 
2369     BasicType type = is_oop ? T_OBJECT : T_LONG;
2370     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2371     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2372 
2373     // Copy from low to high addresses.  Use 'to' as scratch.
2374     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2375     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2376     __ negptr(qword_count);
2377     __ jmp(L_copy_bytes);
2378 
2379     // Copy trailing qwords
2380   __ BIND(L_copy_8_bytes);
2381     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2382     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2383     __ increment(qword_count);
2384     __ jcc(Assembler::notZero, L_copy_8_bytes);
2385 
2386     if (is_oop) {
2387       __ jmp(L_exit);
2388     } else {
2389       restore_arg_regs();
2390       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2391       __ xorptr(rax, rax); // return 0
2392       __ vzeroupper();
2393       __ leave(); // required for proper stackwalking of RuntimeStub frame
2394       __ ret(0);
2395     }
2396 
2397     // Copy in multi-byte chunks
2398     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2399 
2400     __ BIND(L_exit);
2401     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2402     restore_arg_regs();
2403     if (is_oop) {
2404       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2405     } else {
2406       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2407     }
2408     __ vzeroupper();
2409     __ xorptr(rax, rax); // return 0
2410     __ leave(); // required for proper stackwalking of RuntimeStub frame
2411     __ ret(0);
2412 
2413     return start;
2414   }
2415 
2416   // Arguments:
2417   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2418   //             ignored
2419   //   is_oop  - true => oop array, so generate store check code
2420   //   name    - stub name string
2421   //
2422   // Inputs:
2423   //   c_rarg0   - source array address
2424   //   c_rarg1   - destination array address
2425   //   c_rarg2   - element count, treated as ssize_t, can be zero
2426   //
2427   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2428                                           address nooverlap_target, address *entry,
2429                                           const char *name, bool dest_uninitialized = false) {
2430     __ align(CodeEntryAlignment);
2431     StubCodeMark mark(this, "StubRoutines", name);
2432     address start = __ pc();
2433 
2434     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2435     const Register from        = rdi;  // source array address
2436     const Register to          = rsi;  // destination array address
2437     const Register qword_count = rdx;  // elements count
2438     const Register saved_count = rcx;
2439 
2440     __ enter(); // required for proper stackwalking of RuntimeStub frame
2441     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2442 
2443     if (entry != NULL) {
2444       *entry = __ pc();
2445       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2446       BLOCK_COMMENT("Entry:");
2447     }
2448 
2449     array_overlap_test(nooverlap_target, Address::times_8);
2450     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2451                       // r9 and r10 may be used to save non-volatile registers
2452     // 'from', 'to' and 'qword_count' are now valid
2453 
2454     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2455     if (dest_uninitialized) {
2456       decorators |= IS_DEST_UNINITIALIZED;
2457     }
2458     if (aligned) {
2459       decorators |= ARRAYCOPY_ALIGNED;
2460     }
2461 
2462     BasicType type = is_oop ? T_OBJECT : T_LONG;
2463     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2464     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2465 
2466     __ jmp(L_copy_bytes);
2467 
2468     // Copy trailing qwords
2469   __ BIND(L_copy_8_bytes);
2470     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2471     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2472     __ decrement(qword_count);
2473     __ jcc(Assembler::notZero, L_copy_8_bytes);
2474 
2475     if (is_oop) {
2476       __ jmp(L_exit);
2477     } else {
2478       restore_arg_regs();
2479       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2480       __ xorptr(rax, rax); // return 0
2481       __ vzeroupper();
2482       __ leave(); // required for proper stackwalking of RuntimeStub frame
2483       __ ret(0);
2484     }
2485 
2486     // Copy in multi-byte chunks
2487     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2488 
2489     __ BIND(L_exit);
2490     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2491     restore_arg_regs();
2492     if (is_oop) {
2493       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2494     } else {
2495       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2496     }
2497     __ vzeroupper();
2498     __ xorptr(rax, rax); // return 0
2499     __ leave(); // required for proper stackwalking of RuntimeStub frame
2500     __ ret(0);
2501 
2502     return start;
2503   }
2504 
2505 
2506   // Helper for generating a dynamic type check.
2507   // Smashes no registers.
2508   void generate_type_check(Register sub_klass,
2509                            Register super_check_offset,
2510                            Register super_klass,
2511                            Label& L_success) {
2512     assert_different_registers(sub_klass, super_check_offset, super_klass);
2513 
2514     BLOCK_COMMENT("type_check:");
2515 
2516     Label L_miss;
2517 
2518     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2519                                      super_check_offset);
2520     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2521 
2522     // Fall through on failure!
2523     __ BIND(L_miss);
2524   }
2525 
2526   //
2527   //  Generate checkcasting array copy stub
2528   //
2529   //  Input:
2530   //    c_rarg0   - source array address
2531   //    c_rarg1   - destination array address
2532   //    c_rarg2   - element count, treated as ssize_t, can be zero
2533   //    c_rarg3   - size_t ckoff (super_check_offset)
2534   // not Win64
2535   //    c_rarg4   - oop ckval (super_klass)
2536   // Win64
2537   //    rsp+40    - oop ckval (super_klass)
2538   //
2539   //  Output:
2540   //    rax ==  0  -  success
2541   //    rax == -1^K - failure, where K is partial transfer count
2542   //
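  //  Note: on failure the stub returns -1 ^ K (i.e. ~K), so the caller can
  //  recover the number of elements already copied as K = ~rax and handle the
  //  remaining elements itself (see the "-1^K" reporting code below).
  //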
2543   address generate_checkcast_copy(const char *name, address *entry,
2544                                   bool dest_uninitialized = false) {
2545 
2546     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2547 
2548     // Input registers (after setup_arg_regs)
2549     const Register from        = rdi;   // source array address
2550     const Register to          = rsi;   // destination array address
2551     const Register length      = rdx;   // elements count
2552     const Register ckoff       = rcx;   // super_check_offset
2553     const Register ckval       = r8;    // super_klass
2554 
2555     // Registers used as temps (r13, r14 are save-on-entry)
2556     const Register end_from    = from;  // source array end address
2557     const Register end_to      = r13;   // destination array end address
2558     const Register count       = rdx;   // -(count_remaining)
2559     const Register r14_length  = r14;   // saved copy of length
2560     // End pointers are inclusive, and if length is not zero they point
2561     // to the last unit copied:  end_to[0] := end_from[0]
2562 
2563     const Register rax_oop    = rax;    // actual oop copied
2564     const Register r11_klass  = r11;    // oop._klass
2565 
2566     //---------------------------------------------------------------
2567     // Assembler stub will be used for this call to arraycopy
2568     // if the two arrays are subtypes of Object[] but the
2569     // destination array type is not equal to or a supertype
2570     // of the source type.  Each element must be separately
2571     // checked.
2572 
2573     __ align(CodeEntryAlignment);
2574     StubCodeMark mark(this, "StubRoutines", name);
2575     address start = __ pc();
2576 
2577     __ enter(); // required for proper stackwalking of RuntimeStub frame
2578 
2579 #ifdef ASSERT
2580     // caller guarantees that the arrays really are different
2581     // otherwise, we would have to make conjoint checks
2582     { Label L;
2583       array_overlap_test(L, TIMES_OOP);
2584       __ stop("checkcast_copy within a single array");
2585       __ bind(L);
2586     }
2587 #endif //ASSERT
2588 
2589     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2590                        // ckoff => rcx, ckval => r8
2591                        // r9 and r10 may be used to save non-volatile registers
2592 #ifdef _WIN64
2593     // last argument (#4) is on stack on Win64
2594     __ movptr(ckval, Address(rsp, 6 * wordSize));
2595 #endif
2596 
2597     // Caller of this entry point must set up the argument registers.
2598     if (entry != NULL) {
2599       *entry = __ pc();
2600       BLOCK_COMMENT("Entry:");
2601     }
2602 
2603     // allocate spill slots for r13, r14
2604     enum {
2605       saved_r13_offset,
2606       saved_r14_offset,
2607       saved_rbp_offset
2608     };
2609     __ subptr(rsp, saved_rbp_offset * wordSize);
2610     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2611     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2612 
2613     // check that int operands are properly extended to size_t
2614     assert_clean_int(length, rax);
2615     assert_clean_int(ckoff, rax);
2616 
2617 #ifdef ASSERT
2618     BLOCK_COMMENT("assert consistent ckoff/ckval");
2619     // The ckoff and ckval must be mutually consistent,
2620     // even though caller generates both.
2621     { Label L;
2622       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2623       __ cmpl(ckoff, Address(ckval, sco_offset));
2624       __ jcc(Assembler::equal, L);
2625       __ stop("super_check_offset inconsistent");
2626       __ bind(L);
2627     }
2628 #endif //ASSERT
2629 
2630     // Loop-invariant addresses.  They are exclusive end pointers.
2631     Address end_from_addr(from, length, TIMES_OOP, 0);
2632     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2633     // Loop-variant addresses.  They assume post-incremented count < 0.
2634     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2635     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2636 
2637     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2638     if (dest_uninitialized) {
2639       decorators |= IS_DEST_UNINITIALIZED;
2640     }
2641 
2642     BasicType type = T_OBJECT;
2643     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2644     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2645 
2646     // Copy from low to high addresses, indexed from the end of each array.
2647     __ lea(end_from, end_from_addr);
2648     __ lea(end_to,   end_to_addr);
2649     __ movptr(r14_length, length);        // save a copy of the length
2650     assert(length == count, "");          // else fix next line:
2651     __ negptr(count);                     // negate and test the length
2652     __ jcc(Assembler::notZero, L_load_element);
2653 
2654     // Empty array:  Nothing to do.
2655     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2656     __ jmp(L_done);
2657 
2658     // ======== begin loop ========
2659     // (Loop is rotated; its entry is L_load_element.)
2660     // Loop control:
2661     //   for (count = -count; count != 0; count++)
2662     // Base pointers src, dst are biased by 8*(count-1), to the last element.
2663     __ align(OptoLoopAlignment);
2664 
2665     __ BIND(L_store_element);
2666     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW);  // store the oop
2667     __ increment(count);               // increment the count toward zero
2668     __ jcc(Assembler::zero, L_do_card_marks);
2669 
2670     // ======== loop entry is here ========
2671     __ BIND(L_load_element);
2672     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2673     __ testptr(rax_oop, rax_oop);
2674     __ jcc(Assembler::zero, L_store_element);
2675 
2676     __ load_klass(r11_klass, rax_oop);  // query the object klass
2677     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2678     // ======== end loop ========
2679 
2680     // It was a real error; we must depend on the caller to finish the job.
2681     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2682     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2683     // and report their number to the caller.
2684     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2685     Label L_post_barrier;
2686     __ addptr(r14_length, count);     // K = (original - remaining) oops
2687     __ movptr(rax, r14_length);       // save the value
2688     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2689     __ jccb(Assembler::notZero, L_post_barrier);
2690     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2691 
2692     // Come here on success only.
2693     __ BIND(L_do_card_marks);
2694     __ xorptr(rax, rax);              // return 0 on success
2695 
2696     __ BIND(L_post_barrier);
2697     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2698 
2699     // Common exit point (success or failure).
2700     __ BIND(L_done);
2701     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2702     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2703     restore_arg_regs();
2704     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2705     __ leave(); // required for proper stackwalking of RuntimeStub frame
2706     __ ret(0);
2707 
2708     return start;
2709   }
2710 
2711   //
2712   //  Generate 'unsafe' array copy stub
2713   //  Though just as safe as the other stubs, it takes an unscaled
2714   //  size_t argument instead of an element count.
2715   //
2716   //  Input:
2717   //    c_rarg0   - source array address
2718   //    c_rarg1   - destination array address
2719   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2720   //
2721   // Examines the alignment of the operands and dispatches
2722   // to a long, int, short, or byte copy loop.
2723   //
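  // Dispatch, in C-like terms (a sketch of the alignment tests emitted below):
  //   bits = from | to | size;
  //   if      ((bits & 7) == 0) goto long_copy;   // size >> 3 qwords
  //   else if ((bits & 3) == 0) goto int_copy;    // size >> 2 dwords
  //   else if ((bits & 1) == 0) goto short_copy;  // size >> 1 words
  //   else                      goto byte_copy;   // size bytes
  //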
2724   address generate_unsafe_copy(const char *name,
2725                                address byte_copy_entry, address short_copy_entry,
2726                                address int_copy_entry, address long_copy_entry) {
2727 
2728     Label L_long_aligned, L_int_aligned, L_short_aligned;
2729 
2730     // Input registers (before setup_arg_regs)
2731     const Register from        = c_rarg0;  // source array address
2732     const Register to          = c_rarg1;  // destination array address
2733     const Register size        = c_rarg2;  // byte count (size_t)
2734 
2735     // Register used as a temp
2736     const Register bits        = rax;      // test copy of low bits
2737 
2738     __ align(CodeEntryAlignment);
2739     StubCodeMark mark(this, "StubRoutines", name);
2740     address start = __ pc();
2741 
2742     __ enter(); // required for proper stackwalking of RuntimeStub frame
2743 
2744     // bump this on entry, not on exit:
2745     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2746 
2747     __ mov(bits, from);
2748     __ orptr(bits, to);
2749     __ orptr(bits, size);
2750 
2751     __ testb(bits, BytesPerLong-1);
2752     __ jccb(Assembler::zero, L_long_aligned);
2753 
2754     __ testb(bits, BytesPerInt-1);
2755     __ jccb(Assembler::zero, L_int_aligned);
2756 
2757     __ testb(bits, BytesPerShort-1);
2758     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2759 
2760     __ BIND(L_short_aligned);
2761     __ shrptr(size, LogBytesPerShort); // size => short_count
2762     __ jump(RuntimeAddress(short_copy_entry));
2763 
2764     __ BIND(L_int_aligned);
2765     __ shrptr(size, LogBytesPerInt); // size => int_count
2766     __ jump(RuntimeAddress(int_copy_entry));
2767 
2768     __ BIND(L_long_aligned);
2769     __ shrptr(size, LogBytesPerLong); // size => qword_count
2770     __ jump(RuntimeAddress(long_copy_entry));
2771 
2772     return start;
2773   }
2774 
2775   // Perform range checks on the proposed arraycopy.
2776   // Kills temp, but nothing else.
2777   // Also, clean the sign bits of src_pos and dst_pos.
2778   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2779                               Register src_pos, // source position (c_rarg1)
2780                               Register dst,     // destination array oop (c_rarg2)
2781                               Register dst_pos, // destination position (c_rarg3)
2782                               Register length,
2783                               Register temp,
2784                               Label& L_failed) {
2785     BLOCK_COMMENT("arraycopy_range_checks:");
2786 
2787     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2788     __ movl(temp, length);
2789     __ addl(temp, src_pos);             // src_pos + length
2790     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2791     __ jcc(Assembler::above, L_failed);
2792 
2793     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2794     __ movl(temp, length);
2795     __ addl(temp, dst_pos);             // dst_pos + length
2796     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2797     __ jcc(Assembler::above, L_failed);
2798 
2799     // Have to clean up the high 32 bits of 'src_pos' and 'dst_pos'.
2800     // A move with sign extension can be used since both are known to be non-negative.
2801     __ movslq(src_pos, src_pos);
2802     __ movslq(dst_pos, dst_pos);
2803 
2804     BLOCK_COMMENT("arraycopy_range_checks done");
2805   }
2806 
2807   //
2808   //  Generate generic array copy stubs
2809   //
2810   //  Input:
2811   //    c_rarg0    -  src oop
2812   //    c_rarg1    -  src_pos (32-bits)
2813   //    c_rarg2    -  dst oop
2814   //    c_rarg3    -  dst_pos (32-bits)
2815   // not Win64
2816   //    c_rarg4    -  element count (32-bits)
2817   // Win64
2818   //    rsp+40     -  element count (32-bits)
2819   //
2820   //  Output:
2821   //    rax ==  0  -  success
2822   //    rax == -1^K - failure, where K is partial transfer count
2823   //
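       //  For example (illustrative arithmetic only): a failure after K = 3
       //  elements have been transferred returns rax = -1 ^ 3 = -4, and the
       //  caller recovers K as ~rax.
       //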
2824   address generate_generic_copy(const char *name,
2825                                 address byte_copy_entry, address short_copy_entry,
2826                                 address int_copy_entry, address oop_copy_entry,
2827                                 address long_copy_entry, address checkcast_copy_entry) {
2828 
2829     Label L_failed, L_failed_0, L_objArray;
2830     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2831 
2832     // Input registers
2833     const Register src        = c_rarg0;  // source array oop
2834     const Register src_pos    = c_rarg1;  // source position
2835     const Register dst        = c_rarg2;  // destination array oop
2836     const Register dst_pos    = c_rarg3;  // destination position
2837 #ifndef _WIN64
2838     const Register length     = c_rarg4;
2839 #else
2840     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2841 #endif
2842 
2843     { int modulus = CodeEntryAlignment;
2844       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2845       int advance = target - (__ offset() % modulus);
2846       if (advance < 0)  advance += modulus;
2847       if (advance > 0)  __ nop(advance);
2848     }
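         // Worked example (values are illustrative): with CodeEntryAlignment = 32
         // and __ offset() % 32 == 20, target = 32 - 5 = 27 and advance = 7, so
         // seven nop bytes are emitted and the 5-byte jmp(L_failed) below ends
         // exactly on the next 32-byte boundary, as the assert after it checks.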
2849     StubCodeMark mark(this, "StubRoutines", name);
2850 
2851     // Short-hop target to L_failed.  Makes for denser prologue code.
2852     __ BIND(L_failed_0);
2853     __ jmp(L_failed);
2854     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2855 
2856     __ align(CodeEntryAlignment);
2857     address start = __ pc();
2858 
2859     __ enter(); // required for proper stackwalking of RuntimeStub frame
2860 
2861     // bump this on entry, not on exit:
2862     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2863 
2864     //-----------------------------------------------------------------------
2865     // Assembler stub will be used for this call to arraycopy
2866     // if the following conditions are met:
2867     //
2868     // (1) src and dst must not be null.
2869     // (2) src_pos must not be negative.
2870     // (3) dst_pos must not be negative.
2871     // (4) length  must not be negative.
2872     // (5) src klass and dst klass should be the same and not NULL.
2873     // (6) src and dst should be arrays.
2874     // (7) src_pos + length must not exceed length of src.
2875     // (8) dst_pos + length must not exceed length of dst.
2876     //
2877 
2878     //  if (src == NULL) return -1;
2879     __ testptr(src, src);         // src oop
2880     size_t j1off = __ offset();
2881     __ jccb(Assembler::zero, L_failed_0);
2882 
2883     //  if (src_pos < 0) return -1;
2884     __ testl(src_pos, src_pos); // src_pos (32-bits)
2885     __ jccb(Assembler::negative, L_failed_0);
2886 
2887     //  if (dst == NULL) return -1;
2888     __ testptr(dst, dst);         // dst oop
2889     __ jccb(Assembler::zero, L_failed_0);
2890 
2891     //  if (dst_pos < 0) return -1;
2892     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2893     size_t j4off = __ offset();
2894     __ jccb(Assembler::negative, L_failed_0);
2895 
2896     // The first four tests are very dense code,
2897     // but not quite dense enough to put four
2898     // jumps in a 16-byte instruction fetch buffer.
2899     // That's good, because some branch predictors
2900     // do not like jumps so close together.
2901     // Make sure of this.
2902     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2903 
2904     // registers used as temp
2905     const Register r11_length    = r11; // elements count to copy
2906     const Register r10_src_klass = r10; // array klass
2907 
2908     //  if (length < 0) return -1;
2909     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2910     __ testl(r11_length, r11_length);
2911     __ jccb(Assembler::negative, L_failed_0);
2912 
2913     __ load_klass(r10_src_klass, src);
2914 #ifdef ASSERT
2915     //  assert(src->klass() != NULL);
2916     {
2917       BLOCK_COMMENT("assert klasses not null {");
2918       Label L1, L2;
2919       __ testptr(r10_src_klass, r10_src_klass);
2920       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2921       __ bind(L1);
2922       __ stop("broken null klass");
2923       __ bind(L2);
2924       __ load_klass(rax, dst);
2925       __ cmpq(rax, 0);
2926       __ jcc(Assembler::equal, L1);     // this would be broken also
2927       BLOCK_COMMENT("} assert klasses not null done");
2928     }
2929 #endif
2930 
2931     // Load layout helper (32-bits)
2932     //
2933     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2934     // 32        30    24            16              8     2                 0
2935     //
2936     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2937     //
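         // Illustrative decode (the constants are the Klass::_lh_* values used below):
         //
         //   int log2_esz = lh & _lh_log2_element_size_mask;
         //   int hdr_size = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
         //
         // e.g. for an int[] array, log2_esz == 2 (4-byte elements).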
2938 
2939     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2940 
2941     // Handle objArrays completely differently...
2942     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2943     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2944     __ jcc(Assembler::equal, L_objArray);
2945 
2946     //  if (src->klass() != dst->klass()) return -1;
2947     __ load_klass(rax, dst);
2948     __ cmpq(r10_src_klass, rax);
2949     __ jcc(Assembler::notEqual, L_failed);
2950 
2951     const Register rax_lh = rax;  // layout helper
2952     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2953 
2954     //  if (!src->is_Array()) return -1;
2955     __ cmpl(rax_lh, Klass::_lh_neutral_value);
2956     __ jcc(Assembler::greaterEqual, L_failed);
2957 
2958     // At this point, it is known to be a typeArray (array_tag 0x3).
2959 #ifdef ASSERT
2960     {
2961       BLOCK_COMMENT("assert primitive array {");
2962       Label L;
2963       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2964       __ jcc(Assembler::greaterEqual, L);
2965       __ stop("must be a primitive array");
2966       __ bind(L);
2967       BLOCK_COMMENT("} assert primitive array done");
2968     }
2969 #endif
2970 
2971     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2972                            r10, L_failed);
2973 
2974     // TypeArrayKlass
2975     //
2976     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2977     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2978     //
2979 
2980     const Register r10_offset = r10;    // array offset
2981     const Register rax_elsize = rax_lh; // element size
2982 
2983     __ movl(r10_offset, rax_lh);
2984     __ shrl(r10_offset, Klass::_lh_header_size_shift);
2985     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2986     __ addptr(src, r10_offset);           // src array offset
2987     __ addptr(dst, r10_offset);           // dst array offset
2988     BLOCK_COMMENT("choose copy loop based on element size");
2989     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2990 
2991     // next registers should be set before the jump to corresponding stub
2992     const Register from     = c_rarg0;  // source array address
2993     const Register to       = c_rarg1;  // destination array address
2994     const Register count    = c_rarg2;  // elements count
2995 
2996     // The 'from', 'to' and 'count' registers must be set in this order,
2997     // since they are the same registers as 'src', 'src_pos' and 'dst'.
2998 
2999   __ BIND(L_copy_bytes);
3000     __ cmpl(rax_elsize, 0);
3001     __ jccb(Assembler::notEqual, L_copy_shorts);
3002     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3003     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3004     __ movl2ptr(count, r11_length); // length
3005     __ jump(RuntimeAddress(byte_copy_entry));
3006 
3007   __ BIND(L_copy_shorts);
3008     __ cmpl(rax_elsize, LogBytesPerShort);
3009     __ jccb(Assembler::notEqual, L_copy_ints);
3010     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3011     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3012     __ movl2ptr(count, r11_length); // length
3013     __ jump(RuntimeAddress(short_copy_entry));
3014 
3015   __ BIND(L_copy_ints);
3016     __ cmpl(rax_elsize, LogBytesPerInt);
3017     __ jccb(Assembler::notEqual, L_copy_longs);
3018     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3019     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3020     __ movl2ptr(count, r11_length); // length
3021     __ jump(RuntimeAddress(int_copy_entry));
3022 
3023   __ BIND(L_copy_longs);
3024 #ifdef ASSERT
3025     {
3026       BLOCK_COMMENT("assert long copy {");
3027       Label L;
3028       __ cmpl(rax_elsize, LogBytesPerLong);
3029       __ jcc(Assembler::equal, L);
3030       __ stop("must be long copy, but elsize is wrong");
3031       __ bind(L);
3032       BLOCK_COMMENT("} assert long copy done");
3033     }
3034 #endif
3035     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3036     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3037     __ movl2ptr(count, r11_length); // length
3038     __ jump(RuntimeAddress(long_copy_entry));
3039 
3040     // ObjArrayKlass
3041   __ BIND(L_objArray);
3042     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3043 
3044     Label L_plain_copy, L_checkcast_copy;
3045     //  test array classes for subtyping
3046     __ load_klass(rax, dst);
3047     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3048     __ jcc(Assembler::notEqual, L_checkcast_copy);
3049 
3050     // Identically typed arrays can be copied without element-wise checks.
3051     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3052                            r10, L_failed);
3053 
3054     __ lea(from, Address(src, src_pos, TIMES_OOP,
3055                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3056     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3057                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3058     __ movl2ptr(count, r11_length); // length
3059   __ BIND(L_plain_copy);
3060     __ jump(RuntimeAddress(oop_copy_entry));
3061 
3062   __ BIND(L_checkcast_copy);
3063     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3064     {
3065       // Before looking at dst.length, make sure dst is also an objArray.
3066       __ cmpl(Address(rax, lh_offset), objArray_lh);
3067       __ jcc(Assembler::notEqual, L_failed);
3068 
3069       // It is safe to examine both src.length and dst.length.
3070       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3071                              rax, L_failed);
3072 
3073       const Register r11_dst_klass = r11;
3074       __ load_klass(r11_dst_klass, dst); // reload
3075 
3076       // Marshal the base address arguments now, freeing registers.
3077       __ lea(from, Address(src, src_pos, TIMES_OOP,
3078                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3079       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3080                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3081       __ movl(count, length);           // length (reloaded)
3082       Register sco_temp = c_rarg3;      // this register is free now
3083       assert_different_registers(from, to, count, sco_temp,
3084                                  r11_dst_klass, r10_src_klass);
3085       assert_clean_int(count, sco_temp);
3086 
3087       // Generate the type check.
3088       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3089       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3090       assert_clean_int(sco_temp, rax);
3091       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3092 
3093       // Fetch destination element klass from the ObjArrayKlass header.
3094       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3095       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3096       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3097       assert_clean_int(sco_temp, rax);
3098 
3099       // the checkcast_copy loop needs two extra arguments:
3100       assert(c_rarg3 == sco_temp, "#3 already in place");
3101       // Set up arguments for checkcast_copy_entry.
3102       setup_arg_regs(4);
3103       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3104       __ jump(RuntimeAddress(checkcast_copy_entry));
3105     }
3106 
3107   __ BIND(L_failed);
3108     __ xorptr(rax, rax);
3109     __ notptr(rax); // return -1
3110     __ leave();   // required for proper stackwalking of RuntimeStub frame
3111     __ ret(0);
3112 
3113     return start;
3114   }
3115 
3116   void generate_arraycopy_stubs() {
3117     address entry;
3118     address entry_jbyte_arraycopy;
3119     address entry_jshort_arraycopy;
3120     address entry_jint_arraycopy;
3121     address entry_oop_arraycopy;
3122     address entry_jlong_arraycopy;
3123     address entry_checkcast_arraycopy;
3124 
3125     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3126                                                                            "jbyte_disjoint_arraycopy");
3127     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3128                                                                            "jbyte_arraycopy");
3129 
3130     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3131                                                                             "jshort_disjoint_arraycopy");
3132     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3133                                                                             "jshort_arraycopy");
3134 
3135     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3136                                                                               "jint_disjoint_arraycopy");
3137     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3138                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3139 
3140     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3141                                                                                "jlong_disjoint_arraycopy");
3142     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3143                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3144 
3145 
3146     if (UseCompressedOops) {
3147       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3148                                                                               "oop_disjoint_arraycopy");
3149       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3150                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3151       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3152                                                                                      "oop_disjoint_arraycopy_uninit",
3153                                                                                      /*dest_uninitialized*/true);
3154       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3155                                                                                      NULL, "oop_arraycopy_uninit",
3156                                                                                      /*dest_uninitialized*/true);
3157     } else {
3158       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3159                                                                                "oop_disjoint_arraycopy");
3160       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3161                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3162       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3163                                                                                       "oop_disjoint_arraycopy_uninit",
3164                                                                                       /*dest_uninitialized*/true);
3165       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3166                                                                                       NULL, "oop_arraycopy_uninit",
3167                                                                                       /*dest_uninitialized*/true);
3168     }
3169 
3170     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3171     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3172                                                                         /*dest_uninitialized*/true);
3173 
3174     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3175                                                               entry_jbyte_arraycopy,
3176                                                               entry_jshort_arraycopy,
3177                                                               entry_jint_arraycopy,
3178                                                               entry_jlong_arraycopy);
3179     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3180                                                                entry_jbyte_arraycopy,
3181                                                                entry_jshort_arraycopy,
3182                                                                entry_jint_arraycopy,
3183                                                                entry_oop_arraycopy,
3184                                                                entry_jlong_arraycopy,
3185                                                                entry_checkcast_arraycopy);
3186 
3187     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3188     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3189     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3190     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3191     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3192     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3193 
3194     // We don't generate specialized code for HeapWord-aligned source
3195     // arrays, so just use the code we've already generated
3196     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3197     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3198 
3199     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3200     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3201 
3202     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3203     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3204 
3205     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3206     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3207 
3208     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3209     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3210 
3211     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3212     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3213   }
3214 
3215   // AES intrinsic stubs
3216   enum {AESBlockSize = 16};
3217 
3218   address generate_key_shuffle_mask() {
3219     __ align(16);
3220     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3221     address start = __ pc();
3222     __ emit_data64( 0x0405060700010203, relocInfo::none );
3223     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3224     return start;
3225   }
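       // Note on the data above: used with pshufb, this mask selects source bytes
       // 3,2,1,0, 7,6,5,4, ... into consecutive destination bytes, i.e. it
       // byte-swaps each 32-bit word of the key material, presumably to convert
       // the int words of the Java-expanded key into the byte order the AES
       // instructions consume.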
3226 
3227   address generate_counter_shuffle_mask() {
3228     __ align(16);
3229     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3230     address start = __ pc();
3231     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3232     __ emit_data64(0x0001020304050607, relocInfo::none);
3233     return start;
3234   }
3235 
3236   // Utility routine for loading a 128-bit key word in little-endian format.
3237   // The caller can optionally indicate that the shuffle mask is already in an XMM register.
3238   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3239     __ movdqu(xmmdst, Address(key, offset));
3240     if (xmm_shuf_mask != NULL) {
3241       __ pshufb(xmmdst, xmm_shuf_mask);
3242     } else {
3243       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3244     }
3245   }
3246 
3247   // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
3248   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3249     __ pextrq(reg, xmmdst, 0x0);
3250     __ addq(reg, inc_delta);
3251     __ pinsrq(xmmdst, reg, 0x0);
3252     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3253     __ pextrq(reg, xmmdst, 0x01); // Carry
3254     __ addq(reg, 0x01);
3255     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3256     __ BIND(next_block);          // next instruction
3257   }
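       // Worked example: if the low qword holds 0xFFFFFFFFFFFFFFFF and inc_delta
       // is 1, the addq wraps to zero and sets the carry flag; pextrq/pinsrq do
       // not touch EFLAGS, so the jcc above still sees that carry and the high
       // qword is incremented as well.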
3258 
3259   // Arguments:
3260   //
3261   // Inputs:
3262   //   c_rarg0   - source byte array address
3263   //   c_rarg1   - destination byte array address
3264   //   c_rarg2   - K (key) in little endian int array
3265   //
3266   address generate_aescrypt_encryptBlock() {
3267     assert(UseAES, "need AES instructions and misaligned SSE support");
3268     __ align(CodeEntryAlignment);
3269     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3270     Label L_doLast;
3271     address start = __ pc();
3272 
3273     const Register from        = c_rarg0;  // source array address
3274     const Register to          = c_rarg1;  // destination array address
3275     const Register key         = c_rarg2;  // key array address
3276     const Register keylen      = rax;
3277 
3278     const XMMRegister xmm_result = xmm0;
3279     const XMMRegister xmm_key_shuf_mask = xmm1;
3280     // On win64 xmm6-xmm15 must be preserved so don't use them.
3281     const XMMRegister xmm_temp1  = xmm2;
3282     const XMMRegister xmm_temp2  = xmm3;
3283     const XMMRegister xmm_temp3  = xmm4;
3284     const XMMRegister xmm_temp4  = xmm5;
3285 
3286     __ enter(); // required for proper stackwalking of RuntimeStub frame
3287 
3288     // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
3289     // context for the registers used, since all instructions below operate in 128-bit mode.
3290     // On EVEX without VL and BW, these instructions will all be AVX.
3291     if (VM_Version::supports_avx512vlbw()) {
3292       __ movl(rax, 0xffff);
3293       __ kmovql(k1, rax);
3294     }
3295 
3296     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
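         // (i.e. 11, 13 or 15 round keys of four 32-bit words each, corresponding
         //  to AES-128, AES-192 and AES-256; the comparisons against 44 and 52
         //  below select the matching number of rounds)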
3297     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3298 
3299     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3300     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3301 
3302     // For encryption, the Java expanded key ordering is just what we need.
3303     // We don't know whether the key is aligned, hence we do not use the load-execute form.
3304 
3305     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3306     __ pxor(xmm_result, xmm_temp1);
3307 
3308     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3309     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3310     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3311     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3312 
3313     __ aesenc(xmm_result, xmm_temp1);
3314     __ aesenc(xmm_result, xmm_temp2);
3315     __ aesenc(xmm_result, xmm_temp3);
3316     __ aesenc(xmm_result, xmm_temp4);
3317 
3318     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3319     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3320     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3321     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3322 
3323     __ aesenc(xmm_result, xmm_temp1);
3324     __ aesenc(xmm_result, xmm_temp2);
3325     __ aesenc(xmm_result, xmm_temp3);
3326     __ aesenc(xmm_result, xmm_temp4);
3327 
3328     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3329     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3330 
3331     __ cmpl(keylen, 44);
3332     __ jccb(Assembler::equal, L_doLast);
3333 
3334     __ aesenc(xmm_result, xmm_temp1);
3335     __ aesenc(xmm_result, xmm_temp2);
3336 
3337     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3338     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3339 
3340     __ cmpl(keylen, 52);
3341     __ jccb(Assembler::equal, L_doLast);
3342 
3343     __ aesenc(xmm_result, xmm_temp1);
3344     __ aesenc(xmm_result, xmm_temp2);
3345 
3346     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3347     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3348 
3349     __ BIND(L_doLast);
3350     __ aesenc(xmm_result, xmm_temp1);
3351     __ aesenclast(xmm_result, xmm_temp2);
3352     __ movdqu(Address(to, 0), xmm_result);        // store the result
3353     __ xorptr(rax, rax); // return 0
3354     __ leave(); // required for proper stackwalking of RuntimeStub frame
3355     __ ret(0);
3356 
3357     return start;
3358   }
3359 
3360 
3361   // Arguments:
3362   //
3363   // Inputs:
3364   //   c_rarg0   - source byte array address
3365   //   c_rarg1   - destination byte array address
3366   //   c_rarg2   - K (key) in little endian int array
3367   //
3368   address generate_aescrypt_decryptBlock() {
3369     assert(UseAES, "need AES instructions and misaligned SSE support");
3370     __ align(CodeEntryAlignment);
3371     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3372     Label L_doLast;
3373     address start = __ pc();
3374 
3375     const Register from        = c_rarg0;  // source array address
3376     const Register to          = c_rarg1;  // destination array address
3377     const Register key         = c_rarg2;  // key array address
3378     const Register keylen      = rax;
3379 
3380     const XMMRegister xmm_result = xmm0;
3381     const XMMRegister xmm_key_shuf_mask = xmm1;
3382     // On win64 xmm6-xmm15 must be preserved so don't use them.
3383     const XMMRegister xmm_temp1  = xmm2;
3384     const XMMRegister xmm_temp2  = xmm3;
3385     const XMMRegister xmm_temp3  = xmm4;
3386     const XMMRegister xmm_temp4  = xmm5;
3387 
3388     __ enter(); // required for proper stackwalking of RuntimeStub frame
3389 
3390     // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
3391     // context for the registers used, since all instructions below operate in 128-bit mode.
3392     // On EVEX without VL and BW, these instructions will all be AVX.
3393     if (VM_Version::supports_avx512vlbw()) {
3394       __ movl(rax, 0xffff);
3395       __ kmovql(k1, rax);
3396     }
3397 
3398     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3399     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3400 
3401     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3402     __ movdqu(xmm_result, Address(from, 0));
3403 
3404     // For decryption, the Java expanded key ordering is rotated one position from what we want,
3405     // so we start from 0x10 here and hit 0x00 last.
3406     // We don't know whether the key is aligned, hence we do not use the load-execute form.
3407     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3408     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3409     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3410     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3411 
3412     __ pxor  (xmm_result, xmm_temp1);
3413     __ aesdec(xmm_result, xmm_temp2);
3414     __ aesdec(xmm_result, xmm_temp3);
3415     __ aesdec(xmm_result, xmm_temp4);
3416 
3417     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3418     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3419     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3420     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3421 
3422     __ aesdec(xmm_result, xmm_temp1);
3423     __ aesdec(xmm_result, xmm_temp2);
3424     __ aesdec(xmm_result, xmm_temp3);
3425     __ aesdec(xmm_result, xmm_temp4);
3426 
3427     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3428     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3429     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3430 
3431     __ cmpl(keylen, 44);
3432     __ jccb(Assembler::equal, L_doLast);
3433 
3434     __ aesdec(xmm_result, xmm_temp1);
3435     __ aesdec(xmm_result, xmm_temp2);
3436 
3437     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3438     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3439 
3440     __ cmpl(keylen, 52);
3441     __ jccb(Assembler::equal, L_doLast);
3442 
3443     __ aesdec(xmm_result, xmm_temp1);
3444     __ aesdec(xmm_result, xmm_temp2);
3445 
3446     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3447     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3448 
3449     __ BIND(L_doLast);
3450     __ aesdec(xmm_result, xmm_temp1);
3451     __ aesdec(xmm_result, xmm_temp2);
3452 
3453     // for decryption the aesdeclast operation is always on key+0x00
3454     __ aesdeclast(xmm_result, xmm_temp3);
3455     __ movdqu(Address(to, 0), xmm_result);  // store the result
3456     __ xorptr(rax, rax); // return 0
3457     __ leave(); // required for proper stackwalking of RuntimeStub frame
3458     __ ret(0);
3459 
3460     return start;
3461   }
3462 
3463 
3464   // Arguments:
3465   //
3466   // Inputs:
3467   //   c_rarg0   - source byte array address
3468   //   c_rarg1   - destination byte array address
3469   //   c_rarg2   - K (key) in little endian int array
3470   //   c_rarg3   - r vector byte array address
3471   //   c_rarg4   - input length
3472   //
3473   // Output:
3474   //   rax       - input length
3475   //
3476   address generate_cipherBlockChaining_encryptAESCrypt() {
3477     assert(UseAES, "need AES instructions and misaligned SSE support");
3478     __ align(CodeEntryAlignment);
3479     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3480     address start = __ pc();
3481 
3482     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3483     const Register from        = c_rarg0;  // source array address
3484     const Register to          = c_rarg1;  // destination array address
3485     const Register key         = c_rarg2;  // key array address
3486     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector array address
3487                                            // and left holding the result of the last encryption block
3488 #ifndef _WIN64
3489     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3490 #else
3491     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3492     const Register len_reg     = r11;      // pick the volatile windows register
3493 #endif
3494     const Register pos         = rax;
3495 
3496     // xmm register assignments for the loops below
3497     const XMMRegister xmm_result = xmm0;
3498     const XMMRegister xmm_temp   = xmm1;
3499     // keys 0-10 preloaded into xmm2-xmm12
3500     const int XMM_REG_NUM_KEY_FIRST = 2;
3501     const int XMM_REG_NUM_KEY_LAST  = 15;
3502     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3503     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3504     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3505     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3506     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3507 
3508     __ enter(); // required for proper stackwalking of RuntimeStub frame
3509 
3510     // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
3511     // context for the registers used, since all instructions below operate in 128-bit mode.
3512     // On EVEX without VL and BW, these instructions will all be AVX.
3513     if (VM_Version::supports_avx512vlbw()) {
3514       __ movl(rax, 0xffff);
3515       __ kmovql(k1, rax);
3516     }
3517 
3518 #ifdef _WIN64
3519     // on win64, fill len_reg from stack position
3520     __ movl(len_reg, len_mem);
3521 #else
3522     __ push(len_reg); // Save
3523 #endif
3524 
3525     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3526     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3527     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3528     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3529       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3530       offset += 0x10;
3531     }
3532     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3533 
3534     // now split to different paths depending on the keylen (len in ints of the AESCrypt.KLE array: 44=128, 52=192, 60=256)
3535     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3536     __ cmpl(rax, 44);
3537     __ jcc(Assembler::notEqual, L_key_192_256);
3538 
3539     // 128 bit code follows here
3540     __ movptr(pos, 0);
3541     __ align(OptoLoopAlignment);
3542 
3543     __ BIND(L_loopTop_128);
3544     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3545     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3546     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3547     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3548       __ aesenc(xmm_result, as_XMMRegister(rnum));
3549     }
3550     __ aesenclast(xmm_result, xmm_key10);
3551     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3552     // no need to store r to memory until we exit
3553     __ addptr(pos, AESBlockSize);
3554     __ subptr(len_reg, AESBlockSize);
3555     __ jcc(Assembler::notEqual, L_loopTop_128);
3556 
3557     __ BIND(L_exit);
3558     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3559 
3560 #ifdef _WIN64
3561     __ movl(rax, len_mem);
3562 #else
3563     __ pop(rax); // return length
3564 #endif
3565     __ leave(); // required for proper stackwalking of RuntimeStub frame
3566     __ ret(0);
3567 
3568     __ BIND(L_key_192_256);
3569     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3570     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3571     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3572     __ cmpl(rax, 52);
3573     __ jcc(Assembler::notEqual, L_key_256);
3574 
3575     // 192-bit code follows here (could be changed to use more xmm registers)
3576     __ movptr(pos, 0);
3577     __ align(OptoLoopAlignment);
3578 
3579     __ BIND(L_loopTop_192);
3580     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3581     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3582     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3583     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3584       __ aesenc(xmm_result, as_XMMRegister(rnum));
3585     }
3586     __ aesenclast(xmm_result, xmm_key12);
3587     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3588     // no need to store r to memory until we exit
3589     __ addptr(pos, AESBlockSize);
3590     __ subptr(len_reg, AESBlockSize);
3591     __ jcc(Assembler::notEqual, L_loopTop_192);
3592     __ jmp(L_exit);
3593 
3594     __ BIND(L_key_256);
3595     // 256-bit code follows here (could be changed to use more xmm registers)
3596     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3597     __ movptr(pos, 0);
3598     __ align(OptoLoopAlignment);
3599 
3600     __ BIND(L_loopTop_256);
3601     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3602     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3603     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3604     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3605       __ aesenc(xmm_result, as_XMMRegister(rnum));
3606     }
3607     load_key(xmm_temp, key, 0xe0);
3608     __ aesenclast(xmm_result, xmm_temp);
3609     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3610     // no need to store r to memory until we exit
3611     __ addptr(pos, AESBlockSize);
3612     __ subptr(len_reg, AESBlockSize);
3613     __ jcc(Assembler::notEqual, L_loopTop_256);
3614     __ jmp(L_exit);
3615 
3616     return start;
3617   }
3618 
3619   // Safefetch stubs.
3620   void generate_safefetch(const char* name, int size, address* entry,
3621                           address* fault_pc, address* continuation_pc) {
3622     // safefetch signatures:
3623     //   int      SafeFetch32(int*      adr, int      errValue);
3624     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3625     //
3626     // arguments:
3627     //   c_rarg0 = adr
3628     //   c_rarg1 = errValue
3629     //
3630     // result:
3631     //   rax      = *adr or errValue
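         //
         // Illustrative use, per the signatures above:
         //   int v = SafeFetch32(adr, -1);   // yields *adr, or -1 if the load faults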
3632 
3633     StubCodeMark mark(this, "StubRoutines", name);
3634 
3635     // Entry point, pc or function descriptor.
3636     *entry = __ pc();
3637 
3638     // Load *adr into c_rarg1, may fault.
3639     *fault_pc = __ pc();
3640     switch (size) {
3641       case 4:
3642         // int32_t
3643         __ movl(c_rarg1, Address(c_rarg0, 0));
3644         break;
3645       case 8:
3646         // int64_t
3647         __ movq(c_rarg1, Address(c_rarg0, 0));
3648         break;
3649       default:
3650         ShouldNotReachHere();
3651     }
3652 
3653     // return errValue or *adr
3654     *continuation_pc = __ pc();
3655     __ movq(rax, c_rarg1);
3656     __ ret(0);
3657   }
3658 
3659   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3660   // to hide instruction latency
3661   //
3662   // Arguments:
3663   //
3664   // Inputs:
3665   //   c_rarg0   - source byte array address
3666   //   c_rarg1   - destination byte array address
3667   //   c_rarg2   - K (key) in little endian int array
3668   //   c_rarg3   - r vector byte array address
3669   //   c_rarg4   - input length
3670   //
3671   // Output:
3672   //   rax       - input length
3673   //
3674   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3675     assert(UseAES, "need AES instructions and misaligned SSE support");
3676     __ align(CodeEntryAlignment);
3677     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3678     address start = __ pc();
3679 
3680     const Register from        = c_rarg0;  // source array address
3681     const Register to          = c_rarg1;  // destination array address
3682     const Register key         = c_rarg2;  // key array address
3683     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector array address
3684                                            // and left holding the result of the last encryption block
3685 #ifndef _WIN64
3686     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3687 #else
3688     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3689     const Register len_reg     = r11;      // pick the volatile windows register
3690 #endif
3691     const Register pos         = rax;
3692 
3693     const int PARALLEL_FACTOR = 4;
3694     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3695 
3696     Label L_exit;
3697     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3698     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3699     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3700     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3701     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3702 
3703     // keys 0-10 preloaded into xmm5-xmm15
3704     const int XMM_REG_NUM_KEY_FIRST = 5;
3705     const int XMM_REG_NUM_KEY_LAST  = 15;
3706     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3707     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3708 
3709     __ enter(); // required for proper stackwalking of RuntimeStub frame
3710 
3711     // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
3712     // context for the registers used, since all instructions below operate in 128-bit mode.
3713     // On EVEX without VL and BW, these instructions will all be AVX.
3714     if (VM_Version::supports_avx512vlbw()) {
3715       __ movl(rax, 0xffff);
3716       __ kmovql(k1, rax);
3717     }
3718 
3719 #ifdef _WIN64
3720     // on win64, fill len_reg from stack position
3721     __ movl(len_reg, len_mem);
3722 #else
3723     __ push(len_reg); // Save
3724 #endif
3725     __ push(rbx);
3726     // The Java expanded key ordering is rotated one position from what we want,
3727     // so we start from 0x10 here and hit 0x00 last.
3728     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3729     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3730     // load up xmm regs 5 thru 14 with the round keys 0x10 - 0xa0; key 0x00 goes into xmm15 below
3731     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3732       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3733       offset += 0x10;
3734     }
3735     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3736 
3737     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3738 
3739     // registers holding the four results in the parallelized loop
3740     const XMMRegister xmm_result0 = xmm0;
3741     const XMMRegister xmm_result1 = xmm2;
3742     const XMMRegister xmm_result2 = xmm3;
3743     const XMMRegister xmm_result3 = xmm4;
3744 
3745     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3746 
3747     __ xorptr(pos, pos);
3748 
3749     // now split to different paths depending on the keylen (len in ints of the AESCrypt.KLE array: 44=128, 52=192, 60=256)
3750     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3751     __ cmpl(rbx, 52);
3752     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3753     __ cmpl(rbx, 60);
3754     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3755 
3756 #define DoFour(opc, src_reg)           \
3757   __ opc(xmm_result0, src_reg);         \
3758   __ opc(xmm_result1, src_reg);         \
3759   __ opc(xmm_result2, src_reg);         \
3760   __ opc(xmm_result3, src_reg);         \
3761 
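         // For example, DoFour(aesdec, xmm_key_first) expands to one aesdec per
         // result register:
         //   __ aesdec(xmm_result0, xmm_key_first);
         //   __ aesdec(xmm_result1, xmm_key_first);
         //   __ aesdec(xmm_result2, xmm_key_first);
         //   __ aesdec(xmm_result3, xmm_key_first);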
3762     for (int k = 0; k < 3; ++k) {
3763       __ BIND(L_multiBlock_loopTopHead[k]);
3764       if (k != 0) {
3765         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3766         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3767       }
3768       if (k == 1) {
3769         __ subptr(rsp, 6 * wordSize);
3770         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3771         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3772         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3773         load_key(xmm1, key, 0xc0);  // 0xc0;
3774         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3775       } else if (k == 2) {
3776         __ subptr(rsp, 10 * wordSize);
3777         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3778         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3779         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3780         load_key(xmm1, key, 0xe0);  // 0xe0;
3781         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3782         load_key(xmm15, key, 0xb0); // 0xb0;
3783         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3784         load_key(xmm1, key, 0xc0);  // 0xc0;
3785         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3786       }
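           // Stack layout of the spilled round keys at this point (k != 0 only;
           // a descriptive summary of the stores above):
           //   [rsp + 0]            original xmm15 (the last key, key offset 0x00)
           //   [rsp + 2*wordSize]   round key 0xb0
           //   [rsp + 4*wordSize]   round key 0xc0
           //   [rsp + 6*wordSize]   round key 0xd0   (k == 2 only)
           //   [rsp + 8*wordSize]   round key 0xe0   (k == 2 only)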
3787       __ align(OptoLoopAlignment);
3788       __ BIND(L_multiBlock_loopTop[k]);
3789       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3790       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3791 
3792       if  (k != 0) {
3793         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3794         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3795       }
3796 
3797       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
3798       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3799       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3800       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3801 
3802       DoFour(pxor, xmm_key_first);
3803       if (k == 0) {
3804         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3805           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3806         }
3807         DoFour(aesdeclast, xmm_key_last);
3808       } else if (k == 1) {
3809         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3810           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3811         }
3812         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3813         DoFour(aesdec, xmm1);  // key : 0xc0
3814         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3815         DoFour(aesdeclast, xmm_key_last);
3816       } else if (k == 2) {
3817         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3818           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3819         }
3820         DoFour(aesdec, xmm1);  // key : 0xc0
3821         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3822         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3823         DoFour(aesdec, xmm15);  // key : 0xd0
3824         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3825         DoFour(aesdec, xmm1);  // key : 0xe0
3826         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3827         DoFour(aesdeclast, xmm_key_last);
3828       }
3829 
3830       // for each result, xor with the r vector of previous cipher block
3831       __ pxor(xmm_result0, xmm_prev_block_cipher);
3832       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3833       __ pxor(xmm_result1, xmm_prev_block_cipher);
3834       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3835       __ pxor(xmm_result2, xmm_prev_block_cipher);
3836       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3837       __ pxor(xmm_result3, xmm_prev_block_cipher);
3838       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
3839       if (k != 0) {
3840         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3841       }
3842 
3843       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3844       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3845       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3846       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3847 
3848       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3849       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3850       __ jmp(L_multiBlock_loopTop[k]);
3851 
3852       // registers used in the non-parallelized loops
3853       // xmm register assignments for the loops below
3854       const XMMRegister xmm_result = xmm0;
3855       const XMMRegister xmm_prev_block_cipher_save = xmm2;
3856       const XMMRegister xmm_key11 = xmm3;
3857       const XMMRegister xmm_key12 = xmm4;
3858       const XMMRegister key_tmp = xmm4;
3859 
3860       __ BIND(L_singleBlock_loopTopHead[k]);
3861       if (k == 1) {
3862         __ addptr(rsp, 6 * wordSize);
3863       } else if (k == 2) {
3864         __ addptr(rsp, 10 * wordSize);
3865       }
3866       __ cmpptr(len_reg, 0); // any blocks left??
3867       __ jcc(Assembler::equal, L_exit);
3868       __ BIND(L_singleBlock_loopTopHead2[k]);
3869       if (k == 1) {
3870         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3871         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3872       }
3873       if (k == 2) {
3874         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3875       }
3876       __ align(OptoLoopAlignment);
3877       __ BIND(L_singleBlock_loopTop[k]);
3878       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3879       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3880       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3881       for (int rnum = 1; rnum <= 9 ; rnum++) {
3882           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3883       }
3884       if (k == 1) {
3885         __ aesdec(xmm_result, xmm_key11);
3886         __ aesdec(xmm_result, xmm_key12);
3887       }
3888       if (k == 2) {
3889         __ aesdec(xmm_result, xmm_key11);
3890         load_key(key_tmp, key, 0xc0);
3891         __ aesdec(xmm_result, key_tmp);
3892         load_key(key_tmp, key, 0xd0);
3893         __ aesdec(xmm_result, key_tmp);
3894         load_key(key_tmp, key, 0xe0);
3895         __ aesdec(xmm_result, key_tmp);
3896       }
3897 
3898       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3899       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3900       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3901       // no need to store r to memory until we exit
3902       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3903       __ addptr(pos, AESBlockSize);
3904       __ subptr(len_reg, AESBlockSize);
3905       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3906       if (k != 2) {
3907         __ jmp(L_exit);
3908       }
3909     } //for 128/192/256
3910 
3911     __ BIND(L_exit);
3912     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3913     __ pop(rbx);
3914 #ifdef _WIN64
3915     __ movl(rax, len_mem);
3916 #else
3917     __ pop(rax); // return length
3918 #endif
3919     __ leave(); // required for proper stackwalking of RuntimeStub frame
3920     __ ret(0);
3921     return start;
3922   }
3923 
3924   address generate_upper_word_mask() {
3925     __ align(64);
3926     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3927     address start = __ pc();
3928     __ emit_data64(0x0000000000000000, relocInfo::none);
3929     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3930     return start;
3931   }
3932 
3933   address generate_shuffle_byte_flip_mask() {
3934     __ align(64);
3935     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3936     address start = __ pc();
3937     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3938     __ emit_data64(0x0001020304050607, relocInfo::none);
3939     return start;
3940   }
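       // Note on the data above: applied with pshufb, this mask reverses all 16
       // bytes of an XMM register (byte 15 -> byte 0, ...), presumably so the
       // SHA code can byte-swap the big-endian message words of the input block.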
3941 
3942   // ofs and limit are used for the multi-block byte array.
3943   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3944   address generate_sha1_implCompress(bool multi_block, const char *name) {
3945     __ align(CodeEntryAlignment);
3946     StubCodeMark mark(this, "StubRoutines", name);
3947     address start = __ pc();
3948 
3949     Register buf = c_rarg0;
3950     Register state = c_rarg1;
3951     Register ofs = c_rarg2;
3952     Register limit = c_rarg3;
3953 
3954     const XMMRegister abcd = xmm0;
3955     const XMMRegister e0 = xmm1;
3956     const XMMRegister e1 = xmm2;
3957     const XMMRegister msg0 = xmm3;
3958 
3959     const XMMRegister msg1 = xmm4;
3960     const XMMRegister msg2 = xmm5;
3961     const XMMRegister msg3 = xmm6;
3962     const XMMRegister shuf_mask = xmm7;
3963 
3964     __ enter();
3965 
3966     __ subptr(rsp, 4 * wordSize);
3967 
3968     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3969       buf, state, ofs, limit, rsp, multi_block);
3970 
3971     __ addptr(rsp, 4 * wordSize);
3972 
3973     __ leave();
3974     __ ret(0);
3975     return start;
3976   }
3977 
3978   address generate_pshuffle_byte_flip_mask() {
3979     __ align(64);
3980     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3981     address start = __ pc();
3982     __ emit_data64(0x0405060700010203, relocInfo::none);
3983     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3984 
3985     if (VM_Version::supports_avx2()) {
3986       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3987       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3988       // _SHUF_00BA
3989       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3990       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3991       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3992       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3993       // _SHUF_DC00
3994       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3995       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3996       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3997       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3998     }
3999 
4000     return start;
4001   }
4002 
  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4004   address generate_pshuffle_byte_flip_mask_sha512() {
4005     __ align(32);
4006     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4007     address start = __ pc();
4008     if (VM_Version::supports_avx2()) {
4009       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4010       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4011       __ emit_data64(0x1011121314151617, relocInfo::none);
4012       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4013       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4014       __ emit_data64(0x0000000000000000, relocInfo::none);
4015       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4016       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4017     }
4018 
4019     return start;
4020   }
4021 
// ofs and limit are used for the multi-block byte array.
4023 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4024   address generate_sha256_implCompress(bool multi_block, const char *name) {
4025     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4026     __ align(CodeEntryAlignment);
4027     StubCodeMark mark(this, "StubRoutines", name);
4028     address start = __ pc();
4029 
4030     Register buf = c_rarg0;
4031     Register state = c_rarg1;
4032     Register ofs = c_rarg2;
4033     Register limit = c_rarg3;
4034 
4035     const XMMRegister msg = xmm0;
4036     const XMMRegister state0 = xmm1;
4037     const XMMRegister state1 = xmm2;
4038     const XMMRegister msgtmp0 = xmm3;
4039 
4040     const XMMRegister msgtmp1 = xmm4;
4041     const XMMRegister msgtmp2 = xmm5;
4042     const XMMRegister msgtmp3 = xmm6;
4043     const XMMRegister msgtmp4 = xmm7;
4044 
4045     const XMMRegister shuf_mask = xmm8;
4046 
4047     __ enter();
4048 
4049     __ subptr(rsp, 4 * wordSize);
4050 
4051     if (VM_Version::supports_sha()) {
4052       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4053         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4054     } else if (VM_Version::supports_avx2()) {
4055       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4056         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4057     }
4058     __ addptr(rsp, 4 * wordSize);
4059     __ vzeroupper();
4060     __ leave();
4061     __ ret(0);
4062     return start;
4063   }
4064 
4065   address generate_sha512_implCompress(bool multi_block, const char *name) {
4066     assert(VM_Version::supports_avx2(), "");
4067     assert(VM_Version::supports_bmi2(), "");
4068     __ align(CodeEntryAlignment);
4069     StubCodeMark mark(this, "StubRoutines", name);
4070     address start = __ pc();
4071 
4072     Register buf = c_rarg0;
4073     Register state = c_rarg1;
4074     Register ofs = c_rarg2;
4075     Register limit = c_rarg3;
4076 
4077     const XMMRegister msg = xmm0;
4078     const XMMRegister state0 = xmm1;
4079     const XMMRegister state1 = xmm2;
4080     const XMMRegister msgtmp0 = xmm3;
4081     const XMMRegister msgtmp1 = xmm4;
4082     const XMMRegister msgtmp2 = xmm5;
4083     const XMMRegister msgtmp3 = xmm6;
4084     const XMMRegister msgtmp4 = xmm7;
4085 
4086     const XMMRegister shuf_mask = xmm8;
4087 
4088     __ enter();
4089 
4090     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4091     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4092 
4093     __ vzeroupper();
4094     __ leave();
4095     __ ret(0);
4096     return start;
4097   }
4098 
4099   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4100   // to hide instruction latency
4101   //
4102   // Arguments:
4103   //
4104   // Inputs:
4105   //   c_rarg0   - source byte array address
4106   //   c_rarg1   - destination byte array address
4107   //   c_rarg2   - K (key) in little endian int array
4108   //   c_rarg3   - counter vector byte array address
4109   //   Linux
4110   //     c_rarg4   -          input length
4111   //     c_rarg5   -          saved encryptedCounter start
4112   //     rbp + 6 * wordSize - saved used length
4113   //   Windows
4114   //     rbp + 6 * wordSize - input length
4115   //     rbp + 7 * wordSize - saved encryptedCounter start
4116   //     rbp + 8 * wordSize - saved used length
4117   //
4118   // Output:
4119   //   rax       - input length
4120   //
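  // Roughly, each iteration computes (a sketch; the real loop below unrolls this
  // six ways and keeps the counter in shuffled form):
  //   E      = AES_encrypt(counter, key)   // encrypt the counter block
  //   out[i] = in[i] ^ E                   // XOR the keystream with the input
  //   counter is then incremented for the next block
  // A partially consumed keystream block and its 'used' count are carried across
  // calls so a later invocation can resume mid-block.
  //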
4121   address generate_counterMode_AESCrypt_Parallel() {
4122     assert(UseAES, "need AES instructions and misaligned SSE support");
4123     __ align(CodeEntryAlignment);
4124     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4125     address start = __ pc();
4126     const Register from = c_rarg0; // source array address
4127     const Register to = c_rarg1; // destination array address
4128     const Register key = c_rarg2; // key array address
4129     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4130                                       // and updated with the incremented counter in the end
4131 #ifndef _WIN64
4132     const Register len_reg = c_rarg4;
4133     const Register saved_encCounter_start = c_rarg5;
4134     const Register used_addr = r10;
4135     const Address  used_mem(rbp, 2 * wordSize);
4136     const Register used = r11;
4137 #else
4138     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter start is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize); // used length is on stack on Win64
4141     const Register len_reg = r10; // pick the first volatile windows register
4142     const Register saved_encCounter_start = r11;
4143     const Register used_addr = r13;
4144     const Register used = r14;
4145 #endif
4146     const Register pos = rax;
4147 
4148     const int PARALLEL_FACTOR = 6;
4149     const XMMRegister xmm_counter_shuf_mask = xmm0;
4150     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4151     const XMMRegister xmm_curr_counter = xmm2;
4152 
4153     const XMMRegister xmm_key_tmp0 = xmm3;
4154     const XMMRegister xmm_key_tmp1 = xmm4;
4155 
    // registers holding the six results in the parallelized loop
4157     const XMMRegister xmm_result0 = xmm5;
4158     const XMMRegister xmm_result1 = xmm6;
4159     const XMMRegister xmm_result2 = xmm7;
4160     const XMMRegister xmm_result3 = xmm8;
4161     const XMMRegister xmm_result4 = xmm9;
4162     const XMMRegister xmm_result5 = xmm10;
4163 
4164     const XMMRegister xmm_from0 = xmm11;
4165     const XMMRegister xmm_from1 = xmm12;
4166     const XMMRegister xmm_from2 = xmm13;
    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; it has to be preserved on Win64
    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3/xmm4: xmm_key_tmp0/1 are no longer needed once the input text is loaded
    const XMMRegister xmm_from5 = xmm4;
4170 
4171     //for key_128, key_192, key_256
4172     const int rounds[3] = {10, 12, 14};
4173     Label L_exit_preLoop, L_preLoop_start;
4174     Label L_multiBlock_loopTop[3];
4175     Label L_singleBlockLoopTop[3];
4176     Label L__incCounter[3][6]; //for 6 blocks
4177     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4178     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4179     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4180 
4181     Label L_exit;
4182 
4183     __ enter(); // required for proper stackwalking of RuntimeStub frame
4184 
4185     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
4186     // context for the registers used, where all instructions below are using 128-bit mode
4187     // On EVEX without VL and BW, these instructions will all be AVX.
4188     if (VM_Version::supports_avx512vlbw()) {
4189         __ movl(rax, 0xffff);
4190         __ kmovql(k1, rax);
4191     }
4192 
4193 #ifdef _WIN64
4194     // allocate spill slots for r13, r14
4195     enum {
4196         saved_r13_offset,
4197         saved_r14_offset
4198     };
4199     __ subptr(rsp, 2 * wordSize);
4200     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4201     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4202 
4203     // on win64, fill len_reg from stack position
4204     __ movl(len_reg, len_mem);
4205     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4206     __ movptr(used_addr, used_mem);
4207     __ movl(used, Address(used_addr, 0));
4208 #else
4209     __ push(len_reg); // Save
4210     __ movptr(used_addr, used_mem);
4211     __ movl(used, Address(used_addr, 0));
4212 #endif
4213 
4214     __ push(rbx); // Save RBX
4215     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4216     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4217     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4218     __ movptr(pos, 0);
4219 
    // Use the partially consumed encrypted counter from the last invocation
4221     __ BIND(L_preLoop_start);
4222     __ cmpptr(used, 16);
4223     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4224       __ cmpptr(len_reg, 0);
4225       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4226       __ movb(rbx, Address(saved_encCounter_start, used));
4227       __ xorb(rbx, Address(from, pos));
4228       __ movb(Address(to, pos), rbx);
4229       __ addptr(pos, 1);
4230       __ addptr(used, 1);
4231       __ subptr(len_reg, 1);
4232 
4233     __ jmp(L_preLoop_start);
4234 
4235     __ BIND(L_exit_preLoop);
4236     __ movl(Address(used_addr, 0), used);
4237 
    // the expanded key length (in ints) can only be {11, 13, 15} * 4 = {44, 52, 60}
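    // (AES-128 expands to 11 round keys, AES-192 to 13 and AES-256 to 15, each of
    // 4 ints, hence 44/52/60.)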
4239     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4240     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4241     __ cmpl(rbx, 52);
4242     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4243     __ cmpl(rbx, 60);
4244     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4245 
4246 #define CTR_DoSix(opc, src_reg)                \
4247     __ opc(xmm_result0, src_reg);              \
4248     __ opc(xmm_result1, src_reg);              \
4249     __ opc(xmm_result2, src_reg);              \
4250     __ opc(xmm_result3, src_reg);              \
4251     __ opc(xmm_result4, src_reg);              \
4252     __ opc(xmm_result5, src_reg);
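    // CTR_DoSix(op, src) emits the given instruction once per in-flight counter
    // block, so each round key or shuffle mask is applied to all six blocks before
    // moving on to the next one.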
4253 
4254     // k == 0 :  generate code for key_128
4255     // k == 1 :  generate code for key_192
4256     // k == 2 :  generate code for key_256
4257     for (int k = 0; k < 3; ++k) {
4258       //multi blocks starts here
4259       __ align(OptoLoopAlignment);
4260       __ BIND(L_multiBlock_loopTop[k]);
4261       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4262       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4263       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4264 
      // load, then increment the counters
4266       CTR_DoSix(movdqa, xmm_curr_counter);
4267       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4268       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4269       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4270       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4271       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4272       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle the counters back for PXOR
4274       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4275 
      // load two round keys at a time
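      // For AES-128 (rounds[k] == 10) this walks the key pairs (1,2), (3,4), ..., (9,10)
      // and uses aesenclast with the final key; AES-192 and AES-256 simply run two or
      // four extra rounds before their last key.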
4277       for (int i = 1; i < rounds[k]; ) {
4278         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4279         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4280         CTR_DoSix(aesenc, xmm_key_tmp1);
4281         i++;
4282         if (i != rounds[k]) {
4283           CTR_DoSix(aesenc, xmm_key_tmp0);
4284         } else {
4285           CTR_DoSix(aesenclast, xmm_key_tmp0);
4286         }
4287         i++;
4288       }
4289 
4290       // get next PARALLEL_FACTOR blocks into xmm_result registers
4291       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4292       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4293       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4294       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4295       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4296       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4297 
4298       __ pxor(xmm_result0, xmm_from0);
4299       __ pxor(xmm_result1, xmm_from1);
4300       __ pxor(xmm_result2, xmm_from2);
4301       __ pxor(xmm_result3, xmm_from3);
4302       __ pxor(xmm_result4, xmm_from4);
4303       __ pxor(xmm_result5, xmm_from5);
4304 
      // store 6 results into the next 96 bytes of output
4306       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4307       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4308       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4309       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4310       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4311       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4312 
      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance past the blocks just processed
4314       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4315       __ jmp(L_multiBlock_loopTop[k]);
4316 
4317       // singleBlock starts here
4318       __ align(OptoLoopAlignment);
4319       __ BIND(L_singleBlockLoopTop[k]);
4320       __ cmpptr(len_reg, 0);
4321       __ jcc(Assembler::lessEqual, L_exit);
4322       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4323       __ movdqa(xmm_result0, xmm_curr_counter);
4324       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4325       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4326       __ pxor(xmm_result0, xmm_key_tmp0);
4327       for (int i = 1; i < rounds[k]; i++) {
4328         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4329         __ aesenc(xmm_result0, xmm_key_tmp0);
4330       }
4331       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4332       __ aesenclast(xmm_result0, xmm_key_tmp0);
4333       __ cmpptr(len_reg, AESBlockSize);
4334       __ jcc(Assembler::less, L_processTail_insr[k]);
4335         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4336         __ pxor(xmm_result0, xmm_from0);
4337         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4338         __ addptr(pos, AESBlockSize);
4339         __ subptr(len_reg, AESBlockSize);
4340         __ jmp(L_singleBlockLoopTop[k]);
4341       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4342         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
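        // For example, a 7-byte tail builds the block from its high bytes down:
        // it inserts src bytes 3..6 as a dword, shifts the register left and inserts
        // bytes 1..2 as a word, shifts again and inserts byte 0, leaving the 7
        // plaintext bytes contiguous in the low lanes of xmm_from0.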
4343         __ testptr(len_reg, 8);
4344         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4345           __ subptr(pos,8);
4346           __ pinsrq(xmm_from0, Address(from, pos), 0);
4347         __ BIND(L_processTail_4_insr[k]);
4348         __ testptr(len_reg, 4);
4349         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4350           __ subptr(pos,4);
4351           __ pslldq(xmm_from0, 4);
4352           __ pinsrd(xmm_from0, Address(from, pos), 0);
4353         __ BIND(L_processTail_2_insr[k]);
4354         __ testptr(len_reg, 2);
4355         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4356           __ subptr(pos, 2);
4357           __ pslldq(xmm_from0, 2);
4358           __ pinsrw(xmm_from0, Address(from, pos), 0);
4359         __ BIND(L_processTail_1_insr[k]);
4360         __ testptr(len_reg, 1);
4361         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4362           __ subptr(pos, 1);
4363           __ pslldq(xmm_from0, 1);
4364           __ pinsrb(xmm_from0, Address(from, pos), 0);
4365         __ BIND(L_processTail_exit_insr[k]);
4366 
        __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext bytes.
        __ pxor(xmm_result0, xmm_from0);                             //    The encrypted counter is also saved for the next invocation.
4369 
4370         __ testptr(len_reg, 8);
4371         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4372           __ pextrq(Address(to, pos), xmm_result0, 0);
4373           __ psrldq(xmm_result0, 8);
4374           __ addptr(pos, 8);
4375         __ BIND(L_processTail_4_extr[k]);
4376         __ testptr(len_reg, 4);
4377         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4378           __ pextrd(Address(to, pos), xmm_result0, 0);
4379           __ psrldq(xmm_result0, 4);
4380           __ addptr(pos, 4);
4381         __ BIND(L_processTail_2_extr[k]);
4382         __ testptr(len_reg, 2);
4383         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4384           __ pextrw(Address(to, pos), xmm_result0, 0);
4385           __ psrldq(xmm_result0, 2);
4386           __ addptr(pos, 2);
4387         __ BIND(L_processTail_1_extr[k]);
4388         __ testptr(len_reg, 1);
4389         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4390           __ pextrb(Address(to, pos), xmm_result0, 0);
4391 
4392         __ BIND(L_processTail_exit_extr[k]);
4393         __ movl(Address(used_addr, 0), len_reg);
4394         __ jmp(L_exit);
4395 
4396     }
4397 
4398     __ BIND(L_exit);
4399     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4400     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4401     __ pop(rbx); // pop the saved RBX.
4402 #ifdef _WIN64
4403     __ movl(rax, len_mem);
4404     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4405     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4406     __ addptr(rsp, 2 * wordSize);
4407 #else
4408     __ pop(rax); // return 'len'
4409 #endif
4410     __ leave(); // required for proper stackwalking of RuntimeStub frame
4411     __ ret(0);
4412     return start;
4413   }
4414 
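// roundDec/roundDeclast apply one AES decryption round, via VAES, to the eight
// 512-bit working registers xmm1..xmm8; with four 16-byte blocks per register that
// advances 32 CBC blocks per round key.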
4415 void roundDec(XMMRegister xmm_reg) {
4416   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4417   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4418   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4419   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4420   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4421   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4422   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4423   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4424 }
4425 
4426 void roundDeclast(XMMRegister xmm_reg) {
4427   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4428   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4429   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4430   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4431   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4432   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4433   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4434   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4435 }
4436 
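  // Load one 128-bit round key, apply the key byte shuffle, and broadcast it to all
  // four 128-bit lanes of the 512-bit destination, so a single key register can be
  // used against four blocks at once.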
4437   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4438     __ movdqu(xmmdst, Address(key, offset));
4439     if (xmm_shuf_mask != NULL) {
4440       __ pshufb(xmmdst, xmm_shuf_mask);
4441     } else {
4442       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4443     }
4444     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4445 
4446   }
4447 
4448 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4449     assert(VM_Version::supports_vaes(), "need AES instructions and misaligned SSE support");
4450     __ align(CodeEntryAlignment);
4451     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4452     address start = __ pc();
4453 
4454     const Register from = c_rarg0;  // source array address
4455     const Register to = c_rarg1;  // destination array address
4456     const Register key = c_rarg2;  // key array address
4457     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
    // and updated on exit with the last ciphertext block (the IV for the next call)
4459 #ifndef _WIN64
4460     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4461 #else
4462     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4463     const Register len_reg = r11;      // pick the volatile windows register
4464 #endif
4465 
4466     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4467           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4468 
4469     __ enter();
4470 
4471 #ifdef _WIN64
4472   // on win64, fill len_reg from stack position
4473     __ movl(len_reg, len_mem);
4474 #else
4475     __ push(len_reg); // Save
4476 #endif
4477     __ push(rbx);
4478     __ vzeroupper();
4479 
4480     // Temporary variable declaration for swapping key bytes
4481     const XMMRegister xmm_key_shuf_mask = xmm1;
4482     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4483 
    // The expanded key length in ints determines the number of rounds: 44 for 10 rounds (AES-128), 52 for 12 rounds (AES-192), 60 for 14 rounds (AES-256)
4485     const Register rounds = rbx;
4486     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4487 
4488     const XMMRegister IV = xmm0;
4489     // Load IV and broadcast value to 512-bits
4490     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4491 
4492     // Temporary variables for storing round keys
4493     const XMMRegister RK0 = xmm30;
4494     const XMMRegister RK1 = xmm9;
4495     const XMMRegister RK2 = xmm18;
4496     const XMMRegister RK3 = xmm19;
4497     const XMMRegister RK4 = xmm20;
4498     const XMMRegister RK5 = xmm21;
4499     const XMMRegister RK6 = xmm22;
4500     const XMMRegister RK7 = xmm23;
4501     const XMMRegister RK8 = xmm24;
4502     const XMMRegister RK9 = xmm25;
4503     const XMMRegister RK10 = xmm26;
4504 
    // Load and shuffle the round keys.
    // The Java expanded key ordering is rotated one position from what we want,
    // so we start from 1*16 here and hit 0*16 last.
4508     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4509     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4510     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4511     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4512     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4513     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4514     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4515     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4516     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4517     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4518     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4519 
4520     // Variables for storing source cipher text
4521     const XMMRegister S0 = xmm10;
4522     const XMMRegister S1 = xmm11;
4523     const XMMRegister S2 = xmm12;
4524     const XMMRegister S3 = xmm13;
4525     const XMMRegister S4 = xmm14;
4526     const XMMRegister S5 = xmm15;
4527     const XMMRegister S6 = xmm16;
4528     const XMMRegister S7 = xmm17;
4529 
4530     // Variables for storing decrypted text
4531     const XMMRegister B0 = xmm1;
4532     const XMMRegister B1 = xmm2;
4533     const XMMRegister B2 = xmm3;
4534     const XMMRegister B3 = xmm4;
4535     const XMMRegister B4 = xmm5;
4536     const XMMRegister B5 = xmm6;
4537     const XMMRegister B6 = xmm7;
4538     const XMMRegister B7 = xmm8;
4539 
4540     __ cmpl(rounds, 44);
4541     __ jcc(Assembler::greater, KEY_192);
4542     __ jmp(Loop);
4543 
4544     __ BIND(KEY_192);
4545     const XMMRegister RK11 = xmm27;
4546     const XMMRegister RK12 = xmm28;
4547     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
4548     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
4549 
4550     __ cmpl(rounds, 52);
4551     __ jcc(Assembler::greater, KEY_256);
4552     __ jmp(Loop);
4553 
4554     __ BIND(KEY_256);
4555     const XMMRegister RK13 = xmm29;
4556     const XMMRegister RK14 = xmm31;
4557     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
4558     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
4559 
4560     __ BIND(Loop);
4561     __ cmpl(len_reg, 512);
4562     __ jcc(Assembler::below, Lcbc_dec_rem);
4563     __ BIND(Loop1);
4564     __ subl(len_reg, 512);
4565     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
4566     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
4567     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
4568     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
4569     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
4570     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
4571     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
4572     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
4573     __ leaq(from, Address(from, 8 * 64));
4574 
4575     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4576     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
4577     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
4578     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
4579     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
4580     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
4581     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
4582     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
4583 
4584     __ evalignq(IV, S0, IV, 0x06);
4585     __ evalignq(S0, S1, S0, 0x06);
4586     __ evalignq(S1, S2, S1, 0x06);
4587     __ evalignq(S2, S3, S2, 0x06);
4588     __ evalignq(S3, S4, S3, 0x06);
4589     __ evalignq(S4, S5, S4, 0x06);
4590     __ evalignq(S5, S6, S5, 0x06);
4591     __ evalignq(S6, S7, S6, 0x06);
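    // Each evalignq (a right shift by 6 qwords across the concatenated register pair)
    // builds the vector of preceding ciphertext blocks for a 4-block group: the last
    // block of the previous group followed by the first three of the current one.
    // These are the CBC chaining values XORed back in after the final round (Loop2).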
4592 
4593     roundDec(RK2);
4594     roundDec(RK3);
4595     roundDec(RK4);
4596     roundDec(RK5);
4597     roundDec(RK6);
4598     roundDec(RK7);
4599     roundDec(RK8);
4600     roundDec(RK9);
4601     roundDec(RK10);
4602 
4603     __ cmpl(rounds, 44);
4604     __ jcc(Assembler::belowEqual, L_128);
4605     roundDec(RK11);
4606     roundDec(RK12);
4607 
4608     __ cmpl(rounds, 52);
4609     __ jcc(Assembler::belowEqual, L_192);
4610     roundDec(RK13);
4611     roundDec(RK14);
4612 
4613     __ BIND(L_256);
4614     roundDeclast(RK0);
4615     __ jmp(Loop2);
4616 
4617     __ BIND(L_128);
4618     roundDeclast(RK0);
4619     __ jmp(Loop2);
4620 
4621     __ BIND(L_192);
4622     roundDeclast(RK0);
4623 
4624     __ BIND(Loop2);
4625     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4626     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
4627     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
4628     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
4629     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
4630     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
4631     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
4632     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
4633     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
4634 
4635     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
4636     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
4637     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
4638     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
4639     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
4640     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
4641     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
4642     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
4643     __ leaq(to, Address(to, 8 * 64));
4644     __ jmp(Loop);
4645 
4646     __ BIND(Lcbc_dec_rem);
4647     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
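    // Move the most recent ciphertext block (lane 3 of IV) into lane 0, where the
    // block-at-a-time remainder loop below expects its chaining value.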
4648 
4649     __ BIND(Lcbc_dec_rem_loop);
4650     __ subl(len_reg, 16);
4651     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
4652 
4653     __ movdqu(S0, Address(from, 0));
4654     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4655     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
4656     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
4657     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
4658     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
4659     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
4660     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
4661     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
4662     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
4663     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
4664     __ cmpl(rounds, 44);
4665     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4666 
4667     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
4668     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
4669     __ cmpl(rounds, 52);
4670     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4671 
4672     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
4673     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
4674 
4675     __ BIND(Lcbc_dec_rem_last);
4676     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
4677 
4678     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4679     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
4680     __ movdqu(Address(to, 0), B0);
4681     __ leaq(from, Address(from, 16));
4682     __ leaq(to, Address(to, 16));
4683     __ jmp(Lcbc_dec_rem_loop);
4684 
4685     __ BIND(Lcbc_dec_ret);
4686     __ movdqu(Address(rvec, 0), IV);
4687 
4688     // Zero out the round keys
4689     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
4690     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
4691     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
4692     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
4693     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
4694     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
4695     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
4696     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
4697     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
4698     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
4699     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
4700     __ cmpl(rounds, 44);
4701     __ jcc(Assembler::belowEqual, Lcbc_exit);
4702     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
4703     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
4704     __ cmpl(rounds, 52);
4705     __ jcc(Assembler::belowEqual, Lcbc_exit);
4706     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
4707     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
4708 
4709     __ BIND(Lcbc_exit);
4710     __ pop(rbx);
4711 #ifdef _WIN64
4712     __ movl(rax, len_mem);
4713 #else
4714     __ pop(rax); // return length
4715 #endif
4716     __ leave(); // required for proper stackwalking of RuntimeStub frame
4717     __ ret(0);
4718     return start;
4719 }
4720 
4721   // byte swap x86 long
4722   address generate_ghash_long_swap_mask() {
4723     __ align(CodeEntryAlignment);
4724     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4725     address start = __ pc();
4726     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4727     __ emit_data64(0x0706050403020100, relocInfo::none );
4728   return start;
4729   }
4730 
4731   // byte swap x86 byte array
4732   address generate_ghash_byte_swap_mask() {
4733     __ align(CodeEntryAlignment);
4734     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4735     address start = __ pc();
4736     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4737     __ emit_data64(0x0001020304050607, relocInfo::none );
4738   return start;
4739   }
4740 
4741   /* Single and multi-block ghash operations */
4742   address generate_ghash_processBlocks() {
4743     __ align(CodeEntryAlignment);
4744     Label L_ghash_loop, L_exit;
4745     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4746     address start = __ pc();
4747 
4748     const Register state        = c_rarg0;
4749     const Register subkeyH      = c_rarg1;
4750     const Register data         = c_rarg2;
4751     const Register blocks       = c_rarg3;
4752 
4753     const XMMRegister xmm_temp0 = xmm0;
4754     const XMMRegister xmm_temp1 = xmm1;
4755     const XMMRegister xmm_temp2 = xmm2;
4756     const XMMRegister xmm_temp3 = xmm3;
4757     const XMMRegister xmm_temp4 = xmm4;
4758     const XMMRegister xmm_temp5 = xmm5;
4759     const XMMRegister xmm_temp6 = xmm6;
4760     const XMMRegister xmm_temp7 = xmm7;
4761     const XMMRegister xmm_temp8 = xmm8;
4762     const XMMRegister xmm_temp9 = xmm9;
4763     const XMMRegister xmm_temp10 = xmm10;
4764 
4765     __ enter();
4766 
4767     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
4768     // context for the registers used, where all instructions below are using 128-bit mode
4769     // On EVEX without VL and BW, these instructions will all be AVX.
4770     if (VM_Version::supports_avx512vlbw()) {
4771       __ movl(rax, 0xffff);
4772       __ kmovql(k1, rax);
4773     }
4774 
4775     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4776 
4777     __ movdqu(xmm_temp0, Address(state, 0));
4778     __ pshufb(xmm_temp0, xmm_temp10);
4779 
4780 
4781     __ BIND(L_ghash_loop);
4782     __ movdqu(xmm_temp2, Address(data, 0));
4783     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4784 
4785     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4786     __ pshufb(xmm_temp1, xmm_temp10);
4787 
4788     __ pxor(xmm_temp0, xmm_temp2);
4789 
4790     //
4791     // Multiply with the hash key
4792     //
4793     __ movdqu(xmm_temp3, xmm_temp0);
4794     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
4795     __ movdqu(xmm_temp4, xmm_temp0);
4796     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
4797 
4798     __ movdqu(xmm_temp5, xmm_temp0);
4799     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
4800     __ movdqu(xmm_temp6, xmm_temp0);
4801     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
4802 
4803     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
4804 
4805     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
    __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
    __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
4808     __ pxor(xmm_temp3, xmm_temp5);
4809     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
4810                                         // of the carry-less multiplication of
4811                                         // xmm0 by xmm1.
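    // The four pclmulqdq results assemble the 128x128-bit carry-less product
    //   (a1:a0)*(b1:b0) = a1*b1 << 128  ^  (a1*b0 ^ a0*b1) << 64  ^  a0*b0
    // with the middle term split between the high (xmm6) and low (xmm3) halves.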
4812 
    // We shift the result of the multiplication by one bit position
    // to the left to compensate for the fact that the bits are reversed.
4815     __ movdqu(xmm_temp7, xmm_temp3);
4816     __ movdqu(xmm_temp8, xmm_temp6);
4817     __ pslld(xmm_temp3, 1);
4818     __ pslld(xmm_temp6, 1);
4819     __ psrld(xmm_temp7, 31);
4820     __ psrld(xmm_temp8, 31);
4821     __ movdqu(xmm_temp9, xmm_temp7);
4822     __ pslldq(xmm_temp8, 4);
4823     __ pslldq(xmm_temp7, 4);
4824     __ psrldq(xmm_temp9, 12);
4825     __ por(xmm_temp3, xmm_temp7);
4826     __ por(xmm_temp6, xmm_temp8);
4827     __ por(xmm_temp6, xmm_temp9);
4828 
4829     //
4830     // First phase of the reduction
4831     //
4832     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4833     // independently.
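    // Both reduction phases fold the 256-bit product back to 128 bits modulo the
    // GHASH polynomial x^128 + x^7 + x^2 + x + 1, working on the bit-reflected
    // representation (hence the shift counts 31/30/25 here and 1/2/7 below).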
4834     __ movdqu(xmm_temp7, xmm_temp3);
4835     __ movdqu(xmm_temp8, xmm_temp3);
4836     __ movdqu(xmm_temp9, xmm_temp3);
    __ pslld(xmm_temp7, 31);    // packed left shift by 31
    __ pslld(xmm_temp8, 30);    // packed left shift by 30
    __ pslld(xmm_temp9, 25);    // packed left shift by 25
4840     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
4841     __ pxor(xmm_temp7, xmm_temp9);
4842     __ movdqu(xmm_temp8, xmm_temp7);
4843     __ pslldq(xmm_temp7, 12);
4844     __ psrldq(xmm_temp8, 4);
4845     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
4846 
4847     //
4848     // Second phase of the reduction
4849     //
4850     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4851     // shift operations.
4852     __ movdqu(xmm_temp2, xmm_temp3);
4853     __ movdqu(xmm_temp4, xmm_temp3);
4854     __ movdqu(xmm_temp5, xmm_temp3);
    __ psrld(xmm_temp2, 1);     // packed right shift by 1
    __ psrld(xmm_temp4, 2);     // packed right shift by 2
    __ psrld(xmm_temp5, 7);     // packed right shift by 7
4858     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4859     __ pxor(xmm_temp2, xmm_temp5);
4860     __ pxor(xmm_temp2, xmm_temp8);
4861     __ pxor(xmm_temp3, xmm_temp2);
4862     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4863 
4864     __ decrement(blocks);
4865     __ jcc(Assembler::zero, L_exit);
4866     __ movdqu(xmm_temp0, xmm_temp6);
4867     __ addptr(data, 16);
4868     __ jmp(L_ghash_loop);
4869 
4870     __ BIND(L_exit);
4871     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4872     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4873     __ leave();
4874     __ ret(0);
4875     return start;
4876   }
4877 
  // Base64 character set
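  // Each character is stored zero-extended to 32 bits (two characters per qword
  // below) so the encoder can fetch entries with a dword gather (evpgatherdd)
  // scaled by 4.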
4879   address base64_charset_addr() {
4880     __ align(CodeEntryAlignment);
4881     StubCodeMark mark(this, "StubRoutines", "base64_charset");
4882     address start = __ pc();
4883     __ emit_data64(0x0000004200000041, relocInfo::none);
4884     __ emit_data64(0x0000004400000043, relocInfo::none);
4885     __ emit_data64(0x0000004600000045, relocInfo::none);
4886     __ emit_data64(0x0000004800000047, relocInfo::none);
4887     __ emit_data64(0x0000004a00000049, relocInfo::none);
4888     __ emit_data64(0x0000004c0000004b, relocInfo::none);
4889     __ emit_data64(0x0000004e0000004d, relocInfo::none);
4890     __ emit_data64(0x000000500000004f, relocInfo::none);
4891     __ emit_data64(0x0000005200000051, relocInfo::none);
4892     __ emit_data64(0x0000005400000053, relocInfo::none);
4893     __ emit_data64(0x0000005600000055, relocInfo::none);
4894     __ emit_data64(0x0000005800000057, relocInfo::none);
4895     __ emit_data64(0x0000005a00000059, relocInfo::none);
4896     __ emit_data64(0x0000006200000061, relocInfo::none);
4897     __ emit_data64(0x0000006400000063, relocInfo::none);
4898     __ emit_data64(0x0000006600000065, relocInfo::none);
4899     __ emit_data64(0x0000006800000067, relocInfo::none);
4900     __ emit_data64(0x0000006a00000069, relocInfo::none);
4901     __ emit_data64(0x0000006c0000006b, relocInfo::none);
4902     __ emit_data64(0x0000006e0000006d, relocInfo::none);
4903     __ emit_data64(0x000000700000006f, relocInfo::none);
4904     __ emit_data64(0x0000007200000071, relocInfo::none);
4905     __ emit_data64(0x0000007400000073, relocInfo::none);
4906     __ emit_data64(0x0000007600000075, relocInfo::none);
4907     __ emit_data64(0x0000007800000077, relocInfo::none);
4908     __ emit_data64(0x0000007a00000079, relocInfo::none);
4909     __ emit_data64(0x0000003100000030, relocInfo::none);
4910     __ emit_data64(0x0000003300000032, relocInfo::none);
4911     __ emit_data64(0x0000003500000034, relocInfo::none);
4912     __ emit_data64(0x0000003700000036, relocInfo::none);
4913     __ emit_data64(0x0000003900000038, relocInfo::none);
4914     __ emit_data64(0x0000002f0000002b, relocInfo::none);
4915     return start;
4916   }
4917 
  // Base64 URL character set
4919   address base64url_charset_addr() {
4920     __ align(CodeEntryAlignment);
4921     StubCodeMark mark(this, "StubRoutines", "base64url_charset");
4922     address start = __ pc();
4923     __ emit_data64(0x0000004200000041, relocInfo::none);
4924     __ emit_data64(0x0000004400000043, relocInfo::none);
4925     __ emit_data64(0x0000004600000045, relocInfo::none);
4926     __ emit_data64(0x0000004800000047, relocInfo::none);
4927     __ emit_data64(0x0000004a00000049, relocInfo::none);
4928     __ emit_data64(0x0000004c0000004b, relocInfo::none);
4929     __ emit_data64(0x0000004e0000004d, relocInfo::none);
4930     __ emit_data64(0x000000500000004f, relocInfo::none);
4931     __ emit_data64(0x0000005200000051, relocInfo::none);
4932     __ emit_data64(0x0000005400000053, relocInfo::none);
4933     __ emit_data64(0x0000005600000055, relocInfo::none);
4934     __ emit_data64(0x0000005800000057, relocInfo::none);
4935     __ emit_data64(0x0000005a00000059, relocInfo::none);
4936     __ emit_data64(0x0000006200000061, relocInfo::none);
4937     __ emit_data64(0x0000006400000063, relocInfo::none);
4938     __ emit_data64(0x0000006600000065, relocInfo::none);
4939     __ emit_data64(0x0000006800000067, relocInfo::none);
4940     __ emit_data64(0x0000006a00000069, relocInfo::none);
4941     __ emit_data64(0x0000006c0000006b, relocInfo::none);
4942     __ emit_data64(0x0000006e0000006d, relocInfo::none);
4943     __ emit_data64(0x000000700000006f, relocInfo::none);
4944     __ emit_data64(0x0000007200000071, relocInfo::none);
4945     __ emit_data64(0x0000007400000073, relocInfo::none);
4946     __ emit_data64(0x0000007600000075, relocInfo::none);
4947     __ emit_data64(0x0000007800000077, relocInfo::none);
4948     __ emit_data64(0x0000007a00000079, relocInfo::none);
4949     __ emit_data64(0x0000003100000030, relocInfo::none);
4950     __ emit_data64(0x0000003300000032, relocInfo::none);
4951     __ emit_data64(0x0000003500000034, relocInfo::none);
4952     __ emit_data64(0x0000003700000036, relocInfo::none);
4953     __ emit_data64(0x0000003900000038, relocInfo::none);
4954     __ emit_data64(0x0000005f0000002d, relocInfo::none);
4955 
4956     return start;
4957   }
4958 
4959   address base64_bswap_mask_addr() {
4960     __ align(CodeEntryAlignment);
4961     StubCodeMark mark(this, "StubRoutines", "bswap_mask_base64");
4962     address start = __ pc();
4963     __ emit_data64(0x0504038002010080, relocInfo::none);
4964     __ emit_data64(0x0b0a098008070680, relocInfo::none);
4965     __ emit_data64(0x0908078006050480, relocInfo::none);
4966     __ emit_data64(0x0f0e0d800c0b0a80, relocInfo::none);
4967     __ emit_data64(0x0605048003020180, relocInfo::none);
4968     __ emit_data64(0x0c0b0a8009080780, relocInfo::none);
4969     __ emit_data64(0x0504038002010080, relocInfo::none);
4970     __ emit_data64(0x0b0a098008070680, relocInfo::none);
4971 
4972     return start;
4973   }
4974 
4975   address base64_right_shift_mask_addr() {
4976     __ align(CodeEntryAlignment);
4977     StubCodeMark mark(this, "StubRoutines", "right_shift_mask");
4978     address start = __ pc();
4979     __ emit_data64(0x0006000400020000, relocInfo::none);
4980     __ emit_data64(0x0006000400020000, relocInfo::none);
4981     __ emit_data64(0x0006000400020000, relocInfo::none);
4982     __ emit_data64(0x0006000400020000, relocInfo::none);
4983     __ emit_data64(0x0006000400020000, relocInfo::none);
4984     __ emit_data64(0x0006000400020000, relocInfo::none);
4985     __ emit_data64(0x0006000400020000, relocInfo::none);
4986     __ emit_data64(0x0006000400020000, relocInfo::none);
4987 
4988     return start;
4989   }
4990 
4991   address base64_left_shift_mask_addr() {
4992     __ align(CodeEntryAlignment);
4993     StubCodeMark mark(this, "StubRoutines", "left_shift_mask");
4994     address start = __ pc();
4995     __ emit_data64(0x0000000200040000, relocInfo::none);
4996     __ emit_data64(0x0000000200040000, relocInfo::none);
4997     __ emit_data64(0x0000000200040000, relocInfo::none);
4998     __ emit_data64(0x0000000200040000, relocInfo::none);
4999     __ emit_data64(0x0000000200040000, relocInfo::none);
5000     __ emit_data64(0x0000000200040000, relocInfo::none);
5001     __ emit_data64(0x0000000200040000, relocInfo::none);
5002     __ emit_data64(0x0000000200040000, relocInfo::none);
5003 
5004     return start;
5005   }
5006 
5007   address base64_and_mask_addr() {
5008     __ align(CodeEntryAlignment);
5009     StubCodeMark mark(this, "StubRoutines", "and_mask");
5010     address start = __ pc();
5011     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5012     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5013     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5014     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5015     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5016     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5017     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5018     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5019     return start;
5020   }
5021 
5022   address base64_gather_mask_addr() {
5023     __ align(CodeEntryAlignment);
5024     StubCodeMark mark(this, "StubRoutines", "gather_mask");
5025     address start = __ pc();
5026     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5027     return start;
5028   }
5029 
5030 // Code for generating Base64 encoding.
5031 // Intrinsic function prototype in Base64.java:
5032 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
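// The stub turns every 3 source bytes into 4 encoded characters: the vector paths
// below consume 72- and 24-byte chunks, and a scalar loop handles the remaining
// full 3-byte groups (partial groups and '=' padding are left to the Java caller).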
5033   address generate_base64_encodeBlock() {
5034     __ align(CodeEntryAlignment);
5035     StubCodeMark mark(this, "StubRoutines", "implEncode");
5036     address start = __ pc();
5037     __ enter();
5038 
5039     // Save callee-saved registers before using them
5040     __ push(r12);
5041     __ push(r13);
5042     __ push(r14);
5043     __ push(r15);
5044     __ push(rbx);
5045 
5046     // arguments
5047     const Register source = c_rarg0; // Source Array
5048     const Register start_offset = c_rarg1; // start offset
5049     const Register end_offset = c_rarg2; // end offset
5050     const Register dest = c_rarg3; // destination array
5051 
5052 #ifndef _WIN64
5053     const Register dp = c_rarg4;  // Position for writing to dest array
5054     const Register isURL = c_rarg5;// Base64 or URL character set
5055 #else
    const Address  dp_mem(rbp, 6 * wordSize);  // dp is on stack on Win64
5057     const Address isURL_mem(rbp, 7 * wordSize);
5058     const Register isURL = r10;      // pick the volatile windows register
5059     const Register dp = r12;
5060     __ movl(dp, dp_mem);
5061     __ movl(isURL, isURL_mem);
5062 #endif
5063 
5064     const Register length = r14;
5065     Label L_process80, L_process32, L_process3, L_exit, L_processdata;
5066 
5067     // calculate length from offsets
5068     __ movl(length, end_offset);
5069     __ subl(length, start_offset);
5070     __ cmpl(length, 0);
5071     __ jcc(Assembler::lessEqual, L_exit);
5072 
5073     // Save k1 value in rbx
5074     __ kmovql(rbx, k1);
5075     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr()));
    // check whether the Base64 charset (isURL = 0) or the Base64 URL charset (isURL = 1) needs to be loaded
5077     __ cmpl(isURL, 0);
5078     __ jcc(Assembler::equal, L_processdata);
5079     __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr()));
5080 
5081     // load masks required for encoding data
5082     __ BIND(L_processdata);
5083     __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr()));
    // Set all 64 bits of the k1 mask register (a register always compares equal to itself).
5085     __ evpcmpeqb(k1, xmm16, xmm16, Assembler::AVX_512bit);
5086     __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13);
5087     __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13);
5088     __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13);
5089     __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13);
5090 
5091     // Vector Base64 implementation, producing 96 bytes of encoded data
5092     __ BIND(L_process80);
5093     __ cmpl(length, 80);
5094     __ jcc(Assembler::below, L_process32);
5095     __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit);
5096     __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit);
5097     __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit);
5098 
    // permute the input data so that the source bytes are contiguous
5100     __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit);
5101     __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit);
5102     __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit);
5103 
    // shuffle the input to group 3 bytes of data and append 0 as the 4th byte;
    // we can deal with 12 bytes at a time in a 128-bit register
5106     __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit);
5107     __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit);
5108     __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit);
5109 
    // convert bytes to words; each 128-bit lane then holds 6 data bytes for processing
5111     __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit);
5112     __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit);
5113     __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit);
5114 
    // Extract bits in the pattern 6, 4+2, 2+4, 6 to convert three 8-bit bytes into four 6-bit values
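    // Equivalently, for source bytes b0, b1, b2 the four 6-bit indices are
    //   b0 >> 2, ((b0 & 0x3) << 4) | (b1 >> 4), ((b1 & 0xf) << 2) | (b2 >> 6), b2 & 0x3f;
    // the variable shift/and/or sequence below computes these within 16-bit lanes.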
5116     __ evpsrlvw(xmm0, xmm3, xmm13,  Assembler::AVX_512bit);
5117     __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit);
5118     __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit);
5119 
5120     __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit);
5121     __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit);
5122     __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit);
5123 
5124     __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit);
5125     __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
5126     __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
5127 
5128     __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5129     __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit);
5130     __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit);
5131 
5132     __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
5133     __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit);
5134     __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit);
5135 
5136     // Get the final 4*6 bits base64 encoding
5137     __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit);
5138     __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit);
5139     __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit);
5140 
5141     // Shift
5142     __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5143     __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit);
5144     __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit);
5145 
    // look up each 6-bit value in the Base64 character set to fetch its encoding;
    // words are converted to dwords because the gather instruction needs dword indices
5148     __ vextracti64x4(xmm6, xmm3, 0);
5149     __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit);
5150     __ vextracti64x4(xmm6, xmm3, 1);
5151     __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit);
5152 
5153     __ vextracti64x4(xmm6, xmm4, 0);
5154     __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit);
5155     __ vextracti64x4(xmm6, xmm4, 1);
5156     __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit);
5157 
5158     __ vextracti64x4(xmm4, xmm5, 0);
5159     __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit);
5160 
5161     __ vextracti64x4(xmm4, xmm5, 1);
5162     __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit);
5163 
5164     __ kmovql(k2, k1);
5165     __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit);
5166     __ kmovql(k2, k1);
5167     __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit);
5168     __ kmovql(k2, k1);
5169     __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit);
5170     __ kmovql(k2, k1);
5171     __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit);
5172     __ kmovql(k2, k1);
5173     __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
5174     __ kmovql(k2, k1);
5175     __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit);
5176 
    // Down-convert dwords to bytes. The final output is 16*6 = 96 bytes long
5178     __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit);
5179     __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit);
5180     __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit);
5181     __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit);
5182     __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit);
5183     __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit);
5184 
5185     __ addq(dest, 96);
5186     __ addq(source, 72);
5187     __ subq(length, 72);
5188     __ jmp(L_process80);
5189 
5190     // Vector Base64 implementation generating 32 bytes of encoded data
5191     __ BIND(L_process32);
5192     __ cmpl(length, 32);
5193     __ jcc(Assembler::below, L_process3);
5194     __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit);
5195     __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit);
5196     __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit);
5197     __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit);
5198     __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit);
5199     __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit);
5200 
5201     __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
5202     __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5203     __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
5204     __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit);
5205     __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
5206     __ vextracti64x4(xmm9, xmm1, 0);
5207     __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit);
5208     __ vextracti64x4(xmm9, xmm1, 1);
5209     __ vpmovzxwd(xmm5, xmm9,  Assembler::AVX_512bit);
5210     __ kmovql(k2, k1);
5211     __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
5212     __ kmovql(k2, k1);
5213     __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit);
5214     __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit);
5215     __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit);
5216     __ subq(length, 24);
5217     __ addq(dest, 32);
5218     __ addq(source, 24);
5219     __ jmp(L_process32);
5220 
5221     // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data
5222     /* This code corresponds to the scalar version of the following snippet in Base64.java
5223     ** int bits = (src[sp0++] & 0xff) << 16 |(src[sp0++] & 0xff) << 8 |(src[sp0++] & 0xff);
    ** dst[dp0++] = (byte)base64[(bits >>> 18) & 0x3f];
    ** dst[dp0++] = (byte)base64[(bits >>> 12) & 0x3f];
    ** dst[dp0++] = (byte)base64[(bits >>> 6) & 0x3f];
5227     ** dst[dp0++] = (byte)base64[bits & 0x3f];*/
5228     __ BIND(L_process3);
5229     __ cmpl(length, 3);
5230     __ jcc(Assembler::below, L_exit);
5231     // Read 1 byte at a time
5232     __ movzbl(rax, Address(source, start_offset));
5233     __ shll(rax, 0x10);
5234     __ movl(r15, rax);
5235     __ movzbl(rax, Address(source, start_offset, Address::times_1, 1));
5236     __ shll(rax, 0x8);
5237     __ movzwl(rax, rax);
5238     __ orl(r15, rax);
5239     __ movzbl(rax, Address(source, start_offset, Address::times_1, 2));
5240     __ orl(rax, r15);
5241     // Save 3 bytes read in r15
5242     __ movl(r15, rax);
5243     __ shrl(rax, 0x12);
5244     __ andl(rax, 0x3f);
5245     // rax contains the index, r11 contains base64 lookup table
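    // The charset table stores each character in a 32-bit slot (the same table is read by
    // the dword gathers above), hence the times_4 scaling here.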
5246     __ movb(rax, Address(r11, rax, Address::times_4));
5247     // Write the encoded byte to destination
5248     __ movb(Address(dest, dp, Address::times_1, 0), rax);
5249     __ movl(rax, r15);
5250     __ shrl(rax, 0xc);
5251     __ andl(rax, 0x3f);
5252     __ movb(rax, Address(r11, rax, Address::times_4));
5253     __ movb(Address(dest, dp, Address::times_1, 1), rax);
5254     __ movl(rax, r15);
5255     __ shrl(rax, 0x6);
5256     __ andl(rax, 0x3f);
5257     __ movb(rax, Address(r11, rax, Address::times_4));
5258     __ movb(Address(dest, dp, Address::times_1, 2), rax);
5259     __ movl(rax, r15);
5260     __ andl(rax, 0x3f);
5261     __ movb(rax, Address(r11, rax, Address::times_4));
5262     __ movb(Address(dest, dp, Address::times_1, 3), rax);
5263     __ subl(length, 3);
5264     __ addq(dest, 4);
5265     __ addq(source, 3);
5266     __ jmp(L_process3);
5267     __ BIND(L_exit);
5268     // restore k1 register value
5269     __ kmovql(k1, rbx);
5270     __ pop(rbx);
5271     __ pop(r15);
5272     __ pop(r14);
5273     __ pop(r13);
5274     __ pop(r12);
5275     __ leave();
5276     __ ret(0);
5277     return start;
5278   }
5279 
5280   /**
5281    *  Arguments:
5282    *
5283    * Inputs:
5284    *   c_rarg0   - int crc
5285    *   c_rarg1   - byte* buf
5286    *   c_rarg2   - int length
5287    *
   * Output:
5289    *       rax   - int crc result
5290    */
5291   address generate_updateBytesCRC32() {
5292     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
5293 
5294     __ align(CodeEntryAlignment);
5295     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
5296 
5297     address start = __ pc();
5298     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5299     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5300     // rscratch1: r10
5301     const Register crc   = c_rarg0;  // crc
5302     const Register buf   = c_rarg1;  // source java byte array address
5303     const Register len   = c_rarg2;  // length
5304     const Register table = c_rarg3;  // crc_table address (reuse register)
5305     const Register tmp   = r11;
5306     assert_different_registers(crc, buf, len, table, tmp, rax);
5307 
5308     BLOCK_COMMENT("Entry:");
5309     __ enter(); // required for proper stackwalking of RuntimeStub frame
5310 
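    // The CRC computation itself is implemented in MacroAssembler::kernel_crc32.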
5311     __ kernel_crc32(crc, buf, len, table, tmp);
5312 
5313     __ movl(rax, crc);
5314     __ vzeroupper();
5315     __ leave(); // required for proper stackwalking of RuntimeStub frame
5316     __ ret(0);
5317 
5318     return start;
5319   }
5320 
5321   /**
5322   *  Arguments:
5323   *
5324   * Inputs:
5325   *   c_rarg0   - int crc
5326   *   c_rarg1   - byte* buf
5327   *   c_rarg2   - long length
5328   *   c_rarg3   - table_start - optional (present only when doing a library_call,
5329   *              not used by x86 algorithm)
5330   *
  * Output:
5332   *       rax   - int crc result
5333   */
5334   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
5335       assert(UseCRC32CIntrinsics, "need SSE4_2");
5336       __ align(CodeEntryAlignment);
5337       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
5338       address start = __ pc();
5339       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
5340       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
5341       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
5342       const Register crc = c_rarg0;  // crc
5343       const Register buf = c_rarg1;  // source java byte array address
5344       const Register len = c_rarg2;  // length
5345       const Register a = rax;
5346       const Register j = r9;
5347       const Register k = r10;
5348       const Register l = r11;
5349 #ifdef _WIN64
5350       const Register y = rdi;
5351       const Register z = rsi;
5352 #else
5353       const Register y = rcx;
5354       const Register z = r8;
5355 #endif
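      // On Win64, rdi/rsi are callee-saved and otherwise unused, so they are pushed below
      // and used as scratch; on Linux/Solaris they carry the crc/buf arguments, so the free
      // argument registers rcx/r8 are used instead.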
5356       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
5357 
5358       BLOCK_COMMENT("Entry:");
5359       __ enter(); // required for proper stackwalking of RuntimeStub frame
5360 #ifdef _WIN64
5361       __ push(y);
5362       __ push(z);
5363 #endif
5364       __ crc32c_ipl_alg2_alt2(crc, buf, len,
5365                               a, j, k,
5366                               l, y, z,
5367                               c_farg0, c_farg1, c_farg2,
5368                               is_pclmulqdq_supported);
5369       __ movl(rax, crc);
5370 #ifdef _WIN64
5371       __ pop(z);
5372       __ pop(y);
5373 #endif
5374       __ vzeroupper();
5375       __ leave(); // required for proper stackwalking of RuntimeStub frame
5376       __ ret(0);
5377 
5378       return start;
5379   }
5380 
5381   /**
5382    *  Arguments:
5383    *
5384    *  Input:
5385    *    c_rarg0   - x address
5386    *    c_rarg1   - x length
5387    *    c_rarg2   - y address
5388    *    c_rarg3   - y length
5389    * not Win64
5390    *    c_rarg4   - z address
5391    *    c_rarg5   - z length
5392    * Win64
5393    *    rsp+40    - z address
5394    *    rsp+48    - z length
5395    */
5396   address generate_multiplyToLen() {
5397     __ align(CodeEntryAlignment);
5398     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
5399 
5400     address start = __ pc();
5401     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5402     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5403     const Register x     = rdi;
5404     const Register xlen  = rax;
5405     const Register y     = rsi;
5406     const Register ylen  = rcx;
5407     const Register z     = r8;
5408     const Register zlen  = r11;
5409 
5410     // Next registers will be saved on stack in multiply_to_len().
5411     const Register tmp1  = r12;
5412     const Register tmp2  = r13;
5413     const Register tmp3  = r14;
5414     const Register tmp4  = r15;
5415     const Register tmp5  = rbx;
5416 
5417     BLOCK_COMMENT("Entry:");
5418     __ enter(); // required for proper stackwalking of RuntimeStub frame
5419 
5420 #ifndef _WIN64
5421     __ movptr(zlen, r9); // Save r9 in r11 - zlen
5422 #endif
5423     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
5424                        // ylen => rcx, z => r8, zlen => r11
5425                        // r9 and r10 may be used to save non-volatile registers
5426 #ifdef _WIN64
5427     // last 2 arguments (#4, #5) are on stack on Win64
5428     __ movptr(z, Address(rsp, 6 * wordSize));
5429     __ movptr(zlen, Address(rsp, 7 * wordSize));
5430 #endif
5431 
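    // setup_arg_regs left xlen in rsi and y in rdx; move them into the registers
    // multiply_to_len expects (rax and rsi).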
5432     __ movptr(xlen, rsi);
5433     __ movptr(y,    rdx);
5434     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
5435 
5436     restore_arg_regs();
5437 
5438     __ leave(); // required for proper stackwalking of RuntimeStub frame
5439     __ ret(0);
5440 
5441     return start;
5442   }
5443 
5444   /**
5445   *  Arguments:
5446   *
5447   *  Input:
5448   *    c_rarg0   - obja     address
5449   *    c_rarg1   - objb     address
  *    c_rarg2   - length   length (in elements)
  *    c_rarg3   - scale    log2_array_indexscale
5452   *
5453   *  Output:
  *        rax   - int; >= 0: index of the first mismatch, < 0: bitwise complement of the tail (remaining elements)
5455   */
5456   address generate_vectorizedMismatch() {
5457     __ align(CodeEntryAlignment);
5458     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
5459     address start = __ pc();
5460 
5461     BLOCK_COMMENT("Entry:");
5462     __ enter();
5463 
5464 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5465     const Register scale = c_rarg0;  //rcx, will exchange with r9
5466     const Register objb = c_rarg1;   //rdx
5467     const Register length = c_rarg2; //r8
5468     const Register obja = c_rarg3;   //r9
    __ xchgq(obja, scale);  // now obja and scale contain the correct contents
5470 
5471     const Register tmp1 = r10;
5472     const Register tmp2 = r11;
5473 #endif
5474 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5475     const Register obja = c_rarg0;   //U:rdi
5476     const Register objb = c_rarg1;   //U:rsi
5477     const Register length = c_rarg2; //U:rdx
5478     const Register scale = c_rarg3;  //U:rcx
5479     const Register tmp1 = r8;
5480     const Register tmp2 = r9;
5481 #endif
5482     const Register result = rax; //return value
5483     const XMMRegister vec0 = xmm0;
5484     const XMMRegister vec1 = xmm1;
5485     const XMMRegister vec2 = xmm2;
5486 
5487     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
5488 
5489     __ vzeroupper();
5490     __ leave();
5491     __ ret(0);
5492 
5493     return start;
5494   }
5495 
5496 /**
5497    *  Arguments:
5498    *
5499   //  Input:
5500   //    c_rarg0   - x address
5501   //    c_rarg1   - x length
5502   //    c_rarg2   - z address
5503   //    c_rarg3   - z lenth
5504    *
5505    */
5506   address generate_squareToLen() {
5507 
5508     __ align(CodeEntryAlignment);
5509     StubCodeMark mark(this, "StubRoutines", "squareToLen");
5510 
5511     address start = __ pc();
5512     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5513     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
5514     const Register x      = rdi;
5515     const Register len    = rsi;
5516     const Register z      = r8;
5517     const Register zlen   = rcx;
5518 
    const Register tmp1      = r12;
    const Register tmp2      = r13;
    const Register tmp3      = r14;
    const Register tmp4      = r15;
    const Register tmp5      = rbx;
5524 
5525     BLOCK_COMMENT("Entry:");
5526     __ enter(); // required for proper stackwalking of RuntimeStub frame
5527 
    setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
                       // zlen => rcx
                       // r9 and r10 may be used to save non-volatile registers
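    // setup_arg_regs left z in rdx; square_to_len expects it in r8.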
5531     __ movptr(r8, rdx);
5532     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5533 
5534     restore_arg_regs();
5535 
5536     __ leave(); // required for proper stackwalking of RuntimeStub frame
5537     __ ret(0);
5538 
5539     return start;
5540   }
5541 
5542    /**
5543    *  Arguments:
5544    *
5545    *  Input:
5546    *    c_rarg0   - out address
5547    *    c_rarg1   - in address
5548    *    c_rarg2   - offset
5549    *    c_rarg3   - len
5550    * not Win64
5551    *    c_rarg4   - k
5552    * Win64
5553    *    rsp+40    - k
5554    */
5555   address generate_mulAdd() {
5556     __ align(CodeEntryAlignment);
5557     StubCodeMark mark(this, "StubRoutines", "mulAdd");
5558 
5559     address start = __ pc();
5560     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5561     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5562     const Register out     = rdi;
5563     const Register in      = rsi;
5564     const Register offset  = r11;
5565     const Register len     = rcx;
5566     const Register k       = r8;
5567 
5568     // Next registers will be saved on stack in mul_add().
5569     const Register tmp1  = r12;
5570     const Register tmp2  = r13;
5571     const Register tmp3  = r14;
5572     const Register tmp4  = r15;
5573     const Register tmp5  = rbx;
5574 
5575     BLOCK_COMMENT("Entry:");
5576     __ enter(); // required for proper stackwalking of RuntimeStub frame
5577 
5578     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
5579                        // len => rcx, k => r8
5580                        // r9 and r10 may be used to save non-volatile registers
5581 #ifdef _WIN64
5582     // last argument is on stack on Win64
5583     __ movl(k, Address(rsp, 6 * wordSize));
5584 #endif
5585     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
5586     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5587 
5588     restore_arg_regs();
5589 
5590     __ leave(); // required for proper stackwalking of RuntimeStub frame
5591     __ ret(0);
5592 
5593     return start;
5594   }
5595 
5596   address generate_libmExp() {
5597     StubCodeMark mark(this, "StubRoutines", "libmExp");
5598 
5599     address start = __ pc();
5600 
5601     const XMMRegister x0  = xmm0;
5602     const XMMRegister x1  = xmm1;
5603     const XMMRegister x2  = xmm2;
5604     const XMMRegister x3  = xmm3;
5605 
5606     const XMMRegister x4  = xmm4;
5607     const XMMRegister x5  = xmm5;
5608     const XMMRegister x6  = xmm6;
5609     const XMMRegister x7  = xmm7;
5610 
5611     const Register tmp   = r11;
5612 
5613     BLOCK_COMMENT("Entry:");
5614     __ enter(); // required for proper stackwalking of RuntimeStub frame
5615 
5616     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5617 
5618     __ leave(); // required for proper stackwalking of RuntimeStub frame
5619     __ ret(0);
5620 
5621     return start;
5622 
5623   }
5624 
5625   address generate_libmLog() {
5626     StubCodeMark mark(this, "StubRoutines", "libmLog");
5627 
5628     address start = __ pc();
5629 
5630     const XMMRegister x0 = xmm0;
5631     const XMMRegister x1 = xmm1;
5632     const XMMRegister x2 = xmm2;
5633     const XMMRegister x3 = xmm3;
5634 
5635     const XMMRegister x4 = xmm4;
5636     const XMMRegister x5 = xmm5;
5637     const XMMRegister x6 = xmm6;
5638     const XMMRegister x7 = xmm7;
5639 
5640     const Register tmp1 = r11;
5641     const Register tmp2 = r8;
5642 
5643     BLOCK_COMMENT("Entry:");
5644     __ enter(); // required for proper stackwalking of RuntimeStub frame
5645 
5646     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
5647 
5648     __ leave(); // required for proper stackwalking of RuntimeStub frame
5649     __ ret(0);
5650 
5651     return start;
5652 
5653   }
5654 
5655   address generate_libmLog10() {
5656     StubCodeMark mark(this, "StubRoutines", "libmLog10");
5657 
5658     address start = __ pc();
5659 
5660     const XMMRegister x0 = xmm0;
5661     const XMMRegister x1 = xmm1;
5662     const XMMRegister x2 = xmm2;
5663     const XMMRegister x3 = xmm3;
5664 
5665     const XMMRegister x4 = xmm4;
5666     const XMMRegister x5 = xmm5;
5667     const XMMRegister x6 = xmm6;
5668     const XMMRegister x7 = xmm7;
5669 
5670     const Register tmp = r11;
5671 
5672     BLOCK_COMMENT("Entry:");
5673     __ enter(); // required for proper stackwalking of RuntimeStub frame
5674 
5675     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5676 
5677     __ leave(); // required for proper stackwalking of RuntimeStub frame
5678     __ ret(0);
5679 
5680     return start;
5681 
5682   }
5683 
5684   address generate_libmPow() {
5685     StubCodeMark mark(this, "StubRoutines", "libmPow");
5686 
5687     address start = __ pc();
5688 
5689     const XMMRegister x0 = xmm0;
5690     const XMMRegister x1 = xmm1;
5691     const XMMRegister x2 = xmm2;
5692     const XMMRegister x3 = xmm3;
5693 
5694     const XMMRegister x4 = xmm4;
5695     const XMMRegister x5 = xmm5;
5696     const XMMRegister x6 = xmm6;
5697     const XMMRegister x7 = xmm7;
5698 
5699     const Register tmp1 = r8;
5700     const Register tmp2 = r9;
5701     const Register tmp3 = r10;
5702     const Register tmp4 = r11;
5703 
5704     BLOCK_COMMENT("Entry:");
5705     __ enter(); // required for proper stackwalking of RuntimeStub frame
5706 
5707     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5708 
5709     __ leave(); // required for proper stackwalking of RuntimeStub frame
5710     __ ret(0);
5711 
5712     return start;
5713 
5714   }
5715 
5716   address generate_libmSin() {
5717     StubCodeMark mark(this, "StubRoutines", "libmSin");
5718 
5719     address start = __ pc();
5720 
5721     const XMMRegister x0 = xmm0;
5722     const XMMRegister x1 = xmm1;
5723     const XMMRegister x2 = xmm2;
5724     const XMMRegister x3 = xmm3;
5725 
5726     const XMMRegister x4 = xmm4;
5727     const XMMRegister x5 = xmm5;
5728     const XMMRegister x6 = xmm6;
5729     const XMMRegister x7 = xmm7;
5730 
5731     const Register tmp1 = r8;
5732     const Register tmp2 = r9;
5733     const Register tmp3 = r10;
5734     const Register tmp4 = r11;
5735 
5736     BLOCK_COMMENT("Entry:");
5737     __ enter(); // required for proper stackwalking of RuntimeStub frame
5738 
5739 #ifdef _WIN64
5740     __ push(rsi);
5741     __ push(rdi);
5742 #endif
5743     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5744 
5745 #ifdef _WIN64
5746     __ pop(rdi);
5747     __ pop(rsi);
5748 #endif
5749 
5750     __ leave(); // required for proper stackwalking of RuntimeStub frame
5751     __ ret(0);
5752 
5753     return start;
5754 
5755   }
5756 
5757   address generate_libmCos() {
5758     StubCodeMark mark(this, "StubRoutines", "libmCos");
5759 
5760     address start = __ pc();
5761 
5762     const XMMRegister x0 = xmm0;
5763     const XMMRegister x1 = xmm1;
5764     const XMMRegister x2 = xmm2;
5765     const XMMRegister x3 = xmm3;
5766 
5767     const XMMRegister x4 = xmm4;
5768     const XMMRegister x5 = xmm5;
5769     const XMMRegister x6 = xmm6;
5770     const XMMRegister x7 = xmm7;
5771 
5772     const Register tmp1 = r8;
5773     const Register tmp2 = r9;
5774     const Register tmp3 = r10;
5775     const Register tmp4 = r11;
5776 
5777     BLOCK_COMMENT("Entry:");
5778     __ enter(); // required for proper stackwalking of RuntimeStub frame
5779 
5780 #ifdef _WIN64
5781     __ push(rsi);
5782     __ push(rdi);
5783 #endif
5784     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5785 
5786 #ifdef _WIN64
5787     __ pop(rdi);
5788     __ pop(rsi);
5789 #endif
5790 
5791     __ leave(); // required for proper stackwalking of RuntimeStub frame
5792     __ ret(0);
5793 
5794     return start;
5795 
5796   }
5797 
5798   address generate_libmTan() {
5799     StubCodeMark mark(this, "StubRoutines", "libmTan");
5800 
5801     address start = __ pc();
5802 
5803     const XMMRegister x0 = xmm0;
5804     const XMMRegister x1 = xmm1;
5805     const XMMRegister x2 = xmm2;
5806     const XMMRegister x3 = xmm3;
5807 
5808     const XMMRegister x4 = xmm4;
5809     const XMMRegister x5 = xmm5;
5810     const XMMRegister x6 = xmm6;
5811     const XMMRegister x7 = xmm7;
5812 
5813     const Register tmp1 = r8;
5814     const Register tmp2 = r9;
5815     const Register tmp3 = r10;
5816     const Register tmp4 = r11;
5817 
5818     BLOCK_COMMENT("Entry:");
5819     __ enter(); // required for proper stackwalking of RuntimeStub frame
5820 
5821 #ifdef _WIN64
5822     __ push(rsi);
5823     __ push(rdi);
5824 #endif
5825     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5826 
5827 #ifdef _WIN64
5828     __ pop(rdi);
5829     __ pop(rsi);
5830 #endif
5831 
5832     __ leave(); // required for proper stackwalking of RuntimeStub frame
5833     __ ret(0);
5834 
5835     return start;
5836 
5837   }
5838 
5839 #undef __
5840 #define __ masm->
5841 
5842   // Continuation point for throwing of implicit exceptions that are
5843   // not handled in the current activation. Fabricates an exception
5844   // oop and initiates normal exception dispatching in this
5845   // frame. Since we need to preserve callee-saved values (currently
5846   // only for C2, but done for C1 as well) we need a callee-saved oop
5847   // map and therefore have to make these stubs into RuntimeStubs
5848   // rather than BufferBlobs.  If the compiler needs all registers to
5849   // be preserved between the fault point and the exception handler
5850   // then it must assume responsibility for that in
5851   // AbstractCompiler::continuation_for_implicit_null_exception or
5852   // continuation_for_implicit_division_by_zero_exception. All other
5853   // implicit exceptions (e.g., NullPointerException or
5854   // AbstractMethodError on entry) are either at call sites or
5855   // otherwise assume that stack unwinding will be initiated, so
5856   // caller saved registers were assumed volatile in the compiler.
5857   address generate_throw_exception(const char* name,
5858                                    address runtime_entry,
5859                                    Register arg1 = noreg,
5860                                    Register arg2 = noreg) {
5861     // Information about frame layout at time of blocking runtime call.
5862     // Note that we only have to preserve callee-saved registers since
5863     // the compilers are responsible for supplying a continuation point
5864     // if they expect all registers to be preserved.
5865     enum layout {
5866       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5867       rbp_off2,
5868       return_off,
5869       return_off2,
5870       framesize // inclusive of return address
5871     };
5872 
5873     int insts_size = 512;
5874     int locs_size  = 64;
5875 
5876     CodeBuffer code(name, insts_size, locs_size);
5877     OopMapSet* oop_maps  = new OopMapSet();
5878     MacroAssembler* masm = new MacroAssembler(&code);
5879 
5880     address start = __ pc();
5881 
5882     // This is an inlined and slightly modified version of call_VM
5883     // which has the ability to fetch the return PC out of
5884     // thread-local storage and also sets up last_Java_sp slightly
5885     // differently than the real call_VM
5886 
5887     __ enter(); // required for proper stackwalking of RuntimeStub frame
5888 
5889     assert(is_even(framesize/2), "sp not 16-byte aligned");
5890 
5891     // return address and rbp are already in place
5892     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
5893 
5894     int frame_complete = __ pc() - start;
5895 
5896     // Set up last_Java_sp and last_Java_fp
5897     address the_pc = __ pc();
5898     __ set_last_Java_frame(rsp, rbp, the_pc);
5899     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
5900 
5901     // Call runtime
5902     if (arg1 != noreg) {
5903       assert(arg2 != c_rarg1, "clobbered");
5904       __ movptr(c_rarg1, arg1);
5905     }
5906     if (arg2 != noreg) {
5907       __ movptr(c_rarg2, arg2);
5908     }
5909     __ movptr(c_rarg0, r15_thread);
5910     BLOCK_COMMENT("call runtime_entry");
5911     __ call(RuntimeAddress(runtime_entry));
5912 
5913     // Generate oop map
5914     OopMap* map = new OopMap(framesize, 0);
5915 
5916     oop_maps->add_gc_map(the_pc - start, map);
5917 
5918     __ reset_last_Java_frame(true);
5919 
5920     __ leave(); // required for proper stackwalking of RuntimeStub frame
5921 
5922     // check for pending exceptions
5923 #ifdef ASSERT
5924     Label L;
5925     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
5926             (int32_t) NULL_WORD);
5927     __ jcc(Assembler::notEqual, L);
5928     __ should_not_reach_here();
5929     __ bind(L);
5930 #endif // ASSERT
5931     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5932 
5933 
5934     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5935     RuntimeStub* stub =
5936       RuntimeStub::new_runtime_stub(name,
5937                                     &code,
5938                                     frame_complete,
5939                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5940                                     oop_maps, false);
5941     return stub->entry_point();
5942   }
5943 
5944   void create_control_words() {
5945     // Round to nearest, 53-bit mode, exceptions masked
5946     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
    // Round to zero, 53-bit mode, exceptions masked
5948     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
5949     // Round to nearest, 24-bit mode, exceptions masked
5950     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
5951     // Round to nearest, 64-bit mode, exceptions masked
5952     StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
    // Round to nearest, exceptions masked (MXCSR default)
5954     StubRoutines::_mxcsr_std           = 0x1F80;
5955     // Note: the following two constants are 80-bit values
5956     //       layout is critical for correct loading by FPU.
5957     // Bias for strict fp multiply/divide
5958     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
5959     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
5960     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
5961     // Un-Bias for strict fp multiply/divide
5962     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
5963     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
5964     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
5965   }
5966 
5967   // Initialization
5968   void generate_initial() {
5969     // Generates all stubs and initializes the entry points
5970 
    // These platform-specific settings are needed by generate_call_stub()
5972     create_control_words();
5973 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.
5979 
5980     StubRoutines::_forward_exception_entry = generate_forward_exception();
5981 
5982     StubRoutines::_call_stub_entry =
5983       generate_call_stub(StubRoutines::_call_stub_return_address);
5984 
5985     // is referenced by megamorphic call
5986     StubRoutines::_catch_exception_entry = generate_catch_exception();
5987 
5988     // atomic calls
5989     StubRoutines::_atomic_xchg_entry          = generate_atomic_xchg();
5990     StubRoutines::_atomic_xchg_long_entry     = generate_atomic_xchg_long();
5991     StubRoutines::_atomic_cmpxchg_entry       = generate_atomic_cmpxchg();
5992     StubRoutines::_atomic_cmpxchg_byte_entry  = generate_atomic_cmpxchg_byte();
5993     StubRoutines::_atomic_cmpxchg_long_entry  = generate_atomic_cmpxchg_long();
5994     StubRoutines::_atomic_add_entry           = generate_atomic_add();
5995     StubRoutines::_atomic_add_long_entry      = generate_atomic_add_long();
5996     StubRoutines::_fence_entry                = generate_orderaccess_fence();
5997 
5998     // platform dependent
5999     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
6000     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
6001 
6002     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
6003 
6004     // Build this early so it's available for the interpreter.
6005     StubRoutines::_throw_StackOverflowError_entry =
6006       generate_throw_exception("StackOverflowError throw_exception",
6007                                CAST_FROM_FN_PTR(address,
6008                                                 SharedRuntime::
6009                                                 throw_StackOverflowError));
6010     StubRoutines::_throw_delayed_StackOverflowError_entry =
6011       generate_throw_exception("delayed StackOverflowError throw_exception",
6012                                CAST_FROM_FN_PTR(address,
6013                                                 SharedRuntime::
6014                                                 throw_delayed_StackOverflowError));
6015     if (UseCRC32Intrinsics) {
      // set table address before stub generation which uses it
6017       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
6018       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6019     }
6020 
6021     if (UseCRC32CIntrinsics) {
6022       bool supports_clmul = VM_Version::supports_clmul();
6023       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
6024       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
6025       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
6026     }
6027     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
6028       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
6029           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
6030           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
6031         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
6032         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
6033         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
6034         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
6035         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
6036         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
6037         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
6038         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
6039         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
6040         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
6041         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
6042         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
6043         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
6044         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
6045       }
6046       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
6047         StubRoutines::_dexp = generate_libmExp();
6048       }
6049       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
6050         StubRoutines::_dlog = generate_libmLog();
6051       }
6052       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
6053         StubRoutines::_dlog10 = generate_libmLog10();
6054       }
6055       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
6056         StubRoutines::_dpow = generate_libmPow();
6057       }
6058       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
6059         StubRoutines::_dsin = generate_libmSin();
6060       }
6061       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
6062         StubRoutines::_dcos = generate_libmCos();
6063       }
6064       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
6065         StubRoutines::_dtan = generate_libmTan();
6066       }
6067     }
6068   }
6069 
6070   void generate_all() {
6071     // Generates all stubs and initializes the entry points
6072 
6073     // These entry points require SharedInfo::stack0 to be set up in
6074     // non-core builds and need to be relocatable, so they each
6075     // fabricate a RuntimeStub internally.
6076     StubRoutines::_throw_AbstractMethodError_entry =
6077       generate_throw_exception("AbstractMethodError throw_exception",
6078                                CAST_FROM_FN_PTR(address,
6079                                                 SharedRuntime::
6080                                                 throw_AbstractMethodError));
6081 
6082     StubRoutines::_throw_IncompatibleClassChangeError_entry =
6083       generate_throw_exception("IncompatibleClassChangeError throw_exception",
6084                                CAST_FROM_FN_PTR(address,
6085                                                 SharedRuntime::
6086                                                 throw_IncompatibleClassChangeError));
6087 
6088     StubRoutines::_throw_NullPointerException_at_call_entry =
6089       generate_throw_exception("NullPointerException at call throw_exception",
6090                                CAST_FROM_FN_PTR(address,
6091                                                 SharedRuntime::
6092                                                 throw_NullPointerException_at_call));
6093 
6094     // entry points that are platform specific
6095     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
6096     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
6097     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
6098     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
6099 
6100     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
6101     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
6102     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
6103     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
6104     StubRoutines::x86::_vector_float_sign_mask = generate_vector_fp_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
6105     StubRoutines::x86::_vector_float_sign_flip = generate_vector_fp_mask("vector_float_sign_flip", 0x8000000080000000);
6106     StubRoutines::x86::_vector_double_sign_mask = generate_vector_fp_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
6107     StubRoutines::x86::_vector_double_sign_flip = generate_vector_fp_mask("vector_double_sign_flip", 0x8000000000000000);
6108     StubRoutines::x86::_vector_all_bits_set = generate_vector_fp_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
6109     StubRoutines::x86::_vector_byte_bitset = generate_vector_fp_mask("vector_byte_bitset", 0x0101010101010101);
6110     StubRoutines::x86::_vector_long_perm_mask = generate_vector_custom_i32("vector_long_perm_mask", Assembler::AVX_512bit,
6111                                                                            0, 2, 4, 6, 8, 10, 12, 14);
6112     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_fp_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
6113     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_fp_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
6114     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_fp_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
6115     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
6116                                                                         0xFFFFFFFF, 0, 0, 0);
6117     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
6118                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
6119 
6120     // support for verify_oop (must happen after universe_init)
6121     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
6122 
6123     // arraycopy stubs used by compilers
6124     generate_arraycopy_stubs();
6125 
6126     // don't bother generating these AES intrinsic stubs unless global flag is set
6127     if (UseAESIntrinsics) {
6128       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
6129       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6130       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6131       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
6133         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
6134       } else {
6135         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
6136       }
6137     }
    if (UseAESCTRIntrinsics) {
6139       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
6140       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
6141     }
6142 
6143     if (UseSHA1Intrinsics) {
6144       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
6145       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
6146       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
6147       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
6148     }
6149     if (UseSHA256Intrinsics) {
6150       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
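      // _k256_W doubles each 16-byte group of _k256 into both halves of a 32-byte row,
      // the layout expected by the AVX2 SHA-256 implementation.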
6151       char* dst = (char*)StubRoutines::x86::_k256_W;
6152       char* src = (char*)StubRoutines::x86::_k256;
6153       for (int ii = 0; ii < 16; ++ii) {
6154         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
6155         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
6156       }
6157       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
6158       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
6159       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
6160       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
6161     }
6162     if (UseSHA512Intrinsics) {
6163       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
6164       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
6165       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
6166       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
6167     }
6168 
6169     // Generate GHASH intrinsics code
6170     if (UseGHASHIntrinsics) {
6171       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
6172       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
6173       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
6174     }
6175 
6176     if (UseBASE64Intrinsics) {
6177       StubRoutines::x86::_and_mask = base64_and_mask_addr();
6178       StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr();
6179       StubRoutines::x86::_base64_charset = base64_charset_addr();
6180       StubRoutines::x86::_url_charset = base64url_charset_addr();
6181       StubRoutines::x86::_gather_mask = base64_gather_mask_addr();
6182       StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr();
6183       StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr();
6184       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
6185     }
6186 
6187     // Safefetch stubs.
6188     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
6189                                                        &StubRoutines::_safefetch32_fault_pc,
6190                                                        &StubRoutines::_safefetch32_continuation_pc);
6191     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
6192                                                        &StubRoutines::_safefetchN_fault_pc,
6193                                                        &StubRoutines::_safefetchN_continuation_pc);
6194 #ifdef COMPILER2
6195     if (UseMultiplyToLenIntrinsic) {
6196       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6197     }
6198     if (UseSquareToLenIntrinsic) {
6199       StubRoutines::_squareToLen = generate_squareToLen();
6200     }
6201     if (UseMulAddIntrinsic) {
6202       StubRoutines::_mulAdd = generate_mulAdd();
6203     }
6204 #ifndef _WINDOWS
6205     if (UseMontgomeryMultiplyIntrinsic) {
6206       StubRoutines::_montgomeryMultiply
6207         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
6208     }
6209     if (UseMontgomerySquareIntrinsic) {
6210       StubRoutines::_montgomerySquare
6211         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
6212     }
#endif // !_WINDOWS
6214 #endif // COMPILER2
6215 
6216     if (UseVectorizedMismatchIntrinsic) {
6217       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
6218     }
6219 
6220 #ifdef __VECTOR_API_MATH_INTRINSICS_COMMON
6221     if (UseVectorApiIntrinsics) {
6222       if (UseAVX >= 1) {
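        // SVML routine naming (assumed): the digits before the suffix give the vector element
        // count; _e9 entry points are used at AVX level 1, _l9 at AVX2, and _z0 when UseAVX > 2
        // (AVX-512), matching the selection logic below.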
6223           #if defined(__VECTOR_API_MATH_INTRINSICS_LINUX)
6224           if (UseAVX > 2) {
6225               StubRoutines::_vector_float512_exp = CAST_FROM_FN_PTR(address, __svml_expf16_ha_z0);
6226               StubRoutines::_vector_double512_exp = CAST_FROM_FN_PTR(address, __svml_exp8_ha_z0); 
6227               StubRoutines::_vector_float512_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f16_ha_z0);
6228               StubRoutines::_vector_double512_expm1 = CAST_FROM_FN_PTR(address, __svml_expm18_ha_z0);
6229               StubRoutines::_vector_float512_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf16_ha_z0);
6230               StubRoutines::_vector_double512_log1p = CAST_FROM_FN_PTR(address, __svml_log1p8_ha_z0);
6231               StubRoutines::_vector_float512_log = CAST_FROM_FN_PTR(address, __svml_logf16_ha_z0);
6232               StubRoutines::_vector_double512_log = CAST_FROM_FN_PTR(address, __svml_log8_ha_z0);
6233               StubRoutines::_vector_float512_log10 = CAST_FROM_FN_PTR(address, __svml_log10f16_ha_z0);
6234               StubRoutines::_vector_double512_log10 = CAST_FROM_FN_PTR(address, __svml_log108_ha_z0);
6235               StubRoutines::_vector_float512_sin = CAST_FROM_FN_PTR(address, __svml_sinf16_ha_z0);      
6236               StubRoutines::_vector_double512_sin = CAST_FROM_FN_PTR(address, __svml_sin8_ha_z0);
6237               StubRoutines::_vector_float512_cos = CAST_FROM_FN_PTR(address, __svml_cosf16_ha_z0);      
6238               StubRoutines::_vector_double512_cos = CAST_FROM_FN_PTR(address, __svml_cos8_ha_z0);
6239               StubRoutines::_vector_float512_tan = CAST_FROM_FN_PTR(address, __svml_tanf16_ha_z0);
6240               StubRoutines::_vector_double512_tan = CAST_FROM_FN_PTR(address, __svml_tan8_ha_z0);      
6241               StubRoutines::_vector_float512_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf16_ha_z0);
6242               StubRoutines::_vector_double512_sinh = CAST_FROM_FN_PTR(address, __svml_sinh8_ha_z0);
6243               StubRoutines::_vector_float512_cosh = CAST_FROM_FN_PTR(address, __svml_coshf16_ha_z0);
6244               StubRoutines::_vector_double512_cosh = CAST_FROM_FN_PTR(address, __svml_cosh8_ha_z0);
6245               StubRoutines::_vector_float512_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf16_ha_z0);
6246               StubRoutines::_vector_double512_tanh = CAST_FROM_FN_PTR(address, __svml_tanh8_ha_z0);
6247               StubRoutines::_vector_float512_acos = CAST_FROM_FN_PTR(address, __svml_acosf16_ha_z0);
6248               StubRoutines::_vector_double512_acos = CAST_FROM_FN_PTR(address, __svml_acos8_ha_z0);
6249               StubRoutines::_vector_float512_asin = CAST_FROM_FN_PTR(address, __svml_asinf16_ha_z0);
6250               StubRoutines::_vector_double512_asin = CAST_FROM_FN_PTR(address, __svml_asin8_ha_z0);
6251               StubRoutines::_vector_float512_atan = CAST_FROM_FN_PTR(address, __svml_atanf16_ha_z0);
6252               StubRoutines::_vector_double512_atan = CAST_FROM_FN_PTR(address, __svml_atan8_ha_z0);
6253               StubRoutines::_vector_float512_pow = CAST_FROM_FN_PTR(address, __svml_powf16_ha_z0);
6254               StubRoutines::_vector_double512_pow = CAST_FROM_FN_PTR(address, __svml_pow8_ha_z0);
6255               StubRoutines::_vector_float512_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf16_ha_z0);
6256               StubRoutines::_vector_double512_hypot = CAST_FROM_FN_PTR(address, __svml_hypot8_ha_z0);
6257               StubRoutines::_vector_float512_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf16_ha_z0);
6258               StubRoutines::_vector_double512_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt8_ha_z0);
6259               StubRoutines::_vector_float512_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f16_ha_z0);
6260               StubRoutines::_vector_double512_atan2 = CAST_FROM_FN_PTR(address, __svml_atan28_ha_z0);
6261           }
6262           #endif
        if (UseAVX == 1) {
6264           StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_e9);  
6265           StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_e9);
6266           StubRoutines::_vector_float256_exp = CAST_FROM_FN_PTR(address, __svml_expf8_ha_e9); 
6267           StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_e9);  
6268           StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_e9); 
6269           StubRoutines::_vector_double256_exp = CAST_FROM_FN_PTR(address, __svml_exp4_ha_e9);
6270           StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_e9);
6271           StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_e9);
6272           StubRoutines::_vector_float256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f8_ha_e9);
6273           StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_e9);
6274           StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_e9);
6275           StubRoutines::_vector_double256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm14_ha_e9);
6276           StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_e9);
6277           StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_e9);
6278           StubRoutines::_vector_float256_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf8_ha_e9);
6279           StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_e9);
6280           StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_e9);
6281           StubRoutines::_vector_double256_log1p = CAST_FROM_FN_PTR(address, __svml_log1p4_ha_e9);
6282           StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_e9);
6283           StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_e9);
6284           StubRoutines::_vector_float256_log = CAST_FROM_FN_PTR(address, __svml_logf8_ha_e9);
6285           StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_e9);
6286           StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_e9);
6287           StubRoutines::_vector_double256_log = CAST_FROM_FN_PTR(address, __svml_log4_ha_e9);
6288           StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_e9);
6289           StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_e9);
6290           StubRoutines::_vector_float256_log10 = CAST_FROM_FN_PTR(address, __svml_log10f8_ha_e9);
6291           StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_e9);
6292           StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_e9);
6293           StubRoutines::_vector_double256_log10 = CAST_FROM_FN_PTR(address, __svml_log104_ha_e9);
6294           StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_e9);
6295           StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_e9);
6296           StubRoutines::_vector_float256_sin = CAST_FROM_FN_PTR(address, __svml_sinf8_ha_e9);
6297           StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_e9);
6298           StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_e9);
6299           StubRoutines::_vector_double256_sin = CAST_FROM_FN_PTR(address, __svml_sin4_ha_e9);
6300           StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_e9);
6301           StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_e9);
6302           StubRoutines::_vector_float256_cos = CAST_FROM_FN_PTR(address, __svml_cosf8_ha_e9);
6303           StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_e9);
6304           StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_e9);
6305           StubRoutines::_vector_double256_cos = CAST_FROM_FN_PTR(address, __svml_cos4_ha_e9);
6306           StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_e9);
6307           StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_e9);
6308           StubRoutines::_vector_float256_tan = CAST_FROM_FN_PTR(address, __svml_tanf8_ha_e9);
6309           StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_e9);
6310           StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_e9);
6311           StubRoutines::_vector_double256_tan = CAST_FROM_FN_PTR(address, __svml_tan4_ha_e9);
6312           StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_e9);
6313           StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_e9);
6314           StubRoutines::_vector_float256_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf8_ha_e9);
6315           StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_e9);
6316           StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_e9);
6317           StubRoutines::_vector_double256_sinh = CAST_FROM_FN_PTR(address, __svml_sinh4_ha_e9);
6318           StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_e9);
6319           StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_e9);
6320           StubRoutines::_vector_float256_cosh = CAST_FROM_FN_PTR(address, __svml_coshf8_ha_e9);
6321           StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_e9);
6322           StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_e9);
6323           StubRoutines::_vector_double256_cosh = CAST_FROM_FN_PTR(address, __svml_cosh4_ha_e9);
6324           StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_e9);
6325           StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_e9);
6326           StubRoutines::_vector_float256_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf8_ha_e9);
6327           StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_e9);
6328           StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_e9);
6329           StubRoutines::_vector_double256_tanh = CAST_FROM_FN_PTR(address, __svml_tanh4_ha_e9);
6330           StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_e9);
6331           StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_e9);
6332           StubRoutines::_vector_float256_acos = CAST_FROM_FN_PTR(address, __svml_acosf8_ha_e9);
6333           StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_e9);
6334           StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_e9);
6335           StubRoutines::_vector_double256_acos = CAST_FROM_FN_PTR(address, __svml_acos4_ha_e9);
6336           StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_e9);
6337           StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_e9);
6338           StubRoutines::_vector_float256_asin = CAST_FROM_FN_PTR(address, __svml_asinf8_ha_e9);
6339           StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_e9);
6340           StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_e9);
6341           StubRoutines::_vector_double256_asin = CAST_FROM_FN_PTR(address, __svml_asin4_ha_e9);
6342           StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_e9);
6343           StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_e9);
6344           StubRoutines::_vector_float256_atan = CAST_FROM_FN_PTR(address, __svml_atanf8_ha_e9);
6345           StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_e9);
6346           StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_e9);
6347           StubRoutines::_vector_double256_atan = CAST_FROM_FN_PTR(address, __svml_atan4_ha_e9);
6348           StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
6349           StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
6350           StubRoutines::_vector_float256_pow = CAST_FROM_FN_PTR(address, __svml_powf8_ha_e9);
6351           StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_e9);
6352           StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_e9);
6353           StubRoutines::_vector_double256_pow = CAST_FROM_FN_PTR(address, __svml_pow4_ha_e9);
6354           StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
6355           StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
6356           StubRoutines::_vector_float256_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_e9);
6357           StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_e9);
6358           StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_e9);
6359           StubRoutines::_vector_double256_hypot = CAST_FROM_FN_PTR(address, __svml_hypot4_ha_e9);
6360           StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_e9);
6361           StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_e9);
6362           StubRoutines::_vector_float256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf8_ha_e9);
6363           StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_e9);
6364           StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_e9);
6365           StubRoutines::_vector_double256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt4_ha_e9);
6366           StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_e9);
6367           StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_e9);
6368           StubRoutines::_vector_float256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f8_ha_e9);
6369           StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_e9);
6370           StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_e9);
6371           StubRoutines::_vector_double256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan24_ha_e9);  
6372         }  
6373         else {
          StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_l9);
          StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_l9);
          StubRoutines::_vector_float256_exp = CAST_FROM_FN_PTR(address, __svml_expf8_ha_l9);
          StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_l9);
          StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_l9);
          StubRoutines::_vector_double256_exp = CAST_FROM_FN_PTR(address, __svml_exp4_ha_l9);
          StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_l9);
          StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_l9);
          StubRoutines::_vector_float256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f8_ha_l9);
          StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_l9);
          StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_l9);
          StubRoutines::_vector_double256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm14_ha_l9);
          StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_l9);
          StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_l9);
          StubRoutines::_vector_float256_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf8_ha_l9);
          StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_l9);
          StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_l9);
          StubRoutines::_vector_double256_log1p = CAST_FROM_FN_PTR(address, __svml_log1p4_ha_l9);
          StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_l9);
          StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_l9);
          StubRoutines::_vector_float256_log = CAST_FROM_FN_PTR(address, __svml_logf8_ha_l9);
          StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_l9);
          StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_l9);
          StubRoutines::_vector_double256_log = CAST_FROM_FN_PTR(address, __svml_log4_ha_l9);
          StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_l9);
          StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_l9);
          StubRoutines::_vector_float256_log10 = CAST_FROM_FN_PTR(address, __svml_log10f8_ha_l9);
          StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_l9);
          StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_l9);
          StubRoutines::_vector_double256_log10 = CAST_FROM_FN_PTR(address, __svml_log104_ha_l9);
          StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_l9);
          StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_l9);
          StubRoutines::_vector_float256_sin = CAST_FROM_FN_PTR(address, __svml_sinf8_ha_l9);
          StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_l9);
          StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_l9);
          StubRoutines::_vector_double256_sin = CAST_FROM_FN_PTR(address, __svml_sin4_ha_l9);
          StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_l9);
          StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_l9);
          StubRoutines::_vector_float256_cos = CAST_FROM_FN_PTR(address, __svml_cosf8_ha_l9);
          StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_l9);
          StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_l9);
          StubRoutines::_vector_double256_cos = CAST_FROM_FN_PTR(address, __svml_cos4_ha_l9);
          StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_l9);
          StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_l9);
          StubRoutines::_vector_float256_tan = CAST_FROM_FN_PTR(address, __svml_tanf8_ha_l9);
          StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_l9);
          StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_l9);
          StubRoutines::_vector_double256_tan = CAST_FROM_FN_PTR(address, __svml_tan4_ha_l9);
          StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_l9);
          StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_l9);
          StubRoutines::_vector_float256_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf8_ha_l9);
          StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_l9);
          StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_l9);
          StubRoutines::_vector_double256_sinh = CAST_FROM_FN_PTR(address, __svml_sinh4_ha_l9);
          StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_l9);
          StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_l9);
          StubRoutines::_vector_float256_cosh = CAST_FROM_FN_PTR(address, __svml_coshf8_ha_l9);
          StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_l9);
          StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_l9);
          StubRoutines::_vector_double256_cosh = CAST_FROM_FN_PTR(address, __svml_cosh4_ha_l9);
          StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_l9);
          StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_l9);
          StubRoutines::_vector_float256_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf8_ha_l9);
          StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_l9);
          StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_l9);
          StubRoutines::_vector_double256_tanh = CAST_FROM_FN_PTR(address, __svml_tanh4_ha_l9);
          StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_l9);
          StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_l9);
          StubRoutines::_vector_float256_acos = CAST_FROM_FN_PTR(address, __svml_acosf8_ha_l9);
          StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_l9);
          StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_l9);
          StubRoutines::_vector_double256_acos = CAST_FROM_FN_PTR(address, __svml_acos4_ha_l9);
          StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_l9);
          StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_l9);
          StubRoutines::_vector_float256_asin = CAST_FROM_FN_PTR(address, __svml_asinf8_ha_l9);
          StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_l9);
          StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_l9);
          StubRoutines::_vector_double256_asin = CAST_FROM_FN_PTR(address, __svml_asin4_ha_l9);
          StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_l9);
          StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_l9);
          StubRoutines::_vector_float256_atan = CAST_FROM_FN_PTR(address, __svml_atanf8_ha_l9);
          StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_l9);
          StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_l9);
          StubRoutines::_vector_double256_atan = CAST_FROM_FN_PTR(address, __svml_atan4_ha_l9);
          StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
          StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
          StubRoutines::_vector_float256_pow = CAST_FROM_FN_PTR(address, __svml_powf8_ha_l9);
          StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_l9);
          StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_l9);
          StubRoutines::_vector_double256_pow = CAST_FROM_FN_PTR(address, __svml_pow4_ha_l9);
          StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
          StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
          StubRoutines::_vector_float256_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_l9);
          StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_l9);
          StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_l9);
          StubRoutines::_vector_double256_hypot = CAST_FROM_FN_PTR(address, __svml_hypot4_ha_l9);
          StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_l9);
          StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_l9);
          StubRoutines::_vector_float256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf8_ha_l9);
          StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_l9);
          StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_l9);
          StubRoutines::_vector_double256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt4_ha_l9);
          StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_l9);
          StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_l9);
          StubRoutines::_vector_float256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f8_ha_l9);
          StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_l9);
          StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_l9);
          StubRoutines::_vector_double256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan24_ha_l9);
        }
      } else if (UseSSE >= 2) {
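        // Without AVX, install the _ex-suffixed entry points, which appear to be
        // the SSE2-compatible SVML builds. Only the 64-bit and 128-bit vector
        // species are wired up in this branch; wider species are left unset.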
        StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_ex);
        StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_ex);
        StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_ex);
        StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_ex);
        StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_ex);
        StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_ex);
        StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_ex);
        StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_ex);
        StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_ex);
        StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_ex);
        StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_ex);
        StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_ex);
        StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_ex);
        StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_ex);
        StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_ex);
        StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_ex);
        StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_ex);
        StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_ex);
        StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_ex);
        StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_ex);
        StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_ex);
        StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_ex);
        StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_ex);
        StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_ex);
        StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_ex);
        StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_ex);
        StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_ex);
        StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_ex);
        StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_ex);
        StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_ex);
        StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_ex);
        StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_ex);
        StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_ex);
        StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_ex);
        StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_ex);
        StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_ex);
        StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_ex);
        StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_ex);
        StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_ex);
        StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_ex);
        StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_ex);
        StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_ex);
        StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_ex);
        StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_ex);
        StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_ex);
        StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_ex);
        StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_ex);
        StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_ex);
        StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_ex);
        StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_ex);
        StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_ex);
        StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_ex);
        StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_ex);
        StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_ex);
        StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_ex);
        StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_ex);
        StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_ex);
        StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_ex);
        StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_ex);
        StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_ex);
        StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_ex);
        StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_ex);
        StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_ex);
        StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_ex);
        StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);
        StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);
        StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_ex);
        StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_ex);
        StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);
        StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);
        StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_ex);
        StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_ex);
      }
  }
#endif
  }

 public:
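  // Build either the initial stubs (generated very early in VM startup) or,
  // when 'all' is true, the remaining stubs generated later in initialization.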
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

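// Entry point used by the shared StubRoutines initialization code to drive
// platform-specific stub generation.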
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}