/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "ci/ciUtilities.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

#ifdef __VECTOR_API_MATH_INTRINSICS_COMMON
// Vector API SVML routines written in assembly
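// Naming: __svml_<op>[f]<lanes>_ha_<suffix>, where "ha" selects the
// high-accuracy variants and the trailing suffix appears to select the
// ISA-specific build (_ex: SSE-class, _e9: AVX, _l9: AVX2, _z0: AVX-512).
// Note that the scalar C prototypes below do not describe the real
// (vector) signatures; the declarations exist only so that the stub
// generator can take the addresses of these symbols.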
extern "C"
{
   float __svml_expf4_ha_ex(float a);
   double __svml_exp1_ha_ex(double a);
   double __svml_exp2_ha_ex(double a);
   float __svml_expf4_ha_l9(float a);
   float __svml_expf8_ha_l9(float a);
   float __svml_expf4_ha_e9(float a);
   float __svml_expf8_ha_e9(float a);
   float __svml_expf16_ha_z0(float a);
   double __svml_exp1_ha_l9(double a);
   double __svml_exp2_ha_l9(double a);
   double __svml_exp4_ha_l9(double a);
   double __svml_exp1_ha_e9(double a);
   double __svml_exp2_ha_e9(double a);
   double __svml_exp4_ha_e9(double a);
   double __svml_exp8_ha_z0(double a);
   float __svml_expm1f4_ha_ex(float a);
   double __svml_expm11_ha_ex(double a);
   double __svml_expm12_ha_ex(double a);
   float __svml_expm1f4_ha_l9(float a);
   float __svml_expm1f8_ha_l9(float a);
   float __svml_expm1f4_ha_e9(float a);
   float __svml_expm1f8_ha_e9(float a);
   float __svml_expm1f16_ha_z0(float a);
   double __svml_expm11_ha_l9(double a);
   double __svml_expm12_ha_l9(double a);
   double __svml_expm14_ha_l9(double a);
   double __svml_expm11_ha_e9(double a);
   double __svml_expm12_ha_e9(double a);
   double __svml_expm14_ha_e9(double a);
   double __svml_expm18_ha_z0(double a);
   float __svml_log1pf4_ha_l9(float a);
   float __svml_log1pf8_ha_l9(float a);
   float __svml_log1pf4_ha_e9(float a);
   float __svml_log1pf8_ha_e9(float a);
   float __svml_log1pf16_ha_z0(float a);
   double __svml_log1p1_ha_l9(double a);
   double __svml_log1p2_ha_l9(double a);
   double __svml_log1p4_ha_l9(double a);
   double __svml_log1p1_ha_e9(double a);
   double __svml_log1p2_ha_e9(double a);
   double __svml_log1p4_ha_e9(double a);
   double __svml_log1p8_ha_z0(double a);
   float __svml_logf4_ha_l9(float a);
   float __svml_logf8_ha_l9(float a);
   float __svml_logf4_ha_e9(float a);
   float __svml_logf8_ha_e9(float a);
   float __svml_logf16_ha_z0(float a);
   double __svml_log1_ha_l9(double a);
   double __svml_log2_ha_l9(double a);
   double __svml_log4_ha_l9(double a);
   double __svml_log1_ha_e9(double a);
   double __svml_log2_ha_e9(double a);
   double __svml_log4_ha_e9(double a);
   double __svml_log8_ha_z0(double a);
   float __svml_log10f4_ha_l9(float a);
   float __svml_log10f8_ha_l9(float a);
   float __svml_log10f4_ha_e9(float a);
   float __svml_log10f8_ha_e9(float a);
   float __svml_log10f16_ha_z0(float a);
   double __svml_log101_ha_l9(double a);
   double __svml_log102_ha_l9(double a);
   double __svml_log104_ha_l9(double a);
   double __svml_log101_ha_e9(double a);
   double __svml_log102_ha_e9(double a);
   double __svml_log104_ha_e9(double a);
   double __svml_log108_ha_z0(double a);
   float __svml_sinf4_ha_l9(float a);
   float __svml_sinf8_ha_l9(float a);
   float __svml_sinf4_ha_e9(float a);
   float __svml_sinf8_ha_e9(float a);
   float __svml_sinf16_ha_z0(float a);
   double __svml_sin1_ha_l9(double a);
   double __svml_sin2_ha_l9(double a);
   double __svml_sin4_ha_l9(double a);
   double __svml_sin1_ha_e9(double a);
   double __svml_sin2_ha_e9(double a);
   double __svml_sin4_ha_e9(double a);
   double __svml_sin8_ha_z0(double a);
   float __svml_cosf4_ha_l9(float a);
   float __svml_cosf8_ha_l9(float a);
   float __svml_cosf4_ha_e9(float a);
   float __svml_cosf8_ha_e9(float a);
   float __svml_cosf16_ha_z0(float a);
   double __svml_cos1_ha_l9(double a);
   double __svml_cos2_ha_l9(double a);
   double __svml_cos4_ha_l9(double a);
   double __svml_cos1_ha_e9(double a);
   double __svml_cos2_ha_e9(double a);
   double __svml_cos4_ha_e9(double a);
   double __svml_cos8_ha_z0(double a);
   float __svml_tanf4_ha_l9(float a);
   float __svml_tanf8_ha_l9(float a);
   float __svml_tanf4_ha_e9(float a);
   float __svml_tanf8_ha_e9(float a);
   float __svml_tanf16_ha_z0(float a);
   double __svml_tan1_ha_l9(double a);
   double __svml_tan2_ha_l9(double a);
   double __svml_tan4_ha_l9(double a);
   double __svml_tan1_ha_e9(double a);
   double __svml_tan2_ha_e9(double a);
   double __svml_tan4_ha_e9(double a);
   double __svml_tan8_ha_z0(double a);
   double __svml_sinh1_ha_l9(double a);
   double __svml_sinh2_ha_l9(double a);
   double __svml_sinh4_ha_l9(double a);
   double __svml_sinh1_ha_e9(double a);
   double __svml_sinh2_ha_e9(double a);
   double __svml_sinh4_ha_e9(double a);
   double __svml_sinh8_ha_z0(double a);
   float __svml_sinhf4_ha_l9(float a);
   float __svml_sinhf8_ha_l9(float a);
   float __svml_sinhf4_ha_e9(float a);
   float __svml_sinhf8_ha_e9(float a);
   float __svml_sinhf16_ha_z0(float a);
   double __svml_cosh1_ha_l9(double a);
   double __svml_cosh2_ha_l9(double a);
   double __svml_cosh4_ha_l9(double a);
   double __svml_cosh1_ha_e9(double a);
   double __svml_cosh2_ha_e9(double a);
   double __svml_cosh4_ha_e9(double a);
   double __svml_cosh8_ha_z0(double a);
   float __svml_coshf4_ha_l9(float a);
   float __svml_coshf8_ha_l9(float a);
   float __svml_coshf4_ha_e9(float a);
   float __svml_coshf8_ha_e9(float a);
   float __svml_coshf16_ha_z0(float a);
   double __svml_tanh1_ha_l9(double a);
   double __svml_tanh2_ha_l9(double a);
   double __svml_tanh4_ha_l9(double a);
   double __svml_tanh1_ha_e9(double a);
   double __svml_tanh2_ha_e9(double a);
   double __svml_tanh4_ha_e9(double a);
   double __svml_tanh8_ha_z0(double a);
   float __svml_tanhf4_ha_l9(float a);
   float __svml_tanhf8_ha_l9(float a);
   float __svml_tanhf4_ha_e9(float a);
   float __svml_tanhf8_ha_e9(float a);
   float __svml_tanhf16_ha_z0(float a);
   float __svml_acosf4_ha_ex(float a);
   float __svml_acosf4_ha_l9(float a);
   float __svml_acosf8_ha_l9(float a);
   float __svml_acosf4_ha_e9(float a);
   float __svml_acosf8_ha_e9(float a);
   float __svml_acosf16_ha_z0(float a);
   double __svml_acos1_ha_ex(double a);
   double __svml_acos2_ha_ex(double a);
   double __svml_acos1_ha_l9(double a);
   double __svml_acos2_ha_l9(double a);
   double __svml_acos4_ha_l9(double a);
   double __svml_acos1_ha_e9(double a);
   double __svml_acos2_ha_e9(double a);
   double __svml_acos4_ha_e9(double a);
   double __svml_acos8_ha_z0(double a);
   float __svml_asinf4_ha_ex(float a);
   double __svml_asin1_ha_ex(double a);
   double __svml_asin2_ha_ex(double a);
   double __svml_asin1_ha_l9(double a);
   double __svml_asin2_ha_l9(double a);
   double __svml_asin4_ha_l9(double a);
   double __svml_asin1_ha_e9(double a);
   double __svml_asin2_ha_e9(double a);
   double __svml_asin4_ha_e9(double a);
   double __svml_asin8_ha_z0(double a);
   float __svml_asinf4_ha_l9(float a);
   float __svml_asinf8_ha_l9(float a);
   float __svml_asinf4_ha_e9(float a);
   float __svml_asinf8_ha_e9(float a);
   float __svml_asinf16_ha_z0(float a);
   float __svml_atanf4_ha_ex(float a);
   double __svml_atan1_ha_ex(double a);
   double __svml_atan2_ha_ex(double a);
   double __svml_atan1_ha_l9(double a);
   double __svml_atan2_ha_l9(double a);
   double __svml_atan4_ha_l9(double a);
   double __svml_atan1_ha_e9(double a);
   double __svml_atan2_ha_e9(double a);
   double __svml_atan4_ha_e9(double a);
   double __svml_atan8_ha_z0(double a);
   float __svml_atanf4_ha_l9(float a);
   float __svml_atanf8_ha_l9(float a);
   float __svml_atanf4_ha_e9(float a);
   float __svml_atanf8_ha_e9(float a);
   float __svml_atanf16_ha_z0(float a);
   float __svml_powf4_ha_l9(float a, float b);
   float __svml_powf8_ha_l9(float a, float b);
   float __svml_powf4_ha_e9(float a, float b);
   float __svml_powf8_ha_e9(float a, float b);
   float __svml_powf16_ha_z0(float a, float b);
   double __svml_pow1_ha_l9(double a, double b);
   double __svml_pow2_ha_l9(double a, double b);
   double __svml_pow4_ha_l9(double a, double b);
   double __svml_pow1_ha_e9(double a, double b);
   double __svml_pow2_ha_e9(double a, double b);
   double __svml_pow4_ha_e9(double a, double b);
   double __svml_pow8_ha_z0(double a, double b);
   float __svml_hypotf4_ha_l9(float a, float b);
   float __svml_hypotf8_ha_l9(float a, float b);
   float __svml_hypotf4_ha_e9(float a, float b);
   float __svml_hypotf8_ha_e9(float a, float b);
   float __svml_hypotf16_ha_z0(float a, float b);
   double __svml_hypot1_ha_l9(double a, double b);
   double __svml_hypot2_ha_l9(double a, double b);
   double __svml_hypot4_ha_l9(double a, double b);
   double __svml_hypot1_ha_e9(double a, double b);
   double __svml_hypot2_ha_e9(double a, double b);
   double __svml_hypot4_ha_e9(double a, double b);
   double __svml_hypot8_ha_z0(double a, double b);
   float __svml_cbrtf4_ha_l9(float a);
   float __svml_cbrtf8_ha_l9(float a);
   float __svml_cbrtf4_ha_e9(float a);
   float __svml_cbrtf8_ha_e9(float a);
   float __svml_cbrtf16_ha_z0(float a);
   double __svml_cbrt1_ha_l9(double a);
   double __svml_cbrt2_ha_l9(double a);
   double __svml_cbrt4_ha_l9(double a);
   double __svml_cbrt1_ha_e9(double a);
   double __svml_cbrt2_ha_e9(double a);
   double __svml_cbrt4_ha_e9(double a);
   double __svml_cbrt8_ha_z0(double a);
   float __svml_atan2f4_ha_l9(float a, float b);
   float __svml_atan2f8_ha_l9(float a, float b);
   float __svml_atan2f4_ha_e9(float a, float b);
   float __svml_atan2f8_ha_e9(float a, float b);
   float __svml_atan2f16_ha_z0(float a, float b);
   double __svml_atan21_ha_l9(double a, double b);
   double __svml_atan22_ha_l9(double a, double b);
   double __svml_atan24_ha_l9(double a, double b);
   double __svml_atan21_ha_e9(double a, double b);
   double __svml_atan22_ha_e9(double a, double b);
   double __svml_atan24_ha_e9(double a, double b);
   double __svml_atan28_ha_z0(double a, double b);
   float __svml_sinf4_ha_ex(float a);
   double __svml_sin1_ha_ex(double a);
   double __svml_sin2_ha_ex(double a);
   float __svml_cosf4_ha_ex(float a);
   double __svml_cos1_ha_ex(double a);
   double __svml_cos2_ha_ex(double a);
   float __svml_tanf4_ha_ex(float a);
   double __svml_tan1_ha_ex(double a);
   double __svml_tan2_ha_ex(double a);
   float __svml_sinhf4_ha_ex(float a);
   double __svml_sinh1_ha_ex(double a);
   double __svml_sinh2_ha_ex(double a);
   float __svml_coshf4_ha_ex(float a);
   double __svml_cosh1_ha_ex(double a);
   double __svml_cosh2_ha_ex(double a);
   float __svml_tanhf4_ha_ex(float a);
   double __svml_tanh1_ha_ex(double a);
   double __svml_tanh2_ha_ex(double a);
   double __svml_log1_ha_ex(double a);
   double __svml_log2_ha_ex(double a);
   double __svml_log1p1_ha_ex(double a);
   double __svml_log1p2_ha_ex(double a);
   double __svml_log101_ha_ex(double a);
   double __svml_log102_ha_ex(double a);
   float __svml_logf4_ha_ex(float a);
   float __svml_log1pf4_ha_ex(float a);
   float __svml_log10f4_ha_ex(float a);
   double __svml_atan21_ha_ex(double a, double b);
   double __svml_atan22_ha_ex(double a, double b);
   float __svml_atan2f4_ha_ex(float a, float b);
   float __svml_hypotf4_ha_ex(float a, float b);
   double __svml_hypot1_ha_ex(double a, double b);
   double __svml_hypot2_ha_ex(double a, double b);
   double __svml_pow1_ha_ex(double a, double b);
   double __svml_pow2_ha_ex(double a, double b);
   float __svml_powf4_ha_ex(float a, float b);
   double __svml_cbrt1_ha_ex(double a);
   double __svml_cbrt2_ha_ex(double a);
   float __svml_cbrtf4_ha_ex(float a);
}
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
#define a__ ((Assembler*)_masm)->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
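// (Bits 0-5 of MXCSR are the sticky exception-status flags; clearing them
// with this mask leaves only the control bits, i.e. DAZ, the exception
// masks, rounding control and FZ, to be compared against the expected
// standard value.)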

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    // This can destroy rscratch1 if counter is far from the code cache
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Linux Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    16(rbp): parameter size (in words)              int
  //    24(rbp): thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -12 [ argument word 1      ]
  // -11 [ saved r15            ] <--- rsp_after_call
  // -10 [ saved r14            ]
  //  -9 [ saved r13            ]
  //  -8 [ saved r12            ]
  //  -7 [ saved rbx            ]
  //  -6 [ call wrapper         ]
  //  -5 [ result               ]
  //  -4 [ result type          ]
  //  -3 [ method               ]
  //  -2 [ entry point          ]
  //  -1 [ parameters           ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ parameter size       ]
  //   3 [ thread               ]
  //
  // Windows Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    48(rbp): (interpreter) entry point              address
  //    56(rbp): parameters                             intptr_t*
  //    64(rbp): parameter size (in words)              int
  //    72(rbp): thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp_after_call
  //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
  // -27 [ saved xmm15          ]
  //     [ saved xmm7-xmm14     ]
  //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
  //  -7 [ saved r15            ]
  //  -6 [ saved r14            ]
  //  -5 [ saved r13            ]
  //  -4 [ saved r12            ]
  //  -3 [ saved rdi            ]
  //  -2 [ saved rsi            ]
  //  -1 [ saved rbx            ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ call wrapper         ]
  //   3 [ result               ]
  //   4 [ result type          ]
  //   5 [ method               ]
  //   6 [ entry point          ]
  //   7 [ parameters           ]
  //   8 [ parameter size       ]
  //   9 [ thread               ]
  //
  //    Windows reserves the caller's stack space for arguments 1-4.
  //    We spill c_rarg0-c_rarg3 to this space.
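  //
  //    The generated stub is invoked from JavaCalls::call_helper() through
  //    a function pointer of roughly this shape (see the CallStub typedef
  //    in stubRoutines.hpp):
  //
  //      typedef void (*CallStub)(address   link,
  //                               intptr_t* result,
  //                               BasicType result_type,
  //                               Method*   method,
  //                               address   entry_point,
  //                               intptr_t* parameters,
  //                               int       size_of_parameters,
  //                               TRAPS);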

  // Call stub stack layout word offsets from rbp
  enum call_stub_layout {
#ifdef _WIN64
    xmm_save_first     = 6,  // save from xmm6
    xmm_save_last      = 31, // to xmm31
    xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
    r15_off            = -7,
    r14_off            = -6,
    r13_off            = -5,
    r12_off            = -4,
    rdi_off            = -3,
    rsi_off            = -2,
    rbx_off            = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    call_wrapper_off   =  2,
    result_off         =  3,
    result_type_off    =  4,
    method_off         =  5,
    entry_point_off    =  6,
    parameters_off     =  7,
    parameter_size_off =  8,
    thread_off         =  9
#else
    rsp_after_call_off = -12,
    mxcsr_off          = rsp_after_call_off,
    r15_off            = -11,
    r14_off            = -10,
    r13_off            = -9,
    r12_off            = -8,
    rbx_off            = -7,
    call_wrapper_off   = -6,
    result_off         = -5,
    result_type_off    = -4,
    method_off         = -3,
    entry_point_off    = -2,
    parameters_off     = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    parameter_size_off =  2,
    thread_off         =  3
#endif
  };

#ifdef _WIN64
  Address xmm_save(int reg) {
    assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
    return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
  }
#endif

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);

    const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
    const Address result        (rbp, result_off         * wordSize);
    const Address result_type   (rbp, result_type_off    * wordSize);
    const Address method        (rbp, method_off         * wordSize);
    const Address entry_point   (rbp, entry_point_off    * wordSize);
    const Address parameters    (rbp, parameters_off     * wordSize);
    const Address parameter_size(rbp, parameter_size_off * wordSize);

    // same as in generate_catch_exception()!
    const Address thread        (rbp, thread_off         * wordSize);

    const Address r15_save(rbp, r15_off * wordSize);
    const Address r14_save(rbp, r14_off * wordSize);
    const Address r13_save(rbp, r13_off * wordSize);
    const Address r12_save(rbp, r12_off * wordSize);
    const Address rbx_save(rbp, rbx_off * wordSize);

    // stub code
    __ enter();
    __ subptr(rsp, -rsp_after_call_off * wordSize);

    // save register parameters
#ifndef _WIN64
    __ movptr(parameters,   c_rarg5); // parameters
    __ movptr(entry_point,  c_rarg4); // entry_point
#endif

    __ movptr(method,       c_rarg3); // method
    __ movl(result_type,  c_rarg2);   // result type
    __ movptr(result,       c_rarg1); // result
    __ movptr(call_wrapper, c_rarg0); // call wrapper

    // save regs belonging to calling function
    __ movptr(rbx_save, rbx);
    __ movptr(r12_save, r12);
    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);
    __ movptr(r15_save, r15);

#ifdef _WIN64
    int last_reg = 15;
    if (UseAVX > 2) {
      last_reg = 31;
    }
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(xmm_save(i), as_XMMRegister(i));
      }
    }

    const Address rdi_save(rbp, rdi_off * wordSize);
    const Address rsi_save(rbp, rsi_off * wordSize);

    __ movptr(rsi_save, rsi);
    __ movptr(rdi_save, rdi);
#else
    const Address mxcsr_save(rbp, mxcsr_off * wordSize);
    {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }
#endif

    // Load up thread register
    __ movptr(r15_thread, thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(c_rarg3, parameter_size);
    __ testl(c_rarg3, c_rarg3);
    __ jcc(Assembler::zero, parameters_done);

    Label loop;
    __ movptr(c_rarg2, parameters);       // parameter pointer
    __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
    __ BIND(loop);
    __ movptr(rax, Address(c_rarg2, 0));// get parameter
    __ addptr(c_rarg2, wordSize);       // advance to next parameter
    __ decrementl(c_rarg1);             // decrement counter
    __ push(rax);                       // pass parameter
    __ jcc(Assembler::notZero, loop);

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);             // get Method*
    __ movptr(c_rarg1, entry_point);    // get entry_point
    __ mov(r13, rsp);                   // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(c_rarg1);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(c_rarg0, result);
    Label is_long, is_float, is_double, exit;
    __ movl(c_rarg1, result_type);
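    // T_OBJECT results are returned in rax and are 64 bits wide, so they
    // take the same store path as T_LONG below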
    __ cmpl(c_rarg1, T_OBJECT);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(c_rarg1, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(c_rarg0, 0), rax);

    __ BIND(exit);

    // pop parameters
    __ lea(rsp, rsp_after_call);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::call_stub: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::call_stub: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ bind(L3);
    }
#endif

    // restore regs belonging to calling function
#ifdef _WIN64
    // emit the restores for xmm regs
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(as_XMMRegister(i), xmm_save(i));
      }
    }
#endif
    __ movptr(r15, r15_save);
    __ movptr(r14, r14_save);
    __ movptr(r13, r13_save);
    __ movptr(r12, r12_save);
    __ movptr(rbx, rbx_save);

#ifdef _WIN64
    __ movptr(rdi, rdi_save);
    __ movptr(rsi, rsi_save);
#else
    __ ldmxcsr(mxcsr_save);
#endif

    // restore rsp
    __ addptr(rsp, -rsp_after_call_off * wordSize);

    // return
    __ vzeroupper();
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    __ BIND(is_long);
    __ movq(Address(c_rarg0, 0), rax);
    __ jmp(exit);

    __ BIND(is_float);
    __ movflt(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    __ BIND(is_double);
    __ movdbl(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
    const Address thread        (rbp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L3);
    }
#endif

    // set pending exception
    __ verify_oop(rax);

    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
    __ lea(rscratch1, ExternalAddress((address)__FILE__));
    __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
    __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    __ movptr(c_rarg0, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    r15_thread, c_rarg0);
    __ mov(rbx, rax);

    // setup rax & rdx, remove return address & clear pending exception
    __ pop(rdx);
    __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ testptr(rax, rax);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ verify_oop(rax);
    __ jmp(rbx);

    return start;
  }

  // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
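  //
  // A sketch of the equivalent C++ (executed atomically; XCHG with a
  // memory operand asserts an implicit LOCK):
  //
  //   jint atomic_xchg(jint exchange_value, volatile jint* dest) {
  //     jint old = *dest;
  //     *dest = exchange_value;
  //     return old;
  //   }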
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow
    __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
    __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
  //                                         jint compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    __ movl(rax, c_rarg2);
    __ lock();
    __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
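    // CMPXCHG compares rax (compare_value) with *dest: on a match it
    // stores c_rarg0 (exchange_value) and leaves rax == compare_value;
    // otherwise it loads the current *dest into rax. Either way rax
    // already holds the value to return.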
    __ ret(0);

    return start;
  }

  // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
  //                                           int8_t compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_byte() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
    address start = __ pc();

    __ movsbq(rax, c_rarg2);
    __ lock();
    __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
  //                                            volatile int64_t* dest,
  //                                            int64_t compare_value)
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value;
  //       return compare_value;
  //    } else {
  //       return *dest;
  //    }
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    __ movq(rax, c_rarg2);
    __ lock();
    __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }

  // Support for jint atomic::add(jint add_value, volatile jint* dest)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();

    __ movl(rax, c_rarg0);
    __ lock();
    __ xaddl(Address(c_rarg1, 0), c_rarg0);
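    // rax still holds add_value; adding the old *dest (left in c_rarg0
    // by XADD) reconstructs the updated value to return.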
    __ addl(rax, c_rarg0);
    __ ret(0);

    return start;
  }

  // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
  //
  // Arguments :
  //    c_rarg0: add_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest += add_value
  //    return *dest;
  address generate_atomic_add_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
    __ lock();
    __ xaddptr(Address(c_rarg1, 0), c_rarg0);
    __ addptr(rax, c_rarg0);
    __ ret(0);

    return start;
  }

  // Support for intptr_t OrderAccess::fence()
  //
  // Arguments :
  //
  // Result:
  address generate_orderaccess_fence() {
    StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
    address start = __ pc();
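    // On x86 only StoreLoad reordering is observable, so a single
    // StoreLoad barrier yields full-fence semantics; membar() typically
    // emits a locked read-modify-write on the stack, which is cheaper
    // than MFENCE.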
    __ membar(Assembler::StoreLoad);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp(rbp, 0);
    const Address older_fp(rax, 0);
    address start = __ pc();

    __ enter();
    __ movptr(rax, old_fp); // caller's fp
    __ movptr(rax, older_fp); // the frame for ps()
    __ pop(rbp);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_sp()
  //
  // This routine is used to find the previous stack pointer for the
  // caller.
  address generate_get_previous_sp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
    address start = __ pc();

    __ movptr(rax, rsp);
    __ addptr(rax, 8); // return address is at the top of the stack.
    __ ret(0);

    return start;
  }

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.

  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize);      // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");

      __ ldmxcsr(mxcsr_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }

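  // The fixup stubs below post-process the results of CVTTSS2SI/CVTTSD2SI.
  // For NaN and out-of-range inputs the hardware produces the "integer
  // indefinite" value (min_jint/min_jlong), while Java requires NaN -> 0
  // and saturation to min/max. Each stub re-examines the original
  // float/double bits in the in/out stack slot and patches the result.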
  address generate_f2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
    __ movl(c_rarg3, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmovl(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_f2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves
    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg3, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmov(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

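    // NaN detection: a double is NaN iff its biased exponent is all ones
    // and its mantissa is non-zero. Fold "low 32 bits non-zero" into
    // bit 0 of the sign-cleared high word; a result strictly above
    // 0x7ff00000 then identifies NaN, which must convert to 0.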
    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
    __ movl(c_rarg2, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmov(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movptr(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg2, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmovq(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movq(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

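    // A 16-byte constant: the 64-bit mask is replicated twice so that it
    // fills an XMM register (used e.g. as a float/double sign-bit mask
    // operand for ANDPS/ANDPD-style operations).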
    __ emit_data64( mask, relocInfo::none );
    __ emit_data64( mask, relocInfo::none );

    return start;
  }

  address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);

    return start;
  }

  address generate_vector_byte_perm_mask(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(0x0000000000000001, relocInfo::none);
    __ emit_data64(0x0000000000000003, relocInfo::none);
    __ emit_data64(0x0000000000000005, relocInfo::none);
    __ emit_data64(0x0000000000000007, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000002, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none);
    __ emit_data64(0x0000000000000006, relocInfo::none);

    return start;
  }

  address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
                                     int32_t val0, int32_t val1, int32_t val2, int32_t val3,
                                     int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
                                     int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
                                     int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    assert(len != Assembler::AVX_NoVec, "vector len must be specified");
    __ emit_data(val0, relocInfo::none, 0);
    __ emit_data(val1, relocInfo::none, 0);
    __ emit_data(val2, relocInfo::none, 0);
    __ emit_data(val3, relocInfo::none, 0);
    if (len >= Assembler::AVX_256bit) {
      __ emit_data(val4, relocInfo::none, 0);
      __ emit_data(val5, relocInfo::none, 0);
      __ emit_data(val6, relocInfo::none, 0);
      __ emit_data(val7, relocInfo::none, 0);
      if (len >= Assembler::AVX_512bit) {
        __ emit_data(val8, relocInfo::none, 0);
        __ emit_data(val9, relocInfo::none, 0);
        __ emit_data(val10, relocInfo::none, 0);
        __ emit_data(val11, relocInfo::none, 0);
        __ emit_data(val12, relocInfo::none, 0);
        __ emit_data(val13, relocInfo::none, 0);
        __ emit_data(val14, relocInfo::none, 0);
        __ emit_data(val15, relocInfo::none, 0);
      }
    }

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));

    __ push(r12);

    // save c_rarg2 and c_rarg3
    __ push(c_rarg2);
    __ push(c_rarg3);

    enum {
           // After previous pushes.
           oop_to_verify = 6 * wordSize,
           saved_rax     = 7 * wordSize,
           saved_r10     = 8 * wordSize,

           // Before the call to MacroAssembler::debug(), see below.
           return_addr   = 16 * wordSize,
           error_msg     = 17 * wordSize
    };

    // get object
    __ movptr(rax, Address(rsp, oop_to_verify));

    // make sure object is 'reasonable'
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if metadata bits indicate a bad oop
      __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
      __ jcc(Assembler::notZero, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ movptr(c_rarg2, rax);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andptr(c_rarg2, c_rarg3);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
    __ cmpptr(c_rarg2, c_rarg3);
    __ jcc(Assembler::notZero, error);

    // set r12 to heapbase for load_klass()
    __ reinit_heapbase();

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(rax, rax);  // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error); // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                             // restore c_rarg3
    __ pop(c_rarg2);                             // restore c_rarg2
    __ pop(r12);                                 // restore r12
    __ popf();                                   // restore flags
    __ ret(4 * wordSize);                        // pop caller saved stuff

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                             // get saved c_rarg3 back
    __ pop(c_rarg2);                             // get saved c_rarg2 back
    __ pop(r12);                                 // get saved r12 back
    __ popf();                                   // get saved flags off stack --
                                                 // will be ignored

    __ pusha();                                  // push registers
                                                 // (rip is already pushed)
    // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
    // pushed all the registers, so now the stack looks like:
    //     [tos +  0] 16 saved registers
    //     [tos + 16] return address
    //   * [tos + 17] error message (char*)
    //   * [tos + 18] object to verify (oop)
    //   * [tos + 19] saved rax - saved by caller and bashed
    //   * [tos + 20] saved r10 (rscratch1) - saved by caller
    //   * = popped on exit

    __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
    __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
    __ movq(c_rarg2, rsp);                          // pass address of regs on stack
    __ mov(r12, rsp);                               // remember rsp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16);                            // align stack as required by ABI
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
    __ mov(rsp, r12);                               // restore rsp
    __ popa();                                      // pop registers (includes r12)
    __ ret(4 * wordSize);                           // pop caller saved stuff

    return start;
  }

  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts.
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
    Label L;
    assert_different_registers(Rtmp, Rint);
    __ movslq(Rtmp, Rint);
    __ cmpq(Rtmp, Rint);
    __ jcc(Assembler::equal, L);
    __ stop("high 32-bits of int value are not 0");
    __ bind(L);
#endif
  }

  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     c_rarg0 - from
  //     c_rarg1 - to
  //     c_rarg2 - element count
  //
  //  Output:
  //     rax   - &from[element count]
  //
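  //  Branches to the no-overlap target/label when a forward (ascending)
  //  element-by-element copy is safe: either to <= from, or the regions
  //  are disjoint (to >= from + count * element_size).
  //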
1479   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1480     assert(no_overlap_target != NULL, "must be generated");
1481     array_overlap_test(no_overlap_target, NULL, sf);
1482   }
1483   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1484     array_overlap_test(NULL, &L_no_overlap, sf);
1485   }
1486   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1487     const Register from     = c_rarg0;
1488     const Register to       = c_rarg1;
1489     const Register count    = c_rarg2;
1490     const Register end_from = rax;
1491 
1492     __ cmpptr(to, from);
1493     __ lea(end_from, Address(from, count, sf, 0));
1494     if (NOLp == NULL) {
1495       ExternalAddress no_overlap(no_overlap_target);
1496       __ jump_cc(Assembler::belowEqual, no_overlap);
1497       __ cmpptr(to, end_from);
1498       __ jump_cc(Assembler::aboveEqual, no_overlap);
1499     } else {
1500       __ jcc(Assembler::belowEqual, (*NOLp));
1501       __ cmpptr(to, end_from);
1502       __ jcc(Assembler::aboveEqual, (*NOLp));
1503     }
1504   }
1505 
1506   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1507   //
1508   // Outputs:
1509   //    rdi - rcx
1510   //    rsi - rdx
1511   //    rdx - r8
1512   //    rcx - r9
1513   //
  // Registers r9 and r10 are used on Windows to save rdi and rsi, which
  // are non-volatile there.  r9 and r10 should not be used by the caller.
1516   //
1517   DEBUG_ONLY(bool regs_in_thread;)
1518 
1519   void setup_arg_regs(int nargs = 3) {
1520     const Register saved_rdi = r9;
1521     const Register saved_rsi = r10;
1522     assert(nargs == 3 || nargs == 4, "else fix");
1523 #ifdef _WIN64
1524     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1525            "unexpected argument registers");
1526     if (nargs >= 4)
1527       __ mov(rax, r9);  // r9 is also saved_rdi
1528     __ movptr(saved_rdi, rdi);
1529     __ movptr(saved_rsi, rsi);
1530     __ mov(rdi, rcx); // c_rarg0
1531     __ mov(rsi, rdx); // c_rarg1
1532     __ mov(rdx, r8);  // c_rarg2
1533     if (nargs >= 4)
1534       __ mov(rcx, rax); // c_rarg3 (via rax)
1535 #else
1536     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1537            "unexpected argument registers");
1538 #endif
1539     DEBUG_ONLY(regs_in_thread = false;)
1540   }
1541 
1542   void restore_arg_regs() {
1543     assert(!regs_in_thread, "wrong call to restore_arg_regs");
1544     const Register saved_rdi = r9;
1545     const Register saved_rsi = r10;
1546 #ifdef _WIN64
1547     __ movptr(rdi, saved_rdi);
1548     __ movptr(rsi, saved_rsi);
1549 #endif
1550   }
1551 
1552   // This is used in places where r10 is a scratch register, and can
1553   // be adapted if r9 is needed also.
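  // The two save schemes compared (illustrative note): setup_arg_regs()
  // parks the Windows-nonvolatile rdi/rsi in r9/r10, so stubs using it
  // must leave both registers alone; setup_arg_regs_using_thread() parks
  // them in JavaThread fields instead, keeping r10 (rscratch1) free at
  // the cost of r9 holding the saved r15.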
1554   void setup_arg_regs_using_thread() {
1555     const Register saved_r15 = r9;
1556 #ifdef _WIN64
1557     __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
1558     __ get_thread(r15_thread);
1559     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1560            "unexpected argument registers");
1561     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1562     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1563 
1564     __ mov(rdi, rcx); // c_rarg0
1565     __ mov(rsi, rdx); // c_rarg1
1566     __ mov(rdx, r8);  // c_rarg2
1567 #else
1568     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1569            "unexpected argument registers");
1570 #endif
1571     DEBUG_ONLY(regs_in_thread = true;)
1572   }
1573 
1574   void restore_arg_regs_using_thread() {
    assert(regs_in_thread, "wrong call to restore_arg_regs_using_thread");
1576     const Register saved_r15 = r9;
1577 #ifdef _WIN64
1578     __ get_thread(r15_thread);
1579     __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1580     __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1581     __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
1582 #endif
1583   }
1584 
  // Copy big chunks forward
  //
  // Inputs:
  //   end_from     - source array end address
  //   end_to       - destination array end address
  //   qword_count  - 64-bit element count, negative
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes - exit label
  //
1595   void copy_bytes_forward(Register end_from, Register end_to,
1596                              Register qword_count, Register to,
1597                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1598     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1599     Label L_loop;
1600     __ align(OptoLoopAlignment);
1601     if (UseUnalignedLoadStores) {
1602       Label L_end;
      // Copy 64 bytes per iteration
1604       __ BIND(L_loop);
1605       if (UseAVX > 2) {
1606         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
1607         __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
1608       } else if (UseAVX == 2) {
1609         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1610         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1611         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1612         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1613       } else {
1614         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1615         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1616         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1617         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1618         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1619         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1620         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1621         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1622       }
1623       __ BIND(L_copy_bytes);
1624       __ addptr(qword_count, 8);
1625       __ jcc(Assembler::lessEqual, L_loop);
1626       __ subptr(qword_count, 4);  // sub(8) and add(4)
1627       __ jccb(Assembler::greater, L_end);
1628       // Copy trailing 32 bytes
1629       if (UseAVX >= 2) {
1630         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1631         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1632       } else {
1633         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1634         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1635         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1636         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1637       }
1638       __ addptr(qword_count, 4);
1639       __ BIND(L_end);
1640       if (UseAVX >= 2) {
1641         // clean upper bits of YMM registers
1642         __ vpxor(xmm0, xmm0);
1643         __ vpxor(xmm1, xmm1);
1644       }
1645     } else {
      // Copy 32 bytes per iteration
1647       __ BIND(L_loop);
1648       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1649       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1650       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1651       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1652       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1653       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1654       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1655       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1656 
1657       __ BIND(L_copy_bytes);
1658       __ addptr(qword_count, 4);
1659       __ jcc(Assembler::lessEqual, L_loop);
1660     }
1661     __ subptr(qword_count, 4);
1662     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1663   }
1664 
  // Copy big chunks backward
  //
  // Inputs:
  //   from         - source array address
  //   dest         - destination array address
  //   qword_count  - 64-bit element count
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes - exit label
  //
1675   void copy_bytes_backward(Register from, Register dest,
1676                               Register qword_count, Register to,
1677                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1678     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1679     Label L_loop;
1680     __ align(OptoLoopAlignment);
1681     if (UseUnalignedLoadStores) {
1682       Label L_end;
      // Copy 64 bytes per iteration
1684       __ BIND(L_loop);
1685       if (UseAVX > 2) {
1686         __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
1687         __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
1688       } else if (UseAVX == 2) {
1689         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1690         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1691         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1692         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1693       } else {
1694         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1695         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1696         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1697         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1698         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1699         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1700         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1701         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1702       }
1703       __ BIND(L_copy_bytes);
1704       __ subptr(qword_count, 8);
1705       __ jcc(Assembler::greaterEqual, L_loop);
1706 
1707       __ addptr(qword_count, 4);  // add(8) and sub(4)
1708       __ jccb(Assembler::less, L_end);
1709       // Copy trailing 32 bytes
1710       if (UseAVX >= 2) {
1711         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1712         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1713       } else {
1714         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1715         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1716         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1717         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1718       }
1719       __ subptr(qword_count, 4);
1720       __ BIND(L_end);
1721       if (UseAVX >= 2) {
1722         // clean upper bits of YMM registers
1723         __ vpxor(xmm0, xmm0);
1724         __ vpxor(xmm1, xmm1);
1725       }
1726     } else {
      // Copy 32 bytes per iteration
1728       __ BIND(L_loop);
1729       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1730       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1731       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1732       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1733       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1734       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1735       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1736       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1737 
1738       __ BIND(L_copy_bytes);
1739       __ subptr(qword_count, 4);
1740       __ jcc(Assembler::greaterEqual, L_loop);
1741     }
1742     __ addptr(qword_count, 4);
1743     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1744   }
1745 
1746 
1747   // Arguments:
1748   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             (currently ignored by this stub)
1750   //   name    - stub name string
1751   //
1752   // Inputs:
1753   //   c_rarg0   - source array address
1754   //   c_rarg1   - destination array address
1755   //   c_rarg2   - element count, treated as ssize_t, can be zero
1756   //
1757   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1758   // we let the hardware handle it.  The one to eight bytes within words,
1759   // dwords or qwords that span cache line boundaries will still be loaded
1760   // and stored atomically.
1761   //
1762   // Side Effects:
1763   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1764   //   used by generate_conjoint_byte_copy().
1765   //
1766   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1767     __ align(CodeEntryAlignment);
1768     StubCodeMark mark(this, "StubRoutines", name);
1769     address start = __ pc();
1770 
1771     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1772     Label L_copy_byte, L_exit;
1773     const Register from        = rdi;  // source array address
1774     const Register to          = rsi;  // destination array address
1775     const Register count       = rdx;  // elements count
1776     const Register byte_count  = rcx;
1777     const Register qword_count = count;
1778     const Register end_from    = from; // source array end address
1779     const Register end_to      = to;   // destination array end address
1780     // End pointers are inclusive, and if count is not zero they point
1781     // to the last unit copied:  end_to[0] := end_from[0]
1782 
1783     __ enter(); // required for proper stackwalking of RuntimeStub frame
1784     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1785 
1786     if (entry != NULL) {
1787       *entry = __ pc();
1788        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1789       BLOCK_COMMENT("Entry:");
1790     }
1791 
1792     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1793                       // r9 and r10 may be used to save non-volatile registers
1794 
1795     // 'from', 'to' and 'count' are now valid
1796     __ movptr(byte_count, count);
1797     __ shrptr(count, 3); // count => qword_count
1798 
1799     // Copy from low to high addresses.  Use 'to' as scratch.
1800     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1801     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1802     __ negptr(qword_count); // make the count negative
1803     __ jmp(L_copy_bytes);
1804 
1805     // Copy trailing qwords
1806   __ BIND(L_copy_8_bytes);
1807     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1808     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1809     __ increment(qword_count);
1810     __ jcc(Assembler::notZero, L_copy_8_bytes);
1811 
1812     // Check for and copy trailing dword
1813   __ BIND(L_copy_4_bytes);
1814     __ testl(byte_count, 4);
1815     __ jccb(Assembler::zero, L_copy_2_bytes);
1816     __ movl(rax, Address(end_from, 8));
1817     __ movl(Address(end_to, 8), rax);
1818 
1819     __ addptr(end_from, 4);
1820     __ addptr(end_to, 4);
1821 
1822     // Check for and copy trailing word
1823   __ BIND(L_copy_2_bytes);
1824     __ testl(byte_count, 2);
1825     __ jccb(Assembler::zero, L_copy_byte);
1826     __ movw(rax, Address(end_from, 8));
1827     __ movw(Address(end_to, 8), rax);
1828 
1829     __ addptr(end_from, 2);
1830     __ addptr(end_to, 2);
1831 
1832     // Check for and copy trailing byte
1833   __ BIND(L_copy_byte);
1834     __ testl(byte_count, 1);
1835     __ jccb(Assembler::zero, L_exit);
1836     __ movb(rax, Address(end_from, 8));
1837     __ movb(Address(end_to, 8), rax);
1838 
1839   __ BIND(L_exit);
1840     restore_arg_regs();
1841     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1842     __ xorptr(rax, rax); // return 0
1843     __ vzeroupper();
1844     __ leave(); // required for proper stackwalking of RuntimeStub frame
1845     __ ret(0);
1846 
    // Copy in multi-byte chunks
1848     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1849     __ jmp(L_copy_4_bytes);
1850 
1851     return start;
1852   }
1853 
1854   // Arguments:
1855   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             (currently ignored by this stub)
1857   //   name    - stub name string
1858   //
1859   // Inputs:
1860   //   c_rarg0   - source array address
1861   //   c_rarg1   - destination array address
1862   //   c_rarg2   - element count, treated as ssize_t, can be zero
1863   //
1864   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1865   // we let the hardware handle it.  The one to eight bytes within words,
1866   // dwords or qwords that span cache line boundaries will still be loaded
1867   // and stored atomically.
1868   //
1869   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1870                                       address* entry, const char *name) {
1871     __ align(CodeEntryAlignment);
1872     StubCodeMark mark(this, "StubRoutines", name);
1873     address start = __ pc();
1874 
1875     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1876     const Register from        = rdi;  // source array address
1877     const Register to          = rsi;  // destination array address
1878     const Register count       = rdx;  // elements count
1879     const Register byte_count  = rcx;
1880     const Register qword_count = count;
1881 
1882     __ enter(); // required for proper stackwalking of RuntimeStub frame
1883     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1884 
1885     if (entry != NULL) {
1886       *entry = __ pc();
1887       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1888       BLOCK_COMMENT("Entry:");
1889     }
1890 
1891     array_overlap_test(nooverlap_target, Address::times_1);
1892     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1893                       // r9 and r10 may be used to save non-volatile registers
1894 
1895     // 'from', 'to' and 'count' are now valid
1896     __ movptr(byte_count, count);
1897     __ shrptr(count, 3);   // count => qword_count
1898 
1899     // Copy from high to low addresses.
1900 
1901     // Check for and copy trailing byte
1902     __ testl(byte_count, 1);
1903     __ jcc(Assembler::zero, L_copy_2_bytes);
1904     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1905     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1906     __ decrement(byte_count); // Adjust for possible trailing word
1907 
1908     // Check for and copy trailing word
1909   __ BIND(L_copy_2_bytes);
1910     __ testl(byte_count, 2);
1911     __ jcc(Assembler::zero, L_copy_4_bytes);
1912     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1913     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1914 
1915     // Check for and copy trailing dword
1916   __ BIND(L_copy_4_bytes);
1917     __ testl(byte_count, 4);
1918     __ jcc(Assembler::zero, L_copy_bytes);
1919     __ movl(rax, Address(from, qword_count, Address::times_8));
1920     __ movl(Address(to, qword_count, Address::times_8), rax);
1921     __ jmp(L_copy_bytes);
1922 
1923     // Copy trailing qwords
1924   __ BIND(L_copy_8_bytes);
1925     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1926     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1927     __ decrement(qword_count);
1928     __ jcc(Assembler::notZero, L_copy_8_bytes);
1929 
1930     restore_arg_regs();
1931     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1932     __ xorptr(rax, rax); // return 0
1933     __ vzeroupper();
1934     __ leave(); // required for proper stackwalking of RuntimeStub frame
1935     __ ret(0);
1936 
    // Copy in multi-byte chunks
1938     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1939 
1940     restore_arg_regs();
1941     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1942     __ xorptr(rax, rax); // return 0
1943     __ vzeroupper();
1944     __ leave(); // required for proper stackwalking of RuntimeStub frame
1945     __ ret(0);
1946 
1947     return start;
1948   }
1949 
1950   // Arguments:
1951   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             (currently ignored by this stub)
1953   //   name    - stub name string
1954   //
1955   // Inputs:
1956   //   c_rarg0   - source array address
1957   //   c_rarg1   - destination array address
1958   //   c_rarg2   - element count, treated as ssize_t, can be zero
1959   //
1960   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1961   // let the hardware handle it.  The two or four words within dwords
1962   // or qwords that span cache line boundaries will still be loaded
1963   // and stored atomically.
1964   //
1965   // Side Effects:
1966   //   disjoint_short_copy_entry is set to the no-overlap entry point
1967   //   used by generate_conjoint_short_copy().
1968   //
1969   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1970     __ align(CodeEntryAlignment);
1971     StubCodeMark mark(this, "StubRoutines", name);
1972     address start = __ pc();
1973 
    Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1975     const Register from        = rdi;  // source array address
1976     const Register to          = rsi;  // destination array address
1977     const Register count       = rdx;  // elements count
1978     const Register word_count  = rcx;
1979     const Register qword_count = count;
1980     const Register end_from    = from; // source array end address
1981     const Register end_to      = to;   // destination array end address
1982     // End pointers are inclusive, and if count is not zero they point
1983     // to the last unit copied:  end_to[0] := end_from[0]
1984 
1985     __ enter(); // required for proper stackwalking of RuntimeStub frame
1986     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1987 
1988     if (entry != NULL) {
1989       *entry = __ pc();
1990       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1991       BLOCK_COMMENT("Entry:");
1992     }
1993 
1994     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1995                       // r9 and r10 may be used to save non-volatile registers
1996 
1997     // 'from', 'to' and 'count' are now valid
1998     __ movptr(word_count, count);
1999     __ shrptr(count, 2); // count => qword_count
2000 
2001     // Copy from low to high addresses.  Use 'to' as scratch.
2002     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2003     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2004     __ negptr(qword_count);
2005     __ jmp(L_copy_bytes);
2006 
2007     // Copy trailing qwords
2008   __ BIND(L_copy_8_bytes);
2009     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2010     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2011     __ increment(qword_count);
2012     __ jcc(Assembler::notZero, L_copy_8_bytes);
2013 
2014     // Original 'dest' is trashed, so we can't use it as a
2015     // base register for a possible trailing word copy
2016 
2017     // Check for and copy trailing dword
2018   __ BIND(L_copy_4_bytes);
2019     __ testl(word_count, 2);
2020     __ jccb(Assembler::zero, L_copy_2_bytes);
2021     __ movl(rax, Address(end_from, 8));
2022     __ movl(Address(end_to, 8), rax);
2023 
2024     __ addptr(end_from, 4);
2025     __ addptr(end_to, 4);
2026 
2027     // Check for and copy trailing word
2028   __ BIND(L_copy_2_bytes);
2029     __ testl(word_count, 1);
2030     __ jccb(Assembler::zero, L_exit);
2031     __ movw(rax, Address(end_from, 8));
2032     __ movw(Address(end_to, 8), rax);
2033 
2034   __ BIND(L_exit);
2035     restore_arg_regs();
2036     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2037     __ xorptr(rax, rax); // return 0
2038     __ vzeroupper();
2039     __ leave(); // required for proper stackwalking of RuntimeStub frame
2040     __ ret(0);
2041 
    // Copy in multi-byte chunks
2043     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2044     __ jmp(L_copy_4_bytes);
2045 
2046     return start;
2047   }
2048 
2049   address generate_fill(BasicType t, bool aligned, const char *name) {
2050     __ align(CodeEntryAlignment);
2051     StubCodeMark mark(this, "StubRoutines", name);
2052     address start = __ pc();
2053 
2054     BLOCK_COMMENT("Entry:");
2055 
    const Register to       = c_rarg0;  // destination array address
2057     const Register value    = c_rarg1;  // value
2058     const Register count    = c_rarg2;  // elements count
2059 
2060     __ enter(); // required for proper stackwalking of RuntimeStub frame
2061 
2062     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
2063 
2064     __ vzeroupper();
2065     __ leave(); // required for proper stackwalking of RuntimeStub frame
2066     __ ret(0);
2067     return start;
2068   }
2069 
2070   // Arguments:
2071   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             (currently ignored by this stub)
2073   //   name    - stub name string
2074   //
2075   // Inputs:
2076   //   c_rarg0   - source array address
2077   //   c_rarg1   - destination array address
2078   //   c_rarg2   - element count, treated as ssize_t, can be zero
2079   //
2080   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2081   // let the hardware handle it.  The two or four words within dwords
2082   // or qwords that span cache line boundaries will still be loaded
2083   // and stored atomically.
2084   //
2085   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2086                                        address *entry, const char *name) {
2087     __ align(CodeEntryAlignment);
2088     StubCodeMark mark(this, "StubRoutines", name);
2089     address start = __ pc();
2090 
2091     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2092     const Register from        = rdi;  // source array address
2093     const Register to          = rsi;  // destination array address
2094     const Register count       = rdx;  // elements count
2095     const Register word_count  = rcx;
2096     const Register qword_count = count;
2097 
2098     __ enter(); // required for proper stackwalking of RuntimeStub frame
2099     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2100 
2101     if (entry != NULL) {
2102       *entry = __ pc();
2103       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2104       BLOCK_COMMENT("Entry:");
2105     }
2106 
2107     array_overlap_test(nooverlap_target, Address::times_2);
2108     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2109                       // r9 and r10 may be used to save non-volatile registers
2110 
2111     // 'from', 'to' and 'count' are now valid
2112     __ movptr(word_count, count);
2113     __ shrptr(count, 2); // count => qword_count
2114 
2115     // Copy from high to low addresses.  Use 'to' as scratch.
2116 
2117     // Check for and copy trailing word
2118     __ testl(word_count, 1);
2119     __ jccb(Assembler::zero, L_copy_4_bytes);
2120     __ movw(rax, Address(from, word_count, Address::times_2, -2));
2121     __ movw(Address(to, word_count, Address::times_2, -2), rax);
2122 
2123     // Check for and copy trailing dword
2124   __ BIND(L_copy_4_bytes);
2125     __ testl(word_count, 2);
2126     __ jcc(Assembler::zero, L_copy_bytes);
2127     __ movl(rax, Address(from, qword_count, Address::times_8));
2128     __ movl(Address(to, qword_count, Address::times_8), rax);
2129     __ jmp(L_copy_bytes);
2130 
2131     // Copy trailing qwords
2132   __ BIND(L_copy_8_bytes);
2133     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2134     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2135     __ decrement(qword_count);
2136     __ jcc(Assembler::notZero, L_copy_8_bytes);
2137 
2138     restore_arg_regs();
2139     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2140     __ xorptr(rax, rax); // return 0
2141     __ vzeroupper();
2142     __ leave(); // required for proper stackwalking of RuntimeStub frame
2143     __ ret(0);
2144 
    // Copy in multi-byte chunks
2146     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2147 
2148     restore_arg_regs();
2149     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2150     __ xorptr(rax, rax); // return 0
2151     __ vzeroupper();
2152     __ leave(); // required for proper stackwalking of RuntimeStub frame
2153     __ ret(0);
2154 
2155     return start;
2156   }
2157 
2158   // Arguments:
2159   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             (used only to set the ARRAYCOPY_ALIGNED decorator)
2161   //   is_oop  - true => oop array, so generate store check code
2162   //   name    - stub name string
2163   //
2164   // Inputs:
2165   //   c_rarg0   - source array address
2166   //   c_rarg1   - destination array address
2167   //   c_rarg2   - element count, treated as ssize_t, can be zero
2168   //
2169   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2170   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
2172   //
2173   // Side Effects:
2174   //   disjoint_int_copy_entry is set to the no-overlap entry point
2175   //   used by generate_conjoint_int_oop_copy().
2176   //
2177   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2178                                          const char *name, bool dest_uninitialized = false) {
2179     __ align(CodeEntryAlignment);
2180     StubCodeMark mark(this, "StubRoutines", name);
2181     address start = __ pc();
2182 
2183     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2184     const Register from        = rdi;  // source array address
2185     const Register to          = rsi;  // destination array address
2186     const Register count       = rdx;  // elements count
2187     const Register dword_count = rcx;
2188     const Register qword_count = count;
2189     const Register end_from    = from; // source array end address
2190     const Register end_to      = to;   // destination array end address
2191     // End pointers are inclusive, and if count is not zero they point
2192     // to the last unit copied:  end_to[0] := end_from[0]
2193 
2194     __ enter(); // required for proper stackwalking of RuntimeStub frame
2195     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2196 
2197     if (entry != NULL) {
2198       *entry = __ pc();
2199       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2200       BLOCK_COMMENT("Entry:");
2201     }
2202 
2203     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2204                                    // r9 is used to save r15_thread
2205 
2206     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2207     if (dest_uninitialized) {
2208       decorators |= IS_DEST_UNINITIALIZED;
2209     }
2210     if (aligned) {
2211       decorators |= ARRAYCOPY_ALIGNED;
2212     }
2213 
2214     BasicType type = is_oop ? T_OBJECT : T_INT;
2215     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2216     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2217 
2218     // 'from', 'to' and 'count' are now valid
2219     __ movptr(dword_count, count);
2220     __ shrptr(count, 1); // count => qword_count
2221 
2222     // Copy from low to high addresses.  Use 'to' as scratch.
2223     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2224     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2225     __ negptr(qword_count);
2226     __ jmp(L_copy_bytes);
2227 
2228     // Copy trailing qwords
2229   __ BIND(L_copy_8_bytes);
2230     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2231     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2232     __ increment(qword_count);
2233     __ jcc(Assembler::notZero, L_copy_8_bytes);
2234 
2235     // Check for and copy trailing dword
2236   __ BIND(L_copy_4_bytes);
2237     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2238     __ jccb(Assembler::zero, L_exit);
2239     __ movl(rax, Address(end_from, 8));
2240     __ movl(Address(end_to, 8), rax);
2241 
2242   __ BIND(L_exit);
2243     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2244     restore_arg_regs_using_thread();
2245     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2246     __ vzeroupper();
2247     __ xorptr(rax, rax); // return 0
2248     __ leave(); // required for proper stackwalking of RuntimeStub frame
2249     __ ret(0);
2250 
    // Copy in multi-byte chunks
2252     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2253     __ jmp(L_copy_4_bytes);
2254 
2255     return start;
2256   }
2257 
2258   // Arguments:
2259   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             (used only to set the ARRAYCOPY_ALIGNED decorator)
2261   //   is_oop  - true => oop array, so generate store check code
2262   //   name    - stub name string
2263   //
2264   // Inputs:
2265   //   c_rarg0   - source array address
2266   //   c_rarg1   - destination array address
2267   //   c_rarg2   - element count, treated as ssize_t, can be zero
2268   //
2269   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2270   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
2272   //
2273   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2274                                          address *entry, const char *name,
2275                                          bool dest_uninitialized = false) {
2276     __ align(CodeEntryAlignment);
2277     StubCodeMark mark(this, "StubRoutines", name);
2278     address start = __ pc();
2279 
2280     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2281     const Register from        = rdi;  // source array address
2282     const Register to          = rsi;  // destination array address
2283     const Register count       = rdx;  // elements count
2284     const Register dword_count = rcx;
2285     const Register qword_count = count;
2286 
2287     __ enter(); // required for proper stackwalking of RuntimeStub frame
2288     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2289 
2290     if (entry != NULL) {
2291       *entry = __ pc();
2292        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2293       BLOCK_COMMENT("Entry:");
2294     }
2295 
2296     array_overlap_test(nooverlap_target, Address::times_4);
2297     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2298                                    // r9 is used to save r15_thread
2299 
2300     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2301     if (dest_uninitialized) {
2302       decorators |= IS_DEST_UNINITIALIZED;
2303     }
2304     if (aligned) {
2305       decorators |= ARRAYCOPY_ALIGNED;
2306     }
2307 
2308     BasicType type = is_oop ? T_OBJECT : T_INT;
2309     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2310     // no registers are destroyed by this call
2311     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2312 
2313     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2314     // 'from', 'to' and 'count' are now valid
2315     __ movptr(dword_count, count);
2316     __ shrptr(count, 1); // count => qword_count
2317 
2318     // Copy from high to low addresses.  Use 'to' as scratch.
2319 
2320     // Check for and copy trailing dword
2321     __ testl(dword_count, 1);
2322     __ jcc(Assembler::zero, L_copy_bytes);
2323     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2324     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2325     __ jmp(L_copy_bytes);
2326 
2327     // Copy trailing qwords
2328   __ BIND(L_copy_8_bytes);
2329     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2330     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2331     __ decrement(qword_count);
2332     __ jcc(Assembler::notZero, L_copy_8_bytes);
2333 
2334     if (is_oop) {
2335       __ jmp(L_exit);
2336     }
2337     restore_arg_regs_using_thread();
2338     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2339     __ xorptr(rax, rax); // return 0
2340     __ vzeroupper();
2341     __ leave(); // required for proper stackwalking of RuntimeStub frame
2342     __ ret(0);
2343 
    // Copy in multi-byte chunks
2345     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2346 
2347   __ BIND(L_exit);
2348     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2349     restore_arg_regs_using_thread();
2350     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2351     __ xorptr(rax, rax); // return 0
2352     __ vzeroupper();
2353     __ leave(); // required for proper stackwalking of RuntimeStub frame
2354     __ ret(0);
2355 
2356     return start;
2357   }
2358 
2359   // Arguments:
2360   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             (used only to set the ARRAYCOPY_ALIGNED decorator)
2362   //   is_oop  - true => oop array, so generate store check code
2363   //   name    - stub name string
2364   //
2365   // Inputs:
2366   //   c_rarg0   - source array address
2367   //   c_rarg1   - destination array address
2368   //   c_rarg2   - element count, treated as ssize_t, can be zero
2369   //
  // Side Effects:
2371   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2372   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2373   //
2374   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2375                                           const char *name, bool dest_uninitialized = false) {
2376     __ align(CodeEntryAlignment);
2377     StubCodeMark mark(this, "StubRoutines", name);
2378     address start = __ pc();
2379 
2380     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2381     const Register from        = rdi;  // source array address
2382     const Register to          = rsi;  // destination array address
2383     const Register qword_count = rdx;  // elements count
2384     const Register end_from    = from; // source array end address
2385     const Register end_to      = rcx;  // destination array end address
2386     const Register saved_count = r11;
2387     // End pointers are inclusive, and if count is not zero they point
2388     // to the last unit copied:  end_to[0] := end_from[0]
2389 
2390     __ enter(); // required for proper stackwalking of RuntimeStub frame
2391     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2392     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2393 
2394     if (entry != NULL) {
2395       *entry = __ pc();
2396       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2397       BLOCK_COMMENT("Entry:");
2398     }
2399 
2400     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
                                   // r9 is used to save r15_thread
2402     // 'from', 'to' and 'qword_count' are now valid
2403 
2404     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2405     if (dest_uninitialized) {
2406       decorators |= IS_DEST_UNINITIALIZED;
2407     }
2408     if (aligned) {
2409       decorators |= ARRAYCOPY_ALIGNED;
2410     }
2411 
2412     BasicType type = is_oop ? T_OBJECT : T_LONG;
2413     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2414     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2415 
2416     // Copy from low to high addresses.  Use 'to' as scratch.
2417     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2418     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2419     __ negptr(qword_count);
2420     __ jmp(L_copy_bytes);
2421 
2422     // Copy trailing qwords
2423   __ BIND(L_copy_8_bytes);
2424     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2425     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2426     __ increment(qword_count);
2427     __ jcc(Assembler::notZero, L_copy_8_bytes);
2428 
2429     if (is_oop) {
2430       __ jmp(L_exit);
2431     } else {
2432       restore_arg_regs_using_thread();
2433       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2434       __ xorptr(rax, rax); // return 0
2435       __ vzeroupper();
2436       __ leave(); // required for proper stackwalking of RuntimeStub frame
2437       __ ret(0);
2438     }
2439 
    // Copy in multi-byte chunks
2441     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2442 
  __ BIND(L_exit);
2444     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2445     restore_arg_regs_using_thread();
2446     if (is_oop) {
2447       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2448     } else {
2449       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2450     }
2451     __ vzeroupper();
2452     __ xorptr(rax, rax); // return 0
2453     __ leave(); // required for proper stackwalking of RuntimeStub frame
2454     __ ret(0);
2455 
2456     return start;
2457   }
2458 
2459   // Arguments:
2460   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             (used only to set the ARRAYCOPY_ALIGNED decorator)
2462   //   is_oop  - true => oop array, so generate store check code
2463   //   name    - stub name string
2464   //
2465   // Inputs:
2466   //   c_rarg0   - source array address
2467   //   c_rarg1   - destination array address
2468   //   c_rarg2   - element count, treated as ssize_t, can be zero
2469   //
2470   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2471                                           address nooverlap_target, address *entry,
2472                                           const char *name, bool dest_uninitialized = false) {
2473     __ align(CodeEntryAlignment);
2474     StubCodeMark mark(this, "StubRoutines", name);
2475     address start = __ pc();
2476 
2477     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2478     const Register from        = rdi;  // source array address
2479     const Register to          = rsi;  // destination array address
2480     const Register qword_count = rdx;  // elements count
2481     const Register saved_count = rcx;
2482 
2483     __ enter(); // required for proper stackwalking of RuntimeStub frame
2484     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2485 
2486     if (entry != NULL) {
2487       *entry = __ pc();
2488       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2489       BLOCK_COMMENT("Entry:");
2490     }
2491 
2492     array_overlap_test(nooverlap_target, Address::times_8);
2493     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2494                                    // r9 is used to save r15_thread
2495     // 'from', 'to' and 'qword_count' are now valid
2496 
    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2498     if (dest_uninitialized) {
2499       decorators |= IS_DEST_UNINITIALIZED;
2500     }
2501     if (aligned) {
2502       decorators |= ARRAYCOPY_ALIGNED;
2503     }
2504 
2505     BasicType type = is_oop ? T_OBJECT : T_LONG;
2506     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2507     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2508 
2509     __ jmp(L_copy_bytes);
2510 
2511     // Copy trailing qwords
2512   __ BIND(L_copy_8_bytes);
2513     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2514     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2515     __ decrement(qword_count);
2516     __ jcc(Assembler::notZero, L_copy_8_bytes);
2517 
2518     if (is_oop) {
2519       __ jmp(L_exit);
2520     } else {
2521       restore_arg_regs_using_thread();
2522       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2523       __ xorptr(rax, rax); // return 0
2524       __ vzeroupper();
2525       __ leave(); // required for proper stackwalking of RuntimeStub frame
2526       __ ret(0);
2527     }
2528 
    // Copy in multi-byte chunks
2530     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2531 
  __ BIND(L_exit);
2533     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2534     restore_arg_regs_using_thread();
2535     if (is_oop) {
2536       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2537     } else {
2538       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2539     }
2540     __ vzeroupper();
2541     __ xorptr(rax, rax); // return 0
2542     __ leave(); // required for proper stackwalking of RuntimeStub frame
2543     __ ret(0);
2544 
2545     return start;
2546   }
2547 
2548 
2549   // Helper for generating a dynamic type check.
2550   // Smashes no registers.
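  // Illustrative outline (not compiled): the fast path probes the word at
  // sub_klass + super_check_offset and only falls back to a scan of the
  // secondary supers on a miss:
  //
  //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
  //     goto L_success;
  //   if (secondary supers of sub_klass contain super_klass)
  //     goto L_success;
  //   // fall through => L_miss
  //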
2551   void generate_type_check(Register sub_klass,
2552                            Register super_check_offset,
2553                            Register super_klass,
2554                            Label& L_success) {
2555     assert_different_registers(sub_klass, super_check_offset, super_klass);
2556 
2557     BLOCK_COMMENT("type_check:");
2558 
2559     Label L_miss;
2560 
2561     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2562                                      super_check_offset);
2563     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2564 
2565     // Fall through on failure!
2566     __ BIND(L_miss);
2567   }
2568 
2569   //
2570   //  Generate checkcasting array copy stub
2571   //
2572   //  Input:
2573   //    c_rarg0   - source array address
2574   //    c_rarg1   - destination array address
2575   //    c_rarg2   - element count, treated as ssize_t, can be zero
2576   //    c_rarg3   - size_t ckoff (super_check_offset)
2577   // not Win64
2578   //    c_rarg4   - oop ckval (super_klass)
2579   // Win64
2580   //    rsp+40    - oop ckval (super_klass)
2581   //
2582   //  Output:
2583   //    rax ==  0  -  success
2584   //    rax == -1^K - failure, where K is partial transfer count
2585   //
2586   address generate_checkcast_copy(const char *name, address *entry,
2587                                   bool dest_uninitialized = false) {
2588 
2589     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2590 
2591     // Input registers (after setup_arg_regs)
2592     const Register from        = rdi;   // source array address
2593     const Register to          = rsi;   // destination array address
2594     const Register length      = rdx;   // elements count
2595     const Register ckoff       = rcx;   // super_check_offset
2596     const Register ckval       = r8;    // super_klass
2597 
2598     // Registers used as temps (r13, r14 are save-on-entry)
2599     const Register end_from    = from;  // source array end address
2600     const Register end_to      = r13;   // destination array end address
2601     const Register count       = rdx;   // -(count_remaining)
2602     const Register r14_length  = r14;   // saved copy of length
2603     // End pointers are inclusive, and if length is not zero they point
2604     // to the last unit copied:  end_to[0] := end_from[0]
2605 
2606     const Register rax_oop    = rax;    // actual oop copied
2607     const Register r11_klass  = r11;    // oop._klass
2608 
2609     //---------------------------------------------------------------
2610     // Assembler stub will be used for this call to arraycopy
2611     // if the two arrays are subtypes of Object[] but the
2612     // destination array type is not equal to or a supertype
2613     // of the source type.  Each element must be separately
2614     // checked.
2615 
2616     __ align(CodeEntryAlignment);
2617     StubCodeMark mark(this, "StubRoutines", name);
2618     address start = __ pc();
2619 
2620     __ enter(); // required for proper stackwalking of RuntimeStub frame
2621 
2622 #ifdef ASSERT
2623     // caller guarantees that the arrays really are different
2624     // otherwise, we would have to make conjoint checks
2625     { Label L;
2626       array_overlap_test(L, TIMES_OOP);
2627       __ stop("checkcast_copy within a single array");
2628       __ bind(L);
2629     }
2630 #endif //ASSERT
2631 
2632     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2633                        // ckoff => rcx, ckval => r8
2634                        // r9 and r10 may be used to save non-volatile registers
2635 #ifdef _WIN64
2636     // last argument (#4) is on stack on Win64
2637     __ movptr(ckval, Address(rsp, 6 * wordSize));
2638 #endif
2639 
2640     // Caller of this entry point must set up the argument registers.
2641     if (entry != NULL) {
2642       *entry = __ pc();
2643       BLOCK_COMMENT("Entry:");
2644     }
2645 
2646     // allocate spill slots for r13, r14
2647     enum {
2648       saved_r13_offset,
2649       saved_r14_offset,
2650       saved_r10_offset,
2651       saved_rbp_offset
2652     };
2653     __ subptr(rsp, saved_rbp_offset * wordSize);
2654     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2655     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2656     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2657 
2658 #ifdef ASSERT
2659       Label L2;
2660       __ get_thread(r14);
2661       __ cmpptr(r15_thread, r14);
2662       __ jcc(Assembler::equal, L2);
2663       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2664       __ bind(L2);
2665 #endif // ASSERT
2666 
2667     // check that int operands are properly extended to size_t
2668     assert_clean_int(length, rax);
2669     assert_clean_int(ckoff, rax);
2670 
2671 #ifdef ASSERT
2672     BLOCK_COMMENT("assert consistent ckoff/ckval");
2673     // The ckoff and ckval must be mutually consistent,
2674     // even though caller generates both.
2675     { Label L;
2676       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2677       __ cmpl(ckoff, Address(ckval, sco_offset));
2678       __ jcc(Assembler::equal, L);
2679       __ stop("super_check_offset inconsistent");
2680       __ bind(L);
2681     }
2682 #endif //ASSERT
2683 
2684     // Loop-invariant addresses.  They are exclusive end pointers.
2685     Address end_from_addr(from, length, TIMES_OOP, 0);
2686     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2687     // Loop-variant addresses.  They assume post-incremented count < 0.
2688     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2689     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2690 
2691     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2692     if (dest_uninitialized) {
2693       decorators |= IS_DEST_UNINITIALIZED;
2694     }
2695 
2696     BasicType type = T_OBJECT;
2697     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2698     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2699 
2700     // Copy from low to high addresses, indexed from the end of each array.
2701     __ lea(end_from, end_from_addr);
2702     __ lea(end_to,   end_to_addr);
2703     __ movptr(r14_length, length);        // save a copy of the length
2704     assert(length == count, "");          // else fix next line:
2705     __ negptr(count);                     // negate and test the length
2706     __ jcc(Assembler::notZero, L_load_element);
2707 
2708     // Empty array:  Nothing to do.
2709     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2710     __ jmp(L_done);
2711 
2712     // ======== begin loop ========
2713     // (Loop is rotated; its entry is L_load_element.)
2714     // Loop control:
2715     //   for (count = -count; count != 0; count++)
    // Base pointers src, dst are biased by 8*(count-1), to the last element.
2717     __ align(OptoLoopAlignment);
2718 
2719     __ BIND(L_store_element);
2720     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW);  // store the oop
2721     __ increment(count);               // increment the count toward zero
2722     __ jcc(Assembler::zero, L_do_card_marks);
2723 
2724     // ======== loop entry is here ========
2725     __ BIND(L_load_element);
2726     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2727     __ testptr(rax_oop, rax_oop);
2728     __ jcc(Assembler::zero, L_store_element);
2729 
    __ load_klass(r11_klass, rax_oop); // query the object klass
2731     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2732     // ======== end loop ========
2733 
2734     // It was a real error; we must depend on the caller to finish the job.
2735     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2736     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2737     // and report their number to the caller.
2738     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2739     Label L_post_barrier;
2740     __ addptr(r14_length, count);     // K = (original - remaining) oops
2741     __ movptr(rax, r14_length);       // save the value
2742     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2743     __ jccb(Assembler::notZero, L_post_barrier);
2744     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2745 
2746     // Come here on success only.
2747     __ BIND(L_do_card_marks);
2748     __ xorptr(rax, rax);              // return 0 on success
2749 
2750     __ BIND(L_post_barrier);
2751     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2752 
2753     // Common exit point (success or failure).
2754     __ BIND(L_done);
2755     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2756     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2757     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2758     restore_arg_regs();
2759     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2760     __ leave(); // required for proper stackwalking of RuntimeStub frame
2761     __ ret(0);
2762 
2763     return start;
2764   }
2765 
2766   //
2767   //  Generate 'unsafe' array copy stub
2768   //  Though just as safe as the other stubs, it takes an unscaled
2769   //  size_t argument instead of an element count.
2770   //
2771   //  Input:
2772   //    c_rarg0   - source array address
2773   //    c_rarg1   - destination array address
2774   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2775   //
2776   // Examines the alignment of the operands and dispatches
2777   // to a long, int, short, or byte copy loop.
2778   //
2779   address generate_unsafe_copy(const char *name,
2780                                address byte_copy_entry, address short_copy_entry,
2781                                address int_copy_entry, address long_copy_entry) {
2782 
2783     Label L_long_aligned, L_int_aligned, L_short_aligned;
2784 
2785     // Input registers (before setup_arg_regs)
2786     const Register from        = c_rarg0;  // source array address
2787     const Register to          = c_rarg1;  // destination array address
2788     const Register size        = c_rarg2;  // byte count (size_t)
2789 
2790     // Register used as a temp
2791     const Register bits        = rax;      // test copy of low bits
2792 
2793     __ align(CodeEntryAlignment);
2794     StubCodeMark mark(this, "StubRoutines", name);
2795     address start = __ pc();
2796 
2797     __ enter(); // required for proper stackwalking of RuntimeStub frame
2798 
2799     // bump this on entry, not on exit:
2800     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2801 
2802     __ mov(bits, from);
2803     __ orptr(bits, to);
2804     __ orptr(bits, size);
2805 
2806     __ testb(bits, BytesPerLong-1);
2807     __ jccb(Assembler::zero, L_long_aligned);
2808 
2809     __ testb(bits, BytesPerInt-1);
2810     __ jccb(Assembler::zero, L_int_aligned);
2811 
2812     __ testb(bits, BytesPerShort-1);
2813     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2814 
2815     __ BIND(L_short_aligned);
2816     __ shrptr(size, LogBytesPerShort); // size => short_count
2817     __ jump(RuntimeAddress(short_copy_entry));
2818 
2819     __ BIND(L_int_aligned);
2820     __ shrptr(size, LogBytesPerInt); // size => int_count
2821     __ jump(RuntimeAddress(int_copy_entry));
2822 
2823     __ BIND(L_long_aligned);
2824     __ shrptr(size, LogBytesPerLong); // size => qword_count
2825     __ jump(RuntimeAddress(long_copy_entry));
2826 
2827     return start;
2828   }
2829 
2830   // Perform range checks on the proposed arraycopy.
2831   // Kills temp, but nothing else.
2832   // Also, clean the sign bits of src_pos and dst_pos.
2833   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2834                               Register src_pos, // source position (c_rarg1)
2835                               Register dst,     // destination array oop (c_rarg2)
2836                               Register dst_pos, // destination position (c_rarg3)
2837                               Register length,
2838                               Register temp,
2839                               Label& L_failed) {
2840     BLOCK_COMMENT("arraycopy_range_checks:");
2841 
2842     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2843     __ movl(temp, length);
2844     __ addl(temp, src_pos);             // src_pos + length
2845     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2846     __ jcc(Assembler::above, L_failed);
2847 
2848     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2849     __ movl(temp, length);
2850     __ addl(temp, dst_pos);             // dst_pos + length
2851     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2852     __ jcc(Assembler::above, L_failed);
2853 
2854     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2855     // Move with sign extension can be used since they are positive.
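         //   (Illustrative: since src_pos and dst_pos are known non-negative here,
         //    their sign bit is 0 and movslq acts as a zero-extend, clearing any
         //    garbage in bits 63..32.)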
2856     __ movslq(src_pos, src_pos);
2857     __ movslq(dst_pos, dst_pos);
2858 
2859     BLOCK_COMMENT("arraycopy_range_checks done");
2860   }
2861 
2862   //
2863   //  Generate generic array copy stubs
2864   //
2865   //  Input:
2866   //    c_rarg0    -  src oop
2867   //    c_rarg1    -  src_pos (32-bits)
2868   //    c_rarg2    -  dst oop
2869   //    c_rarg3    -  dst_pos (32-bits)
2870   // not Win64
2871   //    c_rarg4    -  element count (32-bits)
2872   // Win64
2873   //    rsp+40     -  element count (32-bits)
2874   //
2875   //  Output:
2876   //    rax ==  0  -  success
2877   //    rax == -1^K - failure, where K is partial transfer count
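       //    (Encoding sketch: rax == ~K == -1 - K, so e.g. a failure after
       //     copying 3 elements returns rax == -4; the caller recovers K as ~rax.)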
2878   //
2879   address generate_generic_copy(const char *name,
2880                                 address byte_copy_entry, address short_copy_entry,
2881                                 address int_copy_entry, address oop_copy_entry,
2882                                 address long_copy_entry, address checkcast_copy_entry) {
2883 
2884     Label L_failed, L_failed_0, L_objArray;
2885     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2886 
2887     // Input registers
2888     const Register src        = c_rarg0;  // source array oop
2889     const Register src_pos    = c_rarg1;  // source position
2890     const Register dst        = c_rarg2;  // destination array oop
2891     const Register dst_pos    = c_rarg3;  // destination position
2892 #ifndef _WIN64
2893     const Register length     = c_rarg4;
2894 #else
2895     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2896 #endif
2897 
2898     { int modulus = CodeEntryAlignment;
2899       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2900       int advance = target - (__ offset() % modulus);
2901       if (advance < 0)  advance += modulus;
2902       if (advance > 0)  __ nop(advance);
2903     }
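         // Worked example of the padding above (assuming CodeEntryAlignment == 32):
         // if __ offset() % 32 == 20, then target == 27 and advance == 7, so seven
         // nop bytes place the 5-byte jmp at offsets 27..31 and leave the stub
         // entry on the next 32-byte boundary.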
2904     StubCodeMark mark(this, "StubRoutines", name);
2905 
2906     // Short-hop target to L_failed.  Makes for denser prologue code.
2907     __ BIND(L_failed_0);
2908     __ jmp(L_failed);
2909     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2910 
2911     __ align(CodeEntryAlignment);
2912     address start = __ pc();
2913 
2914     __ enter(); // required for proper stackwalking of RuntimeStub frame
2915 
2916     // bump this on entry, not on exit:
2917     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2918 
2919     //-----------------------------------------------------------------------
2920     // Assembler stub will be used for this call to arraycopy
2921     // if the following conditions are met:
2922     //
2923     // (1) src and dst must not be null.
2924     // (2) src_pos must not be negative.
2925     // (3) dst_pos must not be negative.
2926     // (4) length  must not be negative.
2927     // (5) src klass and dst klass should be the same and not NULL.
2928     // (6) src and dst should be arrays.
2929     // (7) src_pos + length must not exceed length of src.
2930     // (8) dst_pos + length must not exceed length of dst.
2931     //
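         //
         // Java-level equivalent of the guards below (illustrative sketch only):
         //
         //   if (src == null || dst == null) return -1;
         //   if (src_pos < 0 || dst_pos < 0 || length < 0) return -1;
         //   if (src.getClass() != dst.getClass() || !src.getClass().isArray()) return -1;
         //   if (src_pos + length > src.length || dst_pos + length > dst.length) return -1;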
2932 
2933     //  if (src == NULL) return -1;
2934     __ testptr(src, src);         // src oop
2935     size_t j1off = __ offset();
2936     __ jccb(Assembler::zero, L_failed_0);
2937 
2938     //  if (src_pos < 0) return -1;
2939     __ testl(src_pos, src_pos); // src_pos (32-bits)
2940     __ jccb(Assembler::negative, L_failed_0);
2941 
2942     //  if (dst == NULL) return -1;
2943     __ testptr(dst, dst);         // dst oop
2944     __ jccb(Assembler::zero, L_failed_0);
2945 
2946     //  if (dst_pos < 0) return -1;
2947     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2948     size_t j4off = __ offset();
2949     __ jccb(Assembler::negative, L_failed_0);
2950 
2951     // The first four tests are very dense code,
2952     // but not quite dense enough to put four
2953     // jumps in a 16-byte instruction fetch buffer.
2954     // That's good, because some branch predictors
2955     // do not like jumps so close together.
2956     // Make sure of this.
2957     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2958 
2959     // registers used as temp
2960     const Register r11_length    = r11; // elements count to copy
2961     const Register r10_src_klass = r10; // array klass
2962 
2963     //  if (length < 0) return -1;
2964     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2965     __ testl(r11_length, r11_length);
2966     __ jccb(Assembler::negative, L_failed_0);
2967 
2968     __ load_klass(r10_src_klass, src);
2969 #ifdef ASSERT
2970     //  assert(src->klass() != NULL);
2971     {
2972       BLOCK_COMMENT("assert klasses not null {");
2973       Label L1, L2;
2974       __ testptr(r10_src_klass, r10_src_klass);
2975       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2976       __ bind(L1);
2977       __ stop("broken null klass");
2978       __ bind(L2);
2979       __ load_klass(rax, dst);
2980       __ cmpq(rax, 0);
2981       __ jcc(Assembler::equal, L1);     // this would be broken also
2982       BLOCK_COMMENT("} assert klasses not null done");
2983     }
2984 #endif
2985 
2986     // Load layout helper (32-bits)
2987     //
2988     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2989     // 32        30    24            16              8     2                 0
2990     //
2991     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2992     //
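         // Example decode (hypothetical value): lh == 0xC0100A02 would mean
         // array_tag 0x3 (typeArray), header_size 0x10, element_type 0x0a (T_INT)
         // and log2_element_size 2: a 16-byte header followed by 4-byte elements.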
2993 
2994     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2995 
2996     // Handle objArrays completely differently...
2997     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2998     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2999     __ jcc(Assembler::equal, L_objArray);
3000 
3001     //  if (src->klass() != dst->klass()) return -1;
3002     __ load_klass(rax, dst);
3003     __ cmpq(r10_src_klass, rax);
3004     __ jcc(Assembler::notEqual, L_failed);
3005 
3006     const Register rax_lh = rax;  // layout helper
3007     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3008 
3009     //  if (!src->is_Array()) return -1;
3010     __ cmpl(rax_lh, Klass::_lh_neutral_value);
3011     __ jcc(Assembler::greaterEqual, L_failed);
3012 
3013     // At this point, it is known to be a typeArray (array_tag 0x3).
3014 #ifdef ASSERT
3015     {
3016       BLOCK_COMMENT("assert primitive array {");
3017       Label L;
3018       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3019       __ jcc(Assembler::greaterEqual, L);
3020       __ stop("must be a primitive array");
3021       __ bind(L);
3022       BLOCK_COMMENT("} assert primitive array done");
3023     }
3024 #endif
3025 
3026     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3027                            r10, L_failed);
3028 
3029     // TypeArrayKlass
3030     //
3031     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3032     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3033     //
3034 
3035     const Register r10_offset = r10;    // array offset
3036     const Register rax_elsize = rax_lh; // element size
3037 
3038     __ movl(r10_offset, rax_lh);
3039     __ shrl(r10_offset, Klass::_lh_header_size_shift);
3040     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3041     __ addptr(src, r10_offset);           // src array offset
3042     __ addptr(dst, r10_offset);           // dst array offset
3043     BLOCK_COMMENT("choose copy loop based on element size");
3044     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3045 
3046     // next registers should be set before the jump to corresponding stub
3047     const Register from     = c_rarg0;  // source array address
3048     const Register to       = c_rarg1;  // destination array address
3049     const Register count    = c_rarg2;  // elements count
3050 
3051     // 'from', 'to', 'count' registers should be set in such order
3052     // since they are the same as 'src', 'src_pos', 'dst'.
3053 
3054   __ BIND(L_copy_bytes);
3055     __ cmpl(rax_elsize, 0);
3056     __ jccb(Assembler::notEqual, L_copy_shorts);
3057     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3058     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3059     __ movl2ptr(count, r11_length); // length
3060     __ jump(RuntimeAddress(byte_copy_entry));
3061 
3062   __ BIND(L_copy_shorts);
3063     __ cmpl(rax_elsize, LogBytesPerShort);
3064     __ jccb(Assembler::notEqual, L_copy_ints);
3065     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3066     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3067     __ movl2ptr(count, r11_length); // length
3068     __ jump(RuntimeAddress(short_copy_entry));
3069 
3070   __ BIND(L_copy_ints);
3071     __ cmpl(rax_elsize, LogBytesPerInt);
3072     __ jccb(Assembler::notEqual, L_copy_longs);
3073     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3074     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3075     __ movl2ptr(count, r11_length); // length
3076     __ jump(RuntimeAddress(int_copy_entry));
3077 
3078   __ BIND(L_copy_longs);
3079 #ifdef ASSERT
3080     {
3081       BLOCK_COMMENT("assert long copy {");
3082       Label L;
3083       __ cmpl(rax_elsize, LogBytesPerLong);
3084       __ jcc(Assembler::equal, L);
3085       __ stop("must be long copy, but elsize is wrong");
3086       __ bind(L);
3087       BLOCK_COMMENT("} assert long copy done");
3088     }
3089 #endif
3090     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3091     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3092     __ movl2ptr(count, r11_length); // length
3093     __ jump(RuntimeAddress(long_copy_entry));
3094 
3095     // ObjArrayKlass
3096   __ BIND(L_objArray);
3097     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3098 
3099     Label L_plain_copy, L_checkcast_copy;
3100     //  test array classes for subtyping
3101     __ load_klass(rax, dst);
3102     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3103     __ jcc(Assembler::notEqual, L_checkcast_copy);
3104 
3105     // Identically typed arrays can be copied without element-wise checks.
3106     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3107                            r10, L_failed);
3108 
3109     __ lea(from, Address(src, src_pos, TIMES_OOP,
3110                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3111     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3112                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3113     __ movl2ptr(count, r11_length); // length
3114   __ BIND(L_plain_copy);
3115     __ jump(RuntimeAddress(oop_copy_entry));
3116 
3117   __ BIND(L_checkcast_copy);
3118     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3119     {
3120       // Before looking at dst.length, make sure dst is also an objArray.
3121       __ cmpl(Address(rax, lh_offset), objArray_lh);
3122       __ jcc(Assembler::notEqual, L_failed);
3123 
3124       // It is safe to examine both src.length and dst.length.
3125       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3126                              rax, L_failed);
3127 
3128       const Register r11_dst_klass = r11;
3129       __ load_klass(r11_dst_klass, dst); // reload
3130 
3131       // Marshal the base address arguments now, freeing registers.
3132       __ lea(from, Address(src, src_pos, TIMES_OOP,
3133                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3134       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3135                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3136       __ movl(count, length);           // length (reloaded)
3137       Register sco_temp = c_rarg3;      // this register is free now
3138       assert_different_registers(from, to, count, sco_temp,
3139                                  r11_dst_klass, r10_src_klass);
3140       assert_clean_int(count, sco_temp);
3141 
3142       // Generate the type check.
3143       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3144       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3145       assert_clean_int(sco_temp, rax);
3146       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3147 
3148       // Fetch destination element klass from the ObjArrayKlass header.
3149       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3150       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3151       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3152       assert_clean_int(sco_temp, rax);
3153 
3154       // the checkcast_copy loop needs two extra arguments:
3155       assert(c_rarg3 == sco_temp, "#3 already in place");
3156       // Set up arguments for checkcast_copy_entry.
3157       setup_arg_regs(4);
3158       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3159       __ jump(RuntimeAddress(checkcast_copy_entry));
3160     }
3161 
3162   __ BIND(L_failed);
3163     __ xorptr(rax, rax);
3164     __ notptr(rax); // return -1
3165     __ leave();   // required for proper stackwalking of RuntimeStub frame
3166     __ ret(0);
3167 
3168     return start;
3169   }
3170 
3171   void generate_arraycopy_stubs() {
3172     address entry;
3173     address entry_jbyte_arraycopy;
3174     address entry_jshort_arraycopy;
3175     address entry_jint_arraycopy;
3176     address entry_oop_arraycopy;
3177     address entry_jlong_arraycopy;
3178     address entry_checkcast_arraycopy;
3179 
3180     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3181                                                                            "jbyte_disjoint_arraycopy");
3182     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3183                                                                            "jbyte_arraycopy");
3184 
3185     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3186                                                                             "jshort_disjoint_arraycopy");
3187     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3188                                                                             "jshort_arraycopy");
3189 
3190     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3191                                                                               "jint_disjoint_arraycopy");
3192     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3193                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3194 
3195     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3196                                                                                "jlong_disjoint_arraycopy");
3197     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3198                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3199 
3200 
3201     if (UseCompressedOops) {
3202       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3203                                                                               "oop_disjoint_arraycopy");
3204       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3205                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3206       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3207                                                                                      "oop_disjoint_arraycopy_uninit",
3208                                                                                      /*dest_uninitialized*/true);
3209       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3210                                                                                      NULL, "oop_arraycopy_uninit",
3211                                                                                      /*dest_uninitialized*/true);
3212     } else {
3213       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3214                                                                                "oop_disjoint_arraycopy");
3215       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3216                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3217       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3218                                                                                       "oop_disjoint_arraycopy_uninit",
3219                                                                                       /*dest_uninitialized*/true);
3220       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3221                                                                                       NULL, "oop_arraycopy_uninit",
3222                                                                                       /*dest_uninitialized*/true);
3223     }
3224 
3225     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3226     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3227                                                                         /*dest_uninitialized*/true);
3228 
3229     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3230                                                               entry_jbyte_arraycopy,
3231                                                               entry_jshort_arraycopy,
3232                                                               entry_jint_arraycopy,
3233                                                               entry_jlong_arraycopy);
3234     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3235                                                                entry_jbyte_arraycopy,
3236                                                                entry_jshort_arraycopy,
3237                                                                entry_jint_arraycopy,
3238                                                                entry_oop_arraycopy,
3239                                                                entry_jlong_arraycopy,
3240                                                                entry_checkcast_arraycopy);
3241 
3242     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3243     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3244     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3245     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3246     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3247     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3248 
3249     // We don't generate specialized code for HeapWord-aligned source
3250     // arrays, so just use the code we've already generated
3251     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3252     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3253 
3254     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3255     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3256 
3257     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3258     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3259 
3260     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3261     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3262 
3263     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3264     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3265 
3266     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3267     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3268   }
3269 
3270   // AES intrinsic stubs
3271   enum {AESBlockSize = 16};
3272 
3273   address generate_key_shuffle_mask() {
3274     __ align(16);
3275     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3276     address start = __ pc();
3277     __ emit_data64( 0x0405060700010203, relocInfo::none );
3278     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3279     return start;
3280   }
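       // (Note: applied with pshufb, the mask above reverses the byte order
       //  within each 32-bit word, i.e. bytes 0..3 are selected as 3,2,1,0;
       //  this converts each int of the Java-expanded key to the byte order
       //  the AES instructions expect.)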
3281 
3282   address generate_counter_shuffle_mask() {
3283     __ align(16);
3284     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3285     address start = __ pc();
3286     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3287     __ emit_data64(0x0001020304050607, relocInfo::none);
3288     return start;
3289   }
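       // (Note: applied with pshufb, the mask above reverses all 16 bytes of
       //  the register, converting the big-endian CTR counter block to
       //  little-endian qwords for the increment arithmetic and back again.)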
3290 
3291   // Utility routine for loading a 128-bit key word in little-endian format;
3292   // can optionally specify that the shuffle mask is already in an XMM register.
3293   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3294     __ movdqu(xmmdst, Address(key, offset));
3295     if (xmm_shuf_mask != NULL) {
3296       __ pshufb(xmmdst, xmm_shuf_mask);
3297     } else {
3298       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3299     }
3300   }
3301 
3302   // Utility routine to increment the 128-bit counter (the iv in CTR mode).
3303   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3304     __ pextrq(reg, xmmdst, 0x0);
3305     __ addq(reg, inc_delta);
3306     __ pinsrq(xmmdst, reg, 0x0);
3307     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3308     __ pextrq(reg, xmmdst, 0x01); // Carry
3309     __ addq(reg, 0x01);
3310     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3311     __ BIND(next_block);          // next instruction
3312   }
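       // Scalar sketch of the above (illustrative only):
       //   counter.lo += inc_delta;
       //   if (no carry out of the low qword) goto next_block;
       //   counter.hi += 1;
       // pextrq/pinsrq do not modify EFLAGS, so the jcc still observes the
       // carry flag produced by the addq.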
3313 
3314   // Arguments:
3315   //
3316   // Inputs:
3317   //   c_rarg0   - source byte array address
3318   //   c_rarg1   - destination byte array address
3319   //   c_rarg2   - K (key) in little endian int array
3320   //
3321   address generate_aescrypt_encryptBlock() {
3322     assert(UseAES, "need AES instructions and misaligned SSE support");
3323     __ align(CodeEntryAlignment);
3324     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3325     Label L_doLast;
3326     address start = __ pc();
3327 
3328     const Register from        = c_rarg0;  // source array address
3329     const Register to          = c_rarg1;  // destination array address
3330     const Register key         = c_rarg2;  // key array address
3331     const Register keylen      = rax;
3332 
3333     const XMMRegister xmm_result = xmm0;
3334     const XMMRegister xmm_key_shuf_mask = xmm1;
3335     // On win64 xmm6-xmm15 must be preserved so don't use them.
3336     const XMMRegister xmm_temp1  = xmm2;
3337     const XMMRegister xmm_temp2  = xmm3;
3338     const XMMRegister xmm_temp3  = xmm4;
3339     const XMMRegister xmm_temp4  = xmm5;
3340 
3341     __ enter(); // required for proper stackwalking of RuntimeStub frame
3342 
3343     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
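         // (i.e. 11/13/15 round keys of 4 ints each for AES-128/192/256,
         //  which use 10/12/14 rounds plus the initial whitening key)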
3344     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3345 
3346     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3347     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3348 
3349     // For encryption, the Java expanded key ordering is just what we need.
3350     // We don't know if the key is aligned, hence we do not use the load-execute form.
3351 
3352     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3353     __ pxor(xmm_result, xmm_temp1);
3354 
3355     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3356     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3357     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3358     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3359 
3360     __ aesenc(xmm_result, xmm_temp1);
3361     __ aesenc(xmm_result, xmm_temp2);
3362     __ aesenc(xmm_result, xmm_temp3);
3363     __ aesenc(xmm_result, xmm_temp4);
3364 
3365     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3366     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3367     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3368     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3369 
3370     __ aesenc(xmm_result, xmm_temp1);
3371     __ aesenc(xmm_result, xmm_temp2);
3372     __ aesenc(xmm_result, xmm_temp3);
3373     __ aesenc(xmm_result, xmm_temp4);
3374 
3375     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3376     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3377 
3378     __ cmpl(keylen, 44);
3379     __ jccb(Assembler::equal, L_doLast);
3380 
3381     __ aesenc(xmm_result, xmm_temp1);
3382     __ aesenc(xmm_result, xmm_temp2);
3383 
3384     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3385     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3386 
3387     __ cmpl(keylen, 52);
3388     __ jccb(Assembler::equal, L_doLast);
3389 
3390     __ aesenc(xmm_result, xmm_temp1);
3391     __ aesenc(xmm_result, xmm_temp2);
3392 
3393     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3394     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3395 
3396     __ BIND(L_doLast);
3397     __ aesenc(xmm_result, xmm_temp1);
3398     __ aesenclast(xmm_result, xmm_temp2);
3399     __ movdqu(Address(to, 0), xmm_result);        // store the result
3400     __ xorptr(rax, rax); // return 0
3401     __ leave(); // required for proper stackwalking of RuntimeStub frame
3402     __ ret(0);
3403 
3404     return start;
3405   }
3406 
3407 
3408   // Arguments:
3409   //
3410   // Inputs:
3411   //   c_rarg0   - source byte array address
3412   //   c_rarg1   - destination byte array address
3413   //   c_rarg2   - K (key) in little endian int array
3414   //
3415   address generate_aescrypt_decryptBlock() {
3416     assert(UseAES, "need AES instructions and misaligned SSE support");
3417     __ align(CodeEntryAlignment);
3418     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3419     Label L_doLast;
3420     address start = __ pc();
3421 
3422     const Register from        = c_rarg0;  // source array address
3423     const Register to          = c_rarg1;  // destination array address
3424     const Register key         = c_rarg2;  // key array address
3425     const Register keylen      = rax;
3426 
3427     const XMMRegister xmm_result = xmm0;
3428     const XMMRegister xmm_key_shuf_mask = xmm1;
3429     // On win64 xmm6-xmm15 must be preserved so don't use them.
3430     const XMMRegister xmm_temp1  = xmm2;
3431     const XMMRegister xmm_temp2  = xmm3;
3432     const XMMRegister xmm_temp3  = xmm4;
3433     const XMMRegister xmm_temp4  = xmm5;
3434 
3435     __ enter(); // required for proper stackwalking of RuntimeStub frame
3436 
3437     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3438     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3439 
3440     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3441     __ movdqu(xmm_result, Address(from, 0));
3442 
3443     // For decryption, the Java expanded key ordering is rotated one position from what we want,
3444     // so we start from 0x10 here and hit 0x00 last.
3445     // We don't know if the key is aligned, hence we do not use the load-execute form.
3446     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3447     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3448     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3449     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3450 
3451     __ pxor  (xmm_result, xmm_temp1);
3452     __ aesdec(xmm_result, xmm_temp2);
3453     __ aesdec(xmm_result, xmm_temp3);
3454     __ aesdec(xmm_result, xmm_temp4);
3455 
3456     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3457     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3458     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3459     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3460 
3461     __ aesdec(xmm_result, xmm_temp1);
3462     __ aesdec(xmm_result, xmm_temp2);
3463     __ aesdec(xmm_result, xmm_temp3);
3464     __ aesdec(xmm_result, xmm_temp4);
3465 
3466     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3467     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3468     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3469 
3470     __ cmpl(keylen, 44);
3471     __ jccb(Assembler::equal, L_doLast);
3472 
3473     __ aesdec(xmm_result, xmm_temp1);
3474     __ aesdec(xmm_result, xmm_temp2);
3475 
3476     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3477     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3478 
3479     __ cmpl(keylen, 52);
3480     __ jccb(Assembler::equal, L_doLast);
3481 
3482     __ aesdec(xmm_result, xmm_temp1);
3483     __ aesdec(xmm_result, xmm_temp2);
3484 
3485     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3486     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3487 
3488     __ BIND(L_doLast);
3489     __ aesdec(xmm_result, xmm_temp1);
3490     __ aesdec(xmm_result, xmm_temp2);
3491 
3492     // for decryption the aesdeclast operation is always on key+0x00
3493     __ aesdeclast(xmm_result, xmm_temp3);
3494     __ movdqu(Address(to, 0), xmm_result);  // store the result
3495     __ xorptr(rax, rax); // return 0
3496     __ leave(); // required for proper stackwalking of RuntimeStub frame
3497     __ ret(0);
3498 
3499     return start;
3500   }
3501 
3502 
3503   // Arguments:
3504   //
3505   // Inputs:
3506   //   c_rarg0   - source byte array address
3507   //   c_rarg1   - destination byte array address
3508   //   c_rarg2   - K (key) in little endian int array
3509   //   c_rarg3   - r vector byte array address
3510   //   c_rarg4   - input length
3511   //
3512   // Output:
3513   //   rax       - input length
3514   //
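       //
       //  CBC encryption is inherently serial: each block is chained through
       //  the running r vector (sketch, not generated code):
       //
       //    for (int i = 0; i < len; i += 16) {
       //      r = AES_encrypt(in[i..i+15] ^ r, key);
       //      out[i..i+15] = r;
       //    }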
3515   address generate_cipherBlockChaining_encryptAESCrypt() {
3516     assert(UseAES, "need AES instructions and misaligned SSE support");
3517     __ align(CodeEntryAlignment);
3518     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3519     address start = __ pc();
3520 
3521     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3522     const Register from        = c_rarg0;  // source array address
3523     const Register to          = c_rarg1;  // destination array address
3524     const Register key         = c_rarg2;  // key array address
3525     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3526                                            // and left with the results of the last encryption block
3527 #ifndef _WIN64
3528     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3529 #else
3530     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3531     const Register len_reg     = r11;      // pick the volatile windows register
3532 #endif
3533     const Register pos         = rax;
3534 
3535     // xmm register assignments for the loops below
3536     const XMMRegister xmm_result = xmm0;
3537     const XMMRegister xmm_temp   = xmm1;
3538     // keys 0-10 preloaded into xmm2-xmm12
3539     const int XMM_REG_NUM_KEY_FIRST = 2;
3540     const int XMM_REG_NUM_KEY_LAST  = 15;
3541     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3542     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3543     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3544     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3545     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3546 
3547     __ enter(); // required for proper stackwalking of RuntimeStub frame
3548 
3549 #ifdef _WIN64
3550     // on win64, fill len_reg from stack position
3551     __ movl(len_reg, len_mem);
3552 #else
3553     __ push(len_reg); // Save
3554 #endif
3555 
3556     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3557     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3558     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3559     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3560       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3561       offset += 0x10;
3562     }
3563     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3564 
3565     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array: 44=128, 52=192, or 60=256)
3566     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3567     __ cmpl(rax, 44);
3568     __ jcc(Assembler::notEqual, L_key_192_256);
3569 
3570     // 128 bit code follows here
3571     __ movptr(pos, 0);
3572     __ align(OptoLoopAlignment);
3573 
3574     __ BIND(L_loopTop_128);
3575     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3576     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3577     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3578     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3579       __ aesenc(xmm_result, as_XMMRegister(rnum));
3580     }
3581     __ aesenclast(xmm_result, xmm_key10);
3582     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3583     // no need to store r to memory until we exit
3584     __ addptr(pos, AESBlockSize);
3585     __ subptr(len_reg, AESBlockSize);
3586     __ jcc(Assembler::notEqual, L_loopTop_128);
3587 
3588     __ BIND(L_exit);
3589     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3590 
3591 #ifdef _WIN64
3592     __ movl(rax, len_mem);
3593 #else
3594     __ pop(rax); // return length
3595 #endif
3596     __ leave(); // required for proper stackwalking of RuntimeStub frame
3597     __ ret(0);
3598 
3599     __ BIND(L_key_192_256);
3600     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3601     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3602     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3603     __ cmpl(rax, 52);
3604     __ jcc(Assembler::notEqual, L_key_256);
3605 
3606     // 192-bit code follows here (could be changed to use more xmm registers)
3607     __ movptr(pos, 0);
3608     __ align(OptoLoopAlignment);
3609 
3610     __ BIND(L_loopTop_192);
3611     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3612     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3613     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3614     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3615       __ aesenc(xmm_result, as_XMMRegister(rnum));
3616     }
3617     __ aesenclast(xmm_result, xmm_key12);
3618     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3619     // no need to store r to memory until we exit
3620     __ addptr(pos, AESBlockSize);
3621     __ subptr(len_reg, AESBlockSize);
3622     __ jcc(Assembler::notEqual, L_loopTop_192);
3623     __ jmp(L_exit);
3624 
3625     __ BIND(L_key_256);
3626     // 256-bit code follows here (could be changed to use more xmm registers)
3627     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3628     __ movptr(pos, 0);
3629     __ align(OptoLoopAlignment);
3630 
3631     __ BIND(L_loopTop_256);
3632     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3633     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3634     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3635     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3636       __ aesenc(xmm_result, as_XMMRegister(rnum));
3637     }
3638     load_key(xmm_temp, key, 0xe0);
3639     __ aesenclast(xmm_result, xmm_temp);
3640     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3641     // no need to store r to memory until we exit
3642     __ addptr(pos, AESBlockSize);
3643     __ subptr(len_reg, AESBlockSize);
3644     __ jcc(Assembler::notEqual, L_loopTop_256);
3645     __ jmp(L_exit);
3646 
3647     return start;
3648   }
3649 
3650   // Safefetch stubs.
3651   void generate_safefetch(const char* name, int size, address* entry,
3652                           address* fault_pc, address* continuation_pc) {
3653     // safefetch signatures:
3654     //   int      SafeFetch32(int*      adr, int      errValue);
3655     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3656     //
3657     // arguments:
3658     //   c_rarg0 = adr
3659     //   c_rarg1 = errValue
3660     //
3661     // result:
3662     //   rax      = *adr or errValue
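         //
         // If the load at *fault_pc faults, the VM's signal handler resumes
         // execution at *continuation_pc with errValue still in c_rarg1, so
         // the stub returns errValue instead of crashing (the handler's use
         // of these saved pcs lives in platform runtime code).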
3663 
3664     StubCodeMark mark(this, "StubRoutines", name);
3665 
3666     // Entry point, pc or function descriptor.
3667     *entry = __ pc();
3668 
3669     // Load *adr into c_rarg1, may fault.
3670     *fault_pc = __ pc();
3671     switch (size) {
3672       case 4:
3673         // int32_t
3674         __ movl(c_rarg1, Address(c_rarg0, 0));
3675         break;
3676       case 8:
3677         // int64_t
3678         __ movq(c_rarg1, Address(c_rarg0, 0));
3679         break;
3680       default:
3681         ShouldNotReachHere();
3682     }
3683 
3684     // return errValue or *adr
3685     *continuation_pc = __ pc();
3686     __ movq(rax, c_rarg1);
3687     __ ret(0);
3688   }
3689 
3690   // This is a version of CBC/AES Decrypt which does 4 blocks at a time in a loop
3691   // to hide instruction latency
3692   //
3693   // Arguments:
3694   //
3695   // Inputs:
3696   //   c_rarg0   - source byte array address
3697   //   c_rarg1   - destination byte array address
3698   //   c_rarg2   - K (key) in little endian int array
3699   //   c_rarg3   - r vector byte array address
3700   //   c_rarg4   - input length
3701   //
3702   // Output:
3703   //   rax       - input length
3704   //
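       //
       //  Unlike CBC encryption, CBC decryption is parallelizable: each
       //  plaintext block depends on only two ciphertext blocks (sketch,
       //  not generated code):
       //
       //    plain[i] = AES_decrypt(cipher[i], key) ^ cipher[i-1]  // cipher[-1] == rvec
       //
       //  so the main loop below keeps four independent blocks in flight.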
3705   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3706     assert(UseAES, "need AES instructions and misaligned SSE support");
3707     __ align(CodeEntryAlignment);
3708     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3709     address start = __ pc();
3710 
3711     const Register from        = c_rarg0;  // source array address
3712     const Register to          = c_rarg1;  // destination array address
3713     const Register key         = c_rarg2;  // key array address
3714     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3715                                            // and left with the results of the last encryption block
3716 #ifndef _WIN64
3717     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3718 #else
3719     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3720     const Register len_reg     = r11;      // pick the volatile windows register
3721 #endif
3722     const Register pos         = rax;
3723 
3724     const int PARALLEL_FACTOR = 4;
3725     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3726 
3727     Label L_exit;
3728     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3729     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3730     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3731     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3732     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3733 
3734     // keys 0-10 preloaded into xmm5-xmm15
3735     const int XMM_REG_NUM_KEY_FIRST = 5;
3736     const int XMM_REG_NUM_KEY_LAST  = 15;
3737     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3738     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3739 
3740     __ enter(); // required for proper stackwalking of RuntimeStub frame
3741 
3742 #ifdef _WIN64
3743     // on win64, fill len_reg from stack position
3744     __ movl(len_reg, len_mem);
3745 #else
3746     __ push(len_reg); // Save
3747 #endif
3748     __ push(rbx);
3749     // The Java expanded key ordering is rotated one position from what we want,
3750     // so we start from 0x10 here and hit 0x00 last.
3751     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3752     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3753     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3754     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3755       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3756       offset += 0x10;
3757     }
3758     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3759 
3760     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3761 
3762     // registers holding the four results in the parallelized loop
3763     const XMMRegister xmm_result0 = xmm0;
3764     const XMMRegister xmm_result1 = xmm2;
3765     const XMMRegister xmm_result2 = xmm3;
3766     const XMMRegister xmm_result3 = xmm4;
3767 
3768     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3769 
3770     __ xorptr(pos, pos);
3771 
3772     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3773     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3774     __ cmpl(rbx, 52);
3775     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3776     __ cmpl(rbx, 60);
3777     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3778 
3779 #define DoFour(opc, src_reg)           \
3780   __ opc(xmm_result0, src_reg);         \
3781   __ opc(xmm_result1, src_reg);         \
3782   __ opc(xmm_result2, src_reg);         \
3783   __ opc(xmm_result3, src_reg);         \
3784 
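         // (DoFour applies opc to all four in-flight blocks, creating four
         //  independent dependency chains so the multi-cycle latency of one
         //  aesdec is hidden behind the other three.)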
3785     for (int k = 0; k < 3; ++k) {
3786       __ BIND(L_multiBlock_loopTopHead[k]);
3787       if (k != 0) {
3788         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3789         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3790       }
3791       if (k == 1) {
3792         __ subptr(rsp, 6 * wordSize);
3793         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3794         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3795         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3796         load_key(xmm1, key, 0xc0);  // 0xc0;
3797         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3798       } else if (k == 2) {
3799         __ subptr(rsp, 10 * wordSize);
3800         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3801         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3802         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3803         load_key(xmm1, key, 0xe0);  // 0xe0;
3804         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3805         load_key(xmm15, key, 0xb0); // 0xb0;
3806         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3807         load_key(xmm1, key, 0xc0);  // 0xc0;
3808         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3809       }
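           // (Stack layout sketch after the spills above, offsets in words:
           //    k == 1: [rsp+0] last key (key+0x00), [rsp+2] key+0xb0, [rsp+4] key+0xc0
           //    k == 2: as for k == 1, plus [rsp+6] key+0xd0, [rsp+8] key+0xe0)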
3810       __ align(OptoLoopAlignment);
3811       __ BIND(L_multiBlock_loopTop[k]);
3812       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3813       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3814 
3815       if  (k != 0) {
3816         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3817         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3818       }
3819 
3820       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
3821       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3822       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3823       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3824 
3825       DoFour(pxor, xmm_key_first);
3826       if (k == 0) {
3827         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3828           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3829         }
3830         DoFour(aesdeclast, xmm_key_last);
3831       } else if (k == 1) {
3832         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3833           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3834         }
3835         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3836         DoFour(aesdec, xmm1);  // key : 0xc0
3837         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3838         DoFour(aesdeclast, xmm_key_last);
3839       } else if (k == 2) {
3840         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3841           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3842         }
3843         DoFour(aesdec, xmm1);  // key : 0xc0
3844         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3845         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3846         DoFour(aesdec, xmm15);  // key : 0xd0
3847         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3848         DoFour(aesdec, xmm1);  // key : 0xe0
3849         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3850         DoFour(aesdeclast, xmm_key_last);
3851       }
3852 
3853       // for each result, xor with the r vector of previous cipher block
3854       __ pxor(xmm_result0, xmm_prev_block_cipher);
3855       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3856       __ pxor(xmm_result1, xmm_prev_block_cipher);
3857       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3858       __ pxor(xmm_result2, xmm_prev_block_cipher);
3859       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3860       __ pxor(xmm_result3, xmm_prev_block_cipher);
3861       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
3862       if (k != 0) {
3863         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3864       }
3865 
3866       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3867       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3868       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3869       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3870 
3871       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3872       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3873       __ jmp(L_multiBlock_loopTop[k]);
3874 
3875       // registers used in the non-parallelized loops
3876       // xmm register assignments for the loops below
3877       const XMMRegister xmm_result = xmm0;
3878       const XMMRegister xmm_prev_block_cipher_save = xmm2;
3879       const XMMRegister xmm_key11 = xmm3;
3880       const XMMRegister xmm_key12 = xmm4;
3881       const XMMRegister key_tmp = xmm4;
3882 
3883       __ BIND(L_singleBlock_loopTopHead[k]);
3884       if (k == 1) {
3885         __ addptr(rsp, 6 * wordSize);
3886       } else if (k == 2) {
3887         __ addptr(rsp, 10 * wordSize);
3888       }
3889       __ cmpptr(len_reg, 0); // any blocks left??
3890       __ jcc(Assembler::equal, L_exit);
3891       __ BIND(L_singleBlock_loopTopHead2[k]);
3892       if (k == 1) {
3893         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3894         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3895       }
3896       if (k == 2) {
3897         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3898       }
3899       __ align(OptoLoopAlignment);
3900       __ BIND(L_singleBlock_loopTop[k]);
3901       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3902       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3903       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3904       for (int rnum = 1; rnum <= 9 ; rnum++) {
3905           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3906       }
3907       if (k == 1) {
3908         __ aesdec(xmm_result, xmm_key11);
3909         __ aesdec(xmm_result, xmm_key12);
3910       }
3911       if (k == 2) {
3912         __ aesdec(xmm_result, xmm_key11);
3913         load_key(key_tmp, key, 0xc0);
3914         __ aesdec(xmm_result, key_tmp);
3915         load_key(key_tmp, key, 0xd0);
3916         __ aesdec(xmm_result, key_tmp);
3917         load_key(key_tmp, key, 0xe0);
3918         __ aesdec(xmm_result, key_tmp);
3919       }
3920 
3921       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3922       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3923       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3924       // no need to store r to memory until we exit
3925       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3926       __ addptr(pos, AESBlockSize);
3927       __ subptr(len_reg, AESBlockSize);
3928       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3929       if (k != 2) {
3930         __ jmp(L_exit);
3931       }
3932     } //for 128/192/256
3933 
3934     __ BIND(L_exit);
3935     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3936     __ pop(rbx);
3937 #ifdef _WIN64
3938     __ movl(rax, len_mem);
3939 #else
3940     __ pop(rax); // return length
3941 #endif
3942     __ leave(); // required for proper stackwalking of RuntimeStub frame
3943     __ ret(0);
3944     return start;
3945   }
3946 
3947   address generate_upper_word_mask() {
3948     __ align(64);
3949     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3950     address start = __ pc();
3951     __ emit_data64(0x0000000000000000, relocInfo::none);
3952     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3953     return start;
3954   }
3955 
3956   address generate_shuffle_byte_flip_mask() {
3957     __ align(64);
3958     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3959     address start = __ pc();
3960     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3961     __ emit_data64(0x0001020304050607, relocInfo::none);
3962     return start;
3963   }
3964 
3965   // ofs and limit are used for the multi-block byte array.
3966   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3967   address generate_sha1_implCompress(bool multi_block, const char *name) {
3968     __ align(CodeEntryAlignment);
3969     StubCodeMark mark(this, "StubRoutines", name);
3970     address start = __ pc();
3971 
3972     Register buf = c_rarg0;
3973     Register state = c_rarg1;
3974     Register ofs = c_rarg2;
3975     Register limit = c_rarg3;
3976 
3977     const XMMRegister abcd = xmm0;
3978     const XMMRegister e0 = xmm1;
3979     const XMMRegister e1 = xmm2;
3980     const XMMRegister msg0 = xmm3;
3981 
3982     const XMMRegister msg1 = xmm4;
3983     const XMMRegister msg2 = xmm5;
3984     const XMMRegister msg3 = xmm6;
3985     const XMMRegister shuf_mask = xmm7;
3986 
3987     __ enter();
3988 
3989     __ subptr(rsp, 4 * wordSize);
3990 
3991     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3992       buf, state, ofs, limit, rsp, multi_block);
3993 
3994     __ addptr(rsp, 4 * wordSize);
3995 
3996     __ leave();
3997     __ ret(0);
3998     return start;
3999   }
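
       // A sketch of the multi_block contract (mirroring what
       // DigestBase.implCompressMultiBlock expects; not the exact Java source):
       // the stub consumes 64-byte blocks while ofs <= limit and returns the
       // updated offset in rax.
       /*
       ** for (; ofs <= limit; ofs += 64) {
       **   implCompress(b, ofs);   // one 64-byte SHA-1 block
       ** }
       ** return ofs;
       */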
4000 
4001   address generate_pshuffle_byte_flip_mask() {
4002     __ align(64);
4003     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4004     address start = __ pc();
4005     __ emit_data64(0x0405060700010203, relocInfo::none);
4006     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4007 
4008     if (VM_Version::supports_avx2()) {
4009       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4010       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4011       // _SHUF_00BA
4012       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4013       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4014       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4015       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4016       // _SHUF_DC00
4017       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4018       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4019       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4020       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4021     }
4022 
4023     return start;
4024   }
4025 
4026   // Mask for byte-swapping a pair of qwords in an XMM register using (v)pshufb.
4027   address generate_pshuffle_byte_flip_mask_sha512() {
4028     __ align(32);
4029     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4030     address start = __ pc();
4031     if (VM_Version::supports_avx2()) {
4032       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4033       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4034       __ emit_data64(0x1011121314151617, relocInfo::none);
4035       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4036       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4037       __ emit_data64(0x0000000000000000, relocInfo::none);
4038       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4039       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4040     }
4041 
4042     return start;
4043   }
4044 
4045   // ofs and limit are used for the multi-block byte array.
4046   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4047   address generate_sha256_implCompress(bool multi_block, const char *name) {
4048     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4049     __ align(CodeEntryAlignment);
4050     StubCodeMark mark(this, "StubRoutines", name);
4051     address start = __ pc();
4052 
4053     Register buf = c_rarg0;
4054     Register state = c_rarg1;
4055     Register ofs = c_rarg2;
4056     Register limit = c_rarg3;
4057 
4058     const XMMRegister msg = xmm0;
4059     const XMMRegister state0 = xmm1;
4060     const XMMRegister state1 = xmm2;
4061     const XMMRegister msgtmp0 = xmm3;
4062 
4063     const XMMRegister msgtmp1 = xmm4;
4064     const XMMRegister msgtmp2 = xmm5;
4065     const XMMRegister msgtmp3 = xmm6;
4066     const XMMRegister msgtmp4 = xmm7;
4067 
4068     const XMMRegister shuf_mask = xmm8;
4069 
4070     __ enter();
4071 
4072     __ subptr(rsp, 4 * wordSize);
4073 
4074     if (VM_Version::supports_sha()) {
4075       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4076         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4077     } else if (VM_Version::supports_avx2()) {
4078       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4079         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4080     }
4081     __ addptr(rsp, 4 * wordSize);
4082     __ vzeroupper();
4083     __ leave();
4084     __ ret(0);
4085     return start;
4086   }
4087 
4088   address generate_sha512_implCompress(bool multi_block, const char *name) {
4089     assert(VM_Version::supports_avx2(), "");
4090     assert(VM_Version::supports_bmi2(), "");
4091     __ align(CodeEntryAlignment);
4092     StubCodeMark mark(this, "StubRoutines", name);
4093     address start = __ pc();
4094 
4095     Register buf = c_rarg0;
4096     Register state = c_rarg1;
4097     Register ofs = c_rarg2;
4098     Register limit = c_rarg3;
4099 
4100     const XMMRegister msg = xmm0;
4101     const XMMRegister state0 = xmm1;
4102     const XMMRegister state1 = xmm2;
4103     const XMMRegister msgtmp0 = xmm3;
4104     const XMMRegister msgtmp1 = xmm4;
4105     const XMMRegister msgtmp2 = xmm5;
4106     const XMMRegister msgtmp3 = xmm6;
4107     const XMMRegister msgtmp4 = xmm7;
4108 
4109     const XMMRegister shuf_mask = xmm8;
4110 
4111     __ enter();
4112 
4113     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4114     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4115 
4116     __ vzeroupper();
4117     __ leave();
4118     __ ret(0);
4119     return start;
4120   }
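
       // Note: there is no x86 SHA-extension path for SHA-512 here; sha512_AVX2
       // relies on AVX2 plus BMI2 rotates, hence the two asserts above.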
4121 
4122   // This is a version of CTR/AES crypt which processes 6 blocks per loop iteration
4123   // to hide instruction latency.
4124   //
4125   // Arguments:
4126   //
4127   // Inputs:
4128   //   c_rarg0   - source byte array address
4129   //   c_rarg1   - destination byte array address
4130   //   c_rarg2   - K (key) in little endian int array
4131   //   c_rarg3   - counter vector byte array address
4132   //   Linux
4133   //     c_rarg4   -          input length
4134   //     c_rarg5   -          saved encryptedCounter start
4135   //     rbp + 6 * wordSize - saved used length
4136   //   Windows
4137   //     rbp + 6 * wordSize - input length
4138   //     rbp + 7 * wordSize - saved encryptedCounter start
4139   //     rbp + 8 * wordSize - saved used length
4140   //
4141   // Output:
4142   //   rax       - input length
4143   //
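       //   For reference, CTR mode reduces to the following per-block sketch (the
       //   stub additionally consumes any partially used keystream bytes saved by
       //   the previous invocation before entering the block loops):
       //
       //     keystream = AESEncrypt(key, counter);  // counter is kept big-endian, hence the shuffles
       //     out[i]    = in[i] ^ keystream;
       //     counter   = counter + 1;               // 128-bit increment
       //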
4144   address generate_counterMode_AESCrypt_Parallel() {
4145     assert(UseAES, "need AES instructions and misaligned SSE support");
4146     __ align(CodeEntryAlignment);
4147     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4148     address start = __ pc();
4149     const Register from = c_rarg0; // source array address
4150     const Register to = c_rarg1; // destination array address
4151     const Register key = c_rarg2; // key array address
4152     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4153                                       // and updated with the incremented counter in the end
4154 #ifndef _WIN64
4155     const Register len_reg = c_rarg4;
4156     const Register saved_encCounter_start = c_rarg5;
4157     const Register used_addr = r10;
4158     const Address  used_mem(rbp, 2 * wordSize);
4159     const Register used = r11;
4160 #else
4161     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4162     const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter start is on stack on Win64
4163     const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64
4164     const Register len_reg = r10; // pick the first volatile windows register
4165     const Register saved_encCounter_start = r11;
4166     const Register used_addr = r13;
4167     const Register used = r14;
4168 #endif
4169     const Register pos = rax;
4170 
4171     const int PARALLEL_FACTOR = 6;
4172     const XMMRegister xmm_counter_shuf_mask = xmm0;
4173     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4174     const XMMRegister xmm_curr_counter = xmm2;
4175 
4176     const XMMRegister xmm_key_tmp0 = xmm3;
4177     const XMMRegister xmm_key_tmp1 = xmm4;
4178 
4179     // registers holding the six results in the parallelized loop
4180     const XMMRegister xmm_result0 = xmm5;
4181     const XMMRegister xmm_result1 = xmm6;
4182     const XMMRegister xmm_result2 = xmm7;
4183     const XMMRegister xmm_result3 = xmm8;
4184     const XMMRegister xmm_result4 = xmm9;
4185     const XMMRegister xmm_result5 = xmm10;
4186 
4187     const XMMRegister xmm_from0 = xmm11;
4188     const XMMRegister xmm_from1 = xmm12;
4189     const XMMRegister xmm_from2 = xmm13;
4190     const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64
4191     const XMMRegister xmm_from4 = xmm3;  // reuse xmm3 and xmm4 (xmm_key_tmp0/1); they are no longer needed once the input text is loaded
4192     const XMMRegister xmm_from5 = xmm4;
4193 
4194     //for key_128, key_192, key_256
4195     const int rounds[3] = {10, 12, 14};
4196     Label L_exit_preLoop, L_preLoop_start;
4197     Label L_multiBlock_loopTop[3];
4198     Label L_singleBlockLoopTop[3];
4199     Label L__incCounter[3][6]; //for 6 blocks
4200     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4201     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4202     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4203 
4204     Label L_exit;
4205 
4206     __ enter(); // required for proper stackwalking of RuntimeStub frame
4207 
4208 #ifdef _WIN64
4209     // allocate spill slots for r13, r14
4210     enum {
4211         saved_r13_offset,
4212         saved_r14_offset
4213     };
4214     __ subptr(rsp, 2 * wordSize);
4215     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4216     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4217 
4218     // on win64, fill len_reg from stack position
4219     __ movl(len_reg, len_mem);
4220     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4221     __ movptr(used_addr, used_mem);
4222     __ movl(used, Address(used_addr, 0));
4223 #else
4224     __ push(len_reg); // Save
4225     __ movptr(used_addr, used_mem);
4226     __ movl(used, Address(used_addr, 0));
4227 #endif
4228 
4229     __ push(rbx); // Save RBX
4230     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4231     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4232     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4233     __ movptr(pos, 0);
4234 
4235     // Use the partially used encrypted counter from the last invocation
4236     __ BIND(L_preLoop_start);
4237     __ cmpptr(used, 16);
4238     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4239       __ cmpptr(len_reg, 0);
4240       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4241       __ movb(rbx, Address(saved_encCounter_start, used));
4242       __ xorb(rbx, Address(from, pos));
4243       __ movb(Address(to, pos), rbx);
4244       __ addptr(pos, 1);
4245       __ addptr(used, 1);
4246       __ subptr(len_reg, 1);
4247 
4248     __ jmp(L_preLoop_start);
4249 
4250     __ BIND(L_exit_preLoop);
4251     __ movl(Address(used_addr, 0), used);
4252 
4253     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4254     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4255     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4256     __ cmpl(rbx, 52);
4257     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4258     __ cmpl(rbx, 60);
4259     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4260 
4261 #define CTR_DoSix(opc, src_reg)                \
4262     __ opc(xmm_result0, src_reg);              \
4263     __ opc(xmm_result1, src_reg);              \
4264     __ opc(xmm_result2, src_reg);              \
4265     __ opc(xmm_result3, src_reg);              \
4266     __ opc(xmm_result4, src_reg);              \
4267     __ opc(xmm_result5, src_reg);
4268 
4269     // k == 0 :  generate code for key_128
4270     // k == 1 :  generate code for key_192
4271     // k == 2 :  generate code for key_256
4272     for (int k = 0; k < 3; ++k) {
4273       // multi-block loop starts here
4274       __ align(OptoLoopAlignment);
4275       __ BIND(L_multiBlock_loopTop[k]);
4276       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4277       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4278       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4279 
4280       // load, then increment counters
4281       CTR_DoSix(movdqa, xmm_curr_counter);
4282       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4283       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4284       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4285       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4286       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4287       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4288       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after incrementing, shuffle counters back for PXOR
4289       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4290 
4291       //load two ROUND_KEYs at a time
4292       for (int i = 1; i < rounds[k]; ) {
4293         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4294         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4295         CTR_DoSix(aesenc, xmm_key_tmp1);
4296         i++;
4297         if (i != rounds[k]) {
4298           CTR_DoSix(aesenc, xmm_key_tmp0);
4299         } else {
4300           CTR_DoSix(aesenclast, xmm_key_tmp0);
4301         }
4302         i++;
4303       }
4304 
4305       // get next PARALLEL_FACTOR blocks into xmm_result registers
4306       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4307       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4308       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4309       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4310       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4311       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4312 
4313       __ pxor(xmm_result0, xmm_from0);
4314       __ pxor(xmm_result1, xmm_from1);
4315       __ pxor(xmm_result2, xmm_from2);
4316       __ pxor(xmm_result3, xmm_from3);
4317       __ pxor(xmm_result4, xmm_from4);
4318       __ pxor(xmm_result5, xmm_from5);
4319 
4320       // store 6 results into the next 64 bytes of output
4321       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4322       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4323       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4324       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4325       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4326       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4327 
4328       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance position in source/destination
4329       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4330       __ jmp(L_multiBlock_loopTop[k]);
4331 
4332       // singleBlock starts here
4333       __ align(OptoLoopAlignment);
4334       __ BIND(L_singleBlockLoopTop[k]);
4335       __ cmpptr(len_reg, 0);
4336       __ jcc(Assembler::lessEqual, L_exit);
4337       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4338       __ movdqa(xmm_result0, xmm_curr_counter);
4339       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4340       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4341       __ pxor(xmm_result0, xmm_key_tmp0);
4342       for (int i = 1; i < rounds[k]; i++) {
4343         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4344         __ aesenc(xmm_result0, xmm_key_tmp0);
4345       }
4346       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4347       __ aesenclast(xmm_result0, xmm_key_tmp0);
4348       __ cmpptr(len_reg, AESBlockSize);
4349       __ jcc(Assembler::less, L_processTail_insr[k]);
4350         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4351         __ pxor(xmm_result0, xmm_from0);
4352         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4353         __ addptr(pos, AESBlockSize);
4354         __ subptr(len_reg, AESBlockSize);
4355         __ jmp(L_singleBlockLoopTop[k]);
4356       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4357         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4358         __ testptr(len_reg, 8);
4359         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4360           __ subptr(pos,8);
4361           __ pinsrq(xmm_from0, Address(from, pos), 0);
4362         __ BIND(L_processTail_4_insr[k]);
4363         __ testptr(len_reg, 4);
4364         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4365           __ subptr(pos,4);
4366           __ pslldq(xmm_from0, 4);
4367           __ pinsrd(xmm_from0, Address(from, pos), 0);
4368         __ BIND(L_processTail_2_insr[k]);
4369         __ testptr(len_reg, 2);
4370         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4371           __ subptr(pos, 2);
4372           __ pslldq(xmm_from0, 2);
4373           __ pinsrw(xmm_from0, Address(from, pos), 0);
4374         __ BIND(L_processTail_1_insr[k]);
4375         __ testptr(len_reg, 1);
4376         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4377           __ subptr(pos, 1);
4378           __ pslldq(xmm_from0, 1);
4379           __ pinsrb(xmm_from0, Address(from, pos), 0);
4380         __ BIND(L_processTail_exit_insr[k]);
4381 
4382         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4383         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4384 
4385         __ testptr(len_reg, 8);
4386         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4387           __ pextrq(Address(to, pos), xmm_result0, 0);
4388           __ psrldq(xmm_result0, 8);
4389           __ addptr(pos, 8);
4390         __ BIND(L_processTail_4_extr[k]);
4391         __ testptr(len_reg, 4);
4392         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4393           __ pextrd(Address(to, pos), xmm_result0, 0);
4394           __ psrldq(xmm_result0, 4);
4395           __ addptr(pos, 4);
4396         __ BIND(L_processTail_2_extr[k]);
4397         __ testptr(len_reg, 2);
4398         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4399           __ pextrw(Address(to, pos), xmm_result0, 0);
4400           __ psrldq(xmm_result0, 2);
4401           __ addptr(pos, 2);
4402         __ BIND(L_processTail_1_extr[k]);
4403         __ testptr(len_reg, 1);
4404         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4405           __ pextrb(Address(to, pos), xmm_result0, 0);
4406 
4407         __ BIND(L_processTail_exit_extr[k]);
4408         __ movl(Address(used_addr, 0), len_reg);
4409         __ jmp(L_exit);
4410 
4411     }
4412 
4413     __ BIND(L_exit);
4414     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4415     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4416     __ pop(rbx); // pop the saved RBX.
4417 #ifdef _WIN64
4418     __ movl(rax, len_mem);
4419     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4420     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4421     __ addptr(rsp, 2 * wordSize);
4422 #else
4423     __ pop(rax); // return 'len'
4424 #endif
4425     __ leave(); // required for proper stackwalking of RuntimeStub frame
4426     __ ret(0);
4427     return start;
4428   }
4429 
4430 void roundDec(XMMRegister xmm_reg) {
4431   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4432   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4433   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4434   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4435   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4436   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4437   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4438   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4439 }
4440 
4441 void roundDeclast(XMMRegister xmm_reg) {
4442   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4443   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4444   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4445   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4446   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4447   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4448   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4449   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4450 }
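
     // Each roundDec/roundDeclast call applies one round key, broadcast across a
     // 512-bit register by ev_load_key below, to all eight block registers: one
     // AES round over 8 x 4 = 32 independent blocks.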
4451 
4452   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4453     __ movdqu(xmmdst, Address(key, offset));
4454     if (xmm_shuf_mask != NULL) {
4455       __ pshufb(xmmdst, xmm_shuf_mask);
4456     } else {
4457       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4458     }
4459     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4461   }
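
       // e.g. ev_load_key(RK1, key, 1 * 16, mask) leaves four identical copies of
       // the 128-bit round key in the ZMM destination, so a single vaesdec at
       // AVX_512bit advances four independent blocks at once.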
4462 
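     // Vectorized CBC decryption. Unlike CBC encryption, decryption parallelizes:
     // P[i] = AESDecrypt(K, C[i]) ^ C[i-1] depends only on ciphertext, so the main
     // loop below decrypts 8 ZMM registers x 4 blocks = 512 bytes per iteration,
     // with evalignq building the one-block-shifted C[i-1] stream across registers.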
4463 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4464     assert(VM_Version::supports_vaes(), "need vector AES (VAES) instructions");
4465     __ align(CodeEntryAlignment);
4466     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4467     address start = __ pc();
4468 
4469     const Register from = c_rarg0;  // source array address
4470     const Register to = c_rarg1;  // destination array address
4471     const Register key = c_rarg2;  // key array address
4472     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
4473     // and left with the results of the last encryption block
4474 #ifndef _WIN64
4475     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4476 #else
4477     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4478     const Register len_reg = r11;      // pick the volatile windows register
4479 #endif
4480 
4481     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4482           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4483 
4484     __ enter();
4485 
4486 #ifdef _WIN64
4487     // on win64, fill len_reg from stack position
4488     __ movl(len_reg, len_mem);
4489 #else
4490     __ push(len_reg); // Save
4491 #endif
4492     __ push(rbx);
4493     __ vzeroupper();
4494 
4495     // Temporary variable declaration for swapping key bytes
4496     const XMMRegister xmm_key_shuf_mask = xmm1;
4497     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4498 
4499     // Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
4500     const Register rounds = rbx;
4501     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4502 
4503     const XMMRegister IV = xmm0;
4504     // Load IV and broadcast value to 512-bits
4505     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4506 
4507     // Temporary variables for storing round keys
4508     const XMMRegister RK0 = xmm30;
4509     const XMMRegister RK1 = xmm9;
4510     const XMMRegister RK2 = xmm18;
4511     const XMMRegister RK3 = xmm19;
4512     const XMMRegister RK4 = xmm20;
4513     const XMMRegister RK5 = xmm21;
4514     const XMMRegister RK6 = xmm22;
4515     const XMMRegister RK7 = xmm23;
4516     const XMMRegister RK8 = xmm24;
4517     const XMMRegister RK9 = xmm25;
4518     const XMMRegister RK10 = xmm26;
4519 
4520     // Load and shuffle key
4521     // the java expanded key ordering is rotated one position from what we want
4522     // so we start from 1*16 here and hit 0*16 last
4523     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4524     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4525     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4526     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4527     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4528     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4529     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4530     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4531     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4532     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4533     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4534 
4535     // Variables for storing source cipher text
4536     const XMMRegister S0 = xmm10;
4537     const XMMRegister S1 = xmm11;
4538     const XMMRegister S2 = xmm12;
4539     const XMMRegister S3 = xmm13;
4540     const XMMRegister S4 = xmm14;
4541     const XMMRegister S5 = xmm15;
4542     const XMMRegister S6 = xmm16;
4543     const XMMRegister S7 = xmm17;
4544 
4545     // Variables for storing decrypted text
4546     const XMMRegister B0 = xmm1;
4547     const XMMRegister B1 = xmm2;
4548     const XMMRegister B2 = xmm3;
4549     const XMMRegister B3 = xmm4;
4550     const XMMRegister B4 = xmm5;
4551     const XMMRegister B5 = xmm6;
4552     const XMMRegister B6 = xmm7;
4553     const XMMRegister B7 = xmm8;
4554 
4555     __ cmpl(rounds, 44);
4556     __ jcc(Assembler::greater, KEY_192);
4557     __ jmp(Loop);
4558 
4559     __ BIND(KEY_192);
4560     const XMMRegister RK11 = xmm27;
4561     const XMMRegister RK12 = xmm28;
4562     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
4563     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
4564 
4565     __ cmpl(rounds, 52);
4566     __ jcc(Assembler::greater, KEY_256);
4567     __ jmp(Loop);
4568 
4569     __ BIND(KEY_256);
4570     const XMMRegister RK13 = xmm29;
4571     const XMMRegister RK14 = xmm31;
4572     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
4573     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
4574 
4575     __ BIND(Loop);
4576     __ cmpl(len_reg, 512);
4577     __ jcc(Assembler::below, Lcbc_dec_rem);
4578     __ BIND(Loop1);
4579     __ subl(len_reg, 512);
4580     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
4581     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
4582     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
4583     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
4584     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
4585     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
4586     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
4587     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
4588     __ leaq(from, Address(from, 8 * 64));
4589 
4590     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4591     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
4592     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
4593     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
4594     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
4595     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
4596     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
4597     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
4598 
4599     __ evalignq(IV, S0, IV, 0x06);
4600     __ evalignq(S0, S1, S0, 0x06);
4601     __ evalignq(S1, S2, S1, 0x06);
4602     __ evalignq(S2, S3, S2, 0x06);
4603     __ evalignq(S3, S4, S3, 0x06);
4604     __ evalignq(S4, S5, S4, 0x06);
4605     __ evalignq(S5, S6, S5, 0x06);
4606     __ evalignq(S6, S7, S6, 0x06);
4607 
4608     roundDec(RK2);
4609     roundDec(RK3);
4610     roundDec(RK4);
4611     roundDec(RK5);
4612     roundDec(RK6);
4613     roundDec(RK7);
4614     roundDec(RK8);
4615     roundDec(RK9);
4616     roundDec(RK10);
4617 
4618     __ cmpl(rounds, 44);
4619     __ jcc(Assembler::belowEqual, L_128);
4620     roundDec(RK11);
4621     roundDec(RK12);
4622 
4623     __ cmpl(rounds, 52);
4624     __ jcc(Assembler::belowEqual, L_192);
4625     roundDec(RK13);
4626     roundDec(RK14);
4627 
4628     __ BIND(L_256);
4629     roundDeclast(RK0);
4630     __ jmp(Loop2);
4631 
4632     __ BIND(L_128);
4633     roundDeclast(RK0);
4634     __ jmp(Loop2);
4635 
4636     __ BIND(L_192);
4637     roundDeclast(RK0);
4638 
4639     __ BIND(Loop2);
4640     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4641     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
4642     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
4643     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
4644     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
4645     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
4646     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
4647     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
4648     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
4649 
4650     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
4651     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
4652     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
4653     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
4654     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
4655     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
4656     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
4657     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
4658     __ leaq(to, Address(to, 8 * 64));
4659     __ jmp(Loop);
4660 
4661     __ BIND(Lcbc_dec_rem);
4662     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
4663 
4664     __ BIND(Lcbc_dec_rem_loop);
4665     __ subl(len_reg, 16);
4666     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
4667 
4668     __ movdqu(S0, Address(from, 0));
4669     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4670     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
4671     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
4672     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
4673     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
4674     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
4675     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
4676     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
4677     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
4678     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
4679     __ cmpl(rounds, 44);
4680     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4681 
4682     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
4683     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
4684     __ cmpl(rounds, 52);
4685     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4686 
4687     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
4688     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
4689 
4690     __ BIND(Lcbc_dec_rem_last);
4691     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
4692 
4693     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4694     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
4695     __ movdqu(Address(to, 0), B0);
4696     __ leaq(from, Address(from, 16));
4697     __ leaq(to, Address(to, 16));
4698     __ jmp(Lcbc_dec_rem_loop);
4699 
4700     __ BIND(Lcbc_dec_ret);
4701     __ movdqu(Address(rvec, 0), IV);
4702 
4703     // Zero out the round keys
4704     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
4705     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
4706     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
4707     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
4708     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
4709     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
4710     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
4711     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
4712     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
4713     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
4714     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
4715     __ cmpl(rounds, 44);
4716     __ jcc(Assembler::belowEqual, Lcbc_exit);
4717     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
4718     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
4719     __ cmpl(rounds, 52);
4720     __ jcc(Assembler::belowEqual, Lcbc_exit);
4721     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
4722     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
4723 
4724     __ BIND(Lcbc_exit);
4725     __ pop(rbx);
4726 #ifdef _WIN64
4727     __ movl(rax, len_mem);
4728 #else
4729     __ pop(rax); // return length
4730 #endif
4731     __ leave(); // required for proper stackwalking of RuntimeStub frame
4732     __ ret(0);
4733     return start;
4734 }
4735 
4736 // Polynomial x^128+x^127+x^126+x^121+1
4737 address ghash_polynomial_addr() {
4738     __ align(CodeEntryAlignment);
4739     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
4740     address start = __ pc();
4741     __ emit_data64(0x0000000000000001, relocInfo::none);
4742     __ emit_data64(0xc200000000000000, relocInfo::none);
4743     return start;
4744 }
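
     // (x^128 + x^127 + x^126 + x^121 + 1 is the bit-reflected form of the GCM
     // polynomial x^128 + x^7 + x^2 + x + 1, the form consumed by the
     // carry-less-multiply based reduction.)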
4745 
4746 address ghash_shufflemask_addr() {
4747     __ align(CodeEntryAlignment);
4748     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
4749     address start = __ pc();
4750     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4751     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4752     return start;
4753 }
4754 
4755 // Ghash single and multi block operations using AVX instructions
4756 address generate_avx_ghash_processBlocks() {
4757     __ align(CodeEntryAlignment);
4758 
4759     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4760     address start = __ pc();
4761 
4762     // arguments
4763     const Register state = c_rarg0;
4764     const Register htbl = c_rarg1;
4765     const Register data = c_rarg2;
4766     const Register blocks = c_rarg3;
4767     __ enter();
4768     // The entire GHASH computation is performed inside avx_ghash; state is updated in place
4769     __ avx_ghash(state, htbl, data, blocks);
4770     __ leave(); // required for proper stackwalking of RuntimeStub frame
4771     __ ret(0);
4772     return start;
4773 }
4774 
4775   // byte swap x86 long
4776   address generate_ghash_long_swap_mask() {
4777     __ align(CodeEntryAlignment);
4778     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4779     address start = __ pc();
4780     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4781     __ emit_data64(0x0706050403020100, relocInfo::none );
4782   return start;
4783   }
4784 
4785   // byte swap x86 byte array
4786   address generate_ghash_byte_swap_mask() {
4787     __ align(CodeEntryAlignment);
4788     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4789     address start = __ pc();
4790     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4791     __ emit_data64(0x0001020304050607, relocInfo::none );
4792   return start;
4793   }
4794 
4795   /* Single and multi-block ghash operations */
4796   address generate_ghash_processBlocks() {
4797     __ align(CodeEntryAlignment);
4798     Label L_ghash_loop, L_exit;
4799     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4800     address start = __ pc();
4801 
4802     const Register state        = c_rarg0;
4803     const Register subkeyH      = c_rarg1;
4804     const Register data         = c_rarg2;
4805     const Register blocks       = c_rarg3;
4806 
4807     const XMMRegister xmm_temp0 = xmm0;
4808     const XMMRegister xmm_temp1 = xmm1;
4809     const XMMRegister xmm_temp2 = xmm2;
4810     const XMMRegister xmm_temp3 = xmm3;
4811     const XMMRegister xmm_temp4 = xmm4;
4812     const XMMRegister xmm_temp5 = xmm5;
4813     const XMMRegister xmm_temp6 = xmm6;
4814     const XMMRegister xmm_temp7 = xmm7;
4815     const XMMRegister xmm_temp8 = xmm8;
4816     const XMMRegister xmm_temp9 = xmm9;
4817     const XMMRegister xmm_temp10 = xmm10;
4818 
4819     __ enter();
4820 
4821     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4822 
4823     __ movdqu(xmm_temp0, Address(state, 0));
4824     __ pshufb(xmm_temp0, xmm_temp10);
4825 
4827     __ BIND(L_ghash_loop);
4828     __ movdqu(xmm_temp2, Address(data, 0));
4829     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4830 
4831     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4832     __ pshufb(xmm_temp1, xmm_temp10);
4833 
4834     __ pxor(xmm_temp0, xmm_temp2);
4835 
4836     //
4837     // Multiply with the hash key
4838     //
4839     __ movdqu(xmm_temp3, xmm_temp0);
4840     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
4841     __ movdqu(xmm_temp4, xmm_temp0);
4842     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
4843 
4844     __ movdqu(xmm_temp5, xmm_temp0);
4845     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
4846     __ movdqu(xmm_temp6, xmm_temp0);
4847     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
4848 
4849     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
4850 
4851     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
4852     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
4853     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
4854     __ pxor(xmm_temp3, xmm_temp5);
4855     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
4856                                         // of the carry-less multiplication of
4857                                         // xmm0 by xmm1.
4858 
4859     // We shift the result of the multiplication by one bit position
4860     // to the left to compensate for the fact that the bits are reversed.
4861     __ movdqu(xmm_temp7, xmm_temp3);
4862     __ movdqu(xmm_temp8, xmm_temp6);
4863     __ pslld(xmm_temp3, 1);
4864     __ pslld(xmm_temp6, 1);
4865     __ psrld(xmm_temp7, 31);
4866     __ psrld(xmm_temp8, 31);
4867     __ movdqu(xmm_temp9, xmm_temp7);
4868     __ pslldq(xmm_temp8, 4);
4869     __ pslldq(xmm_temp7, 4);
4870     __ psrldq(xmm_temp9, 12);
4871     __ por(xmm_temp3, xmm_temp7);
4872     __ por(xmm_temp6, xmm_temp8);
4873     __ por(xmm_temp6, xmm_temp9);
4874 
4875     //
4876     // First phase of the reduction
4877     //
4878     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4879     // independently.
4880     __ movdqu(xmm_temp7, xmm_temp3);
4881     __ movdqu(xmm_temp8, xmm_temp3);
4882     __ movdqu(xmm_temp9, xmm_temp3);
4883     __ pslld(xmm_temp7, 31);    // packed left shift by 31
4884     __ pslld(xmm_temp8, 30);    // packed left shift by 30
4885     __ pslld(xmm_temp9, 25);    // packed left shift by 25
4886     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
4887     __ pxor(xmm_temp7, xmm_temp9);
4888     __ movdqu(xmm_temp8, xmm_temp7);
4889     __ pslldq(xmm_temp7, 12);
4890     __ psrldq(xmm_temp8, 4);
4891     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
4892 
4893     //
4894     // Second phase of the reduction
4895     //
4896     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4897     // shift operations.
4898     __ movdqu(xmm_temp2, xmm_temp3);
4899     __ movdqu(xmm_temp4, xmm_temp3);
4900     __ movdqu(xmm_temp5, xmm_temp3);
4901     __ psrld(xmm_temp2, 1);     // packed right shift by 1
4902     __ psrld(xmm_temp4, 2);     // packed right shift by 2
4903     __ psrld(xmm_temp5, 7);     // packed right shift by 7
4904     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4905     __ pxor(xmm_temp2, xmm_temp5);
4906     __ pxor(xmm_temp2, xmm_temp8);
4907     __ pxor(xmm_temp3, xmm_temp2);
4908     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4909 
4910     __ decrement(blocks);
4911     __ jcc(Assembler::zero, L_exit);
4912     __ movdqu(xmm_temp0, xmm_temp6);
4913     __ addptr(data, 16);
4914     __ jmp(L_ghash_loop);
4915 
4916     __ BIND(L_exit);
4917     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4918     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4919     __ leave();
4920     __ ret(0);
4921     return start;
4922   }
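
       // For reference, the recurrence computed by the loop above (a sketch;
       // gf128_mul is a stand-in for the carry-less multiply plus the two-phase
       // reduction implemented inline):
       /*
       ** X = state;
       ** for (int i = 0; i < blocks; i++) {
       **   X = gf128_mul(X ^ data[i], subkeyH);
       ** }
       ** state = X;
       */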
4923 
4924   // base64 character set
4925   address base64_charset_addr() {
4926     __ align(CodeEntryAlignment);
4927     StubCodeMark mark(this, "StubRoutines", "base64_charset");
4928     address start = __ pc();
4929     __ emit_data64(0x0000004200000041, relocInfo::none);
4930     __ emit_data64(0x0000004400000043, relocInfo::none);
4931     __ emit_data64(0x0000004600000045, relocInfo::none);
4932     __ emit_data64(0x0000004800000047, relocInfo::none);
4933     __ emit_data64(0x0000004a00000049, relocInfo::none);
4934     __ emit_data64(0x0000004c0000004b, relocInfo::none);
4935     __ emit_data64(0x0000004e0000004d, relocInfo::none);
4936     __ emit_data64(0x000000500000004f, relocInfo::none);
4937     __ emit_data64(0x0000005200000051, relocInfo::none);
4938     __ emit_data64(0x0000005400000053, relocInfo::none);
4939     __ emit_data64(0x0000005600000055, relocInfo::none);
4940     __ emit_data64(0x0000005800000057, relocInfo::none);
4941     __ emit_data64(0x0000005a00000059, relocInfo::none);
4942     __ emit_data64(0x0000006200000061, relocInfo::none);
4943     __ emit_data64(0x0000006400000063, relocInfo::none);
4944     __ emit_data64(0x0000006600000065, relocInfo::none);
4945     __ emit_data64(0x0000006800000067, relocInfo::none);
4946     __ emit_data64(0x0000006a00000069, relocInfo::none);
4947     __ emit_data64(0x0000006c0000006b, relocInfo::none);
4948     __ emit_data64(0x0000006e0000006d, relocInfo::none);
4949     __ emit_data64(0x000000700000006f, relocInfo::none);
4950     __ emit_data64(0x0000007200000071, relocInfo::none);
4951     __ emit_data64(0x0000007400000073, relocInfo::none);
4952     __ emit_data64(0x0000007600000075, relocInfo::none);
4953     __ emit_data64(0x0000007800000077, relocInfo::none);
4954     __ emit_data64(0x0000007a00000079, relocInfo::none);
4955     __ emit_data64(0x0000003100000030, relocInfo::none);
4956     __ emit_data64(0x0000003300000032, relocInfo::none);
4957     __ emit_data64(0x0000003500000034, relocInfo::none);
4958     __ emit_data64(0x0000003700000036, relocInfo::none);
4959     __ emit_data64(0x0000003900000038, relocInfo::none);
4960     __ emit_data64(0x0000002f0000002b, relocInfo::none);
4961     return start;
4962   }
4963 
4964   // base64 url character set
4965   address base64url_charset_addr() {
4966     __ align(CodeEntryAlignment);
4967     StubCodeMark mark(this, "StubRoutines", "base64url_charset");
4968     address start = __ pc();
4969     __ emit_data64(0x0000004200000041, relocInfo::none);
4970     __ emit_data64(0x0000004400000043, relocInfo::none);
4971     __ emit_data64(0x0000004600000045, relocInfo::none);
4972     __ emit_data64(0x0000004800000047, relocInfo::none);
4973     __ emit_data64(0x0000004a00000049, relocInfo::none);
4974     __ emit_data64(0x0000004c0000004b, relocInfo::none);
4975     __ emit_data64(0x0000004e0000004d, relocInfo::none);
4976     __ emit_data64(0x000000500000004f, relocInfo::none);
4977     __ emit_data64(0x0000005200000051, relocInfo::none);
4978     __ emit_data64(0x0000005400000053, relocInfo::none);
4979     __ emit_data64(0x0000005600000055, relocInfo::none);
4980     __ emit_data64(0x0000005800000057, relocInfo::none);
4981     __ emit_data64(0x0000005a00000059, relocInfo::none);
4982     __ emit_data64(0x0000006200000061, relocInfo::none);
4983     __ emit_data64(0x0000006400000063, relocInfo::none);
4984     __ emit_data64(0x0000006600000065, relocInfo::none);
4985     __ emit_data64(0x0000006800000067, relocInfo::none);
4986     __ emit_data64(0x0000006a00000069, relocInfo::none);
4987     __ emit_data64(0x0000006c0000006b, relocInfo::none);
4988     __ emit_data64(0x0000006e0000006d, relocInfo::none);
4989     __ emit_data64(0x000000700000006f, relocInfo::none);
4990     __ emit_data64(0x0000007200000071, relocInfo::none);
4991     __ emit_data64(0x0000007400000073, relocInfo::none);
4992     __ emit_data64(0x0000007600000075, relocInfo::none);
4993     __ emit_data64(0x0000007800000077, relocInfo::none);
4994     __ emit_data64(0x0000007a00000079, relocInfo::none);
4995     __ emit_data64(0x0000003100000030, relocInfo::none);
4996     __ emit_data64(0x0000003300000032, relocInfo::none);
4997     __ emit_data64(0x0000003500000034, relocInfo::none);
4998     __ emit_data64(0x0000003700000036, relocInfo::none);
4999     __ emit_data64(0x0000003900000038, relocInfo::none);
5000     __ emit_data64(0x0000005f0000002d, relocInfo::none);
5001 
5002     return start;
5003   }
5004 
5005   address base64_bswap_mask_addr() {
5006     __ align(CodeEntryAlignment);
5007     StubCodeMark mark(this, "StubRoutines", "bswap_mask_base64");
5008     address start = __ pc();
5009     __ emit_data64(0x0504038002010080, relocInfo::none);
5010     __ emit_data64(0x0b0a098008070680, relocInfo::none);
5011     __ emit_data64(0x0908078006050480, relocInfo::none);
5012     __ emit_data64(0x0f0e0d800c0b0a80, relocInfo::none);
5013     __ emit_data64(0x0605048003020180, relocInfo::none);
5014     __ emit_data64(0x0c0b0a8009080780, relocInfo::none);
5015     __ emit_data64(0x0504038002010080, relocInfo::none);
5016     __ emit_data64(0x0b0a098008070680, relocInfo::none);
5017 
5018     return start;
5019   }
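
       // In the mask above, 0x80 entries zero their destination byte (pshufb
       // semantics), so each dword ends up holding one 3-byte input group plus a
       // zero byte, ready for the 6-bit field extraction that follows.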
5020 
5021   address base64_right_shift_mask_addr() {
5022     __ align(CodeEntryAlignment);
5023     StubCodeMark mark(this, "StubRoutines", "right_shift_mask");
5024     address start = __ pc();
5025     __ emit_data64(0x0006000400020000, relocInfo::none);
5026     __ emit_data64(0x0006000400020000, relocInfo::none);
5027     __ emit_data64(0x0006000400020000, relocInfo::none);
5028     __ emit_data64(0x0006000400020000, relocInfo::none);
5029     __ emit_data64(0x0006000400020000, relocInfo::none);
5030     __ emit_data64(0x0006000400020000, relocInfo::none);
5031     __ emit_data64(0x0006000400020000, relocInfo::none);
5032     __ emit_data64(0x0006000400020000, relocInfo::none);
5033 
5034     return start;
5035   }
5036 
5037   address base64_left_shift_mask_addr() {
5038     __ align(CodeEntryAlignment);
5039     StubCodeMark mark(this, "StubRoutines", "left_shift_mask");
5040     address start = __ pc();
5041     __ emit_data64(0x0000000200040000, relocInfo::none);
5042     __ emit_data64(0x0000000200040000, relocInfo::none);
5043     __ emit_data64(0x0000000200040000, relocInfo::none);
5044     __ emit_data64(0x0000000200040000, relocInfo::none);
5045     __ emit_data64(0x0000000200040000, relocInfo::none);
5046     __ emit_data64(0x0000000200040000, relocInfo::none);
5047     __ emit_data64(0x0000000200040000, relocInfo::none);
5048     __ emit_data64(0x0000000200040000, relocInfo::none);
5049 
5050     return start;
5051   }
5052 
5053   address base64_and_mask_addr() {
5054     __ align(CodeEntryAlignment);
5055     StubCodeMark mark(this, "StubRoutines", "and_mask");
5056     address start = __ pc();
5057     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5058     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5059     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5060     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5061     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5062     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5063     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5064     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5065     return start;
5066   }
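
       // Together, the right-shift, left-shift and and-masks above drive the
       // encoder's evpsrlvw/evpsllvw/vpandq sequence: per-word variable shifts
       // that split each 3-byte group into four 6-bit base64 indices.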
5067 
5068   address base64_gather_mask_addr() {
5069     __ align(CodeEntryAlignment);
5070     StubCodeMark mark(this, "StubRoutines", "gather_mask");
5071     address start = __ pc();
5072     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5073     return start;
5074   }
5075 
5076 // Code for generating Base64 encoding.
5077 // Intrinsic function prototype in Base64.java:
5078 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
5079   address generate_base64_encodeBlock() {
5080     __ align(CodeEntryAlignment);
5081     StubCodeMark mark(this, "StubRoutines", "implEncode");
5082     address start = __ pc();
5083     __ enter();
5084 
5085     // Save callee-saved registers before using them
5086     __ push(r12);
5087     __ push(r13);
5088     __ push(r14);
5089     __ push(r15);
5090 
5091     // arguments
5092     const Register source = c_rarg0; // Source Array
5093     const Register start_offset = c_rarg1; // start offset
5094     const Register end_offset = c_rarg2; // end offset
5095     const Register dest = c_rarg3; // destination array
5096 
5097 #ifndef _WIN64
5098     const Register dp = c_rarg4;  // Position for writing to dest array
5099     const Register isURL = c_rarg5;// Base64 or URL character set
5100 #else
5101     const Address  dp_mem(rbp, 6 * wordSize);  // dp is on stack on Win64
5102     const Address isURL_mem(rbp, 7 * wordSize);
5103     const Register isURL = r10;      // pick the volatile windows register
5104     const Register dp = r12;
5105     __ movl(dp, dp_mem);
5106     __ movl(isURL, isURL_mem);
5107 #endif
5108 
5109     const Register length = r14;
5110     Label L_process80, L_process32, L_process3, L_exit, L_processdata;
5111 
5112     // calculate length from offsets
5113     __ movl(length, end_offset);
5114     __ subl(length, start_offset);
5115     __ cmpl(length, 0);
5116     __ jcc(Assembler::lessEqual, L_exit);
5117 
5118     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr()));
5119     // check if the base64 charset (isURL = 0) or the base64 url charset (isURL = 1) needs to be loaded
5120     __ cmpl(isURL, 0);
5121     __ jcc(Assembler::equal, L_processdata);
5122     __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr()));
5123 
5124     // load masks required for encoding data
5125     __ BIND(L_processdata);
5126     __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr()));
5127     // Set 64 bits of K register.
5128     __ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit);
5129     __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13);
5130     __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13);
5131     __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13);
5132     __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13);
5133 
5134     // Vector Base64 implementation, producing 96 bytes of encoded data
5135     __ BIND(L_process80);
5136     __ cmpl(length, 80);
5137     __ jcc(Assembler::below, L_process32);
5138     __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit);
5139     __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit);
5140     __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit);
5141 
5142     // permute the input data so the source bytes are contiguous across lanes
5143     __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit);
5144     __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit);
5145     __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit);
5146 
5147     // shuffle the input to group 3 bytes of data and add 0 as the 4th byte;
5148     // we can deal with 12 bytes at a time in a 128-bit register
5149     __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit);
5150     __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit);
5151     __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit);
5152 
5153     // convert bytes to words; each 128-bit register holds 6 bytes for processing
5154     __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit);
5155     __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit);
5156     __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit);
5157 
5158     // Extract bits in the pattern 6, 4+2, 2+4, 6 to convert three 8-bit bytes into four 6-bit values
5159     __ evpsrlvw(xmm0, xmm3, xmm13,  Assembler::AVX_512bit);
5160     __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit);
5161     __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit);
5162 
5163     __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit);
5164     __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit);
5165     __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit);
5166 
5167     __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit);
5168     __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
5169     __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
5170 
5171     __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5172     __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit);
5173     __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit);
5174 
5175     __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
5176     __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit);
5177     __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit);
5178 
5179     // Get the final 4*6 bits base64 encoding
5180     __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit);
5181     __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit);
5182     __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit);
5183 
5184     // Shift
5185     __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5186     __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit);
5187     __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit);
5188 
5189     // look up 6 bits in the base64 character set to fetch the encoding
5190     // we are converting word to dword as gather instructions need dword indices for looking up encoding
5191     __ vextracti64x4(xmm6, xmm3, 0);
5192     __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit);
5193     __ vextracti64x4(xmm6, xmm3, 1);
5194     __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit);
5195 
5196     __ vextracti64x4(xmm6, xmm4, 0);
5197     __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit);
5198     __ vextracti64x4(xmm6, xmm4, 1);
5199     __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit);
5200 
5201     __ vextracti64x4(xmm4, xmm5, 0);
5202     __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit);
5203 
5204     __ vextracti64x4(xmm4, xmm5, 1);
5205     __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit);
5206 
5207     __ kmovql(k2, k3);
5208     __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit);
5209     __ kmovql(k2, k3);
5210     __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit);
5211     __ kmovql(k2, k3);
5212     __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit);
5213     __ kmovql(k2, k3);
5214     __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit);
5215     __ kmovql(k2, k3);
5216     __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
5217     __ kmovql(k2, k3);
5218     __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit);
5219 
5220     // Down-convert dword to byte; the final output is 16*6 = 96 bytes long
5221     __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit);
5222     __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit);
5223     __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit);
5224     __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit);
5225     __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit);
5226     __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit);
5227 
5228     __ addq(dest, 96);
5229     __ addq(source, 72);
5230     __ subq(length, 72);
5231     __ jmp(L_process80);
5232 
5233     // Vector Base64 implementation generating 32 bytes of encoded data
5234     __ BIND(L_process32);
5235     __ cmpl(length, 32);
5236     __ jcc(Assembler::below, L_process3);
5237     __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit);
5238     __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit);
5239     __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit);
5240     __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit);
5241     __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit);
5242     __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit);
5243 
5244     __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
5245     __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5246     __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
5247     __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit);
5248     __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
5249     __ vextracti64x4(xmm9, xmm1, 0);
5250     __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit);
5251     __ vextracti64x4(xmm9, xmm1, 1);
5252     __ vpmovzxwd(xmm5, xmm9,  Assembler::AVX_512bit);
5253     __ kmovql(k2, k3);
5254     __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
5255     __ kmovql(k2, k3);
5256     __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit);
5257     __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit);
5258     __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit);
5259     __ subq(length, 24);
5260     __ addq(dest, 32);
5261     __ addq(source, 24);
5262     __ jmp(L_process32);
5263 
5264     // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data
5265     /* This code corresponds to the scalar version of the following snippet in Base64.java
5266     ** int bits = (src[sp0++] & 0xff) << 16 | (src[sp0++] & 0xff) << 8 | (src[sp0++] & 0xff);
5267     ** dst[dp0++] = (byte)base64[(bits >>> 18) & 0x3f];
5268     ** dst[dp0++] = (byte)base64[(bits >>> 12) & 0x3f];
5269     ** dst[dp0++] = (byte)base64[(bits >>> 6) & 0x3f];
5270     ** dst[dp0++] = (byte)base64[bits & 0x3f];*/
5271     __ BIND(L_process3);
5272     __ cmpl(length, 3);
5273     __ jcc(Assembler::below, L_exit);
5274     // Read 1 byte at a time
5275     __ movzbl(rax, Address(source, start_offset));
5276     __ shll(rax, 0x10);
5277     __ movl(r15, rax);
5278     __ movzbl(rax, Address(source, start_offset, Address::times_1, 1));
5279     __ shll(rax, 0x8);
5280     __ movzwl(rax, rax);
5281     __ orl(r15, rax);
5282     __ movzbl(rax, Address(source, start_offset, Address::times_1, 2));
5283     __ orl(rax, r15);
5284     // Save 3 bytes read in r15
5285     __ movl(r15, rax);
5286     __ shrl(rax, 0x12);
5287     __ andl(rax, 0x3f);
5288     // rax contains the index, r11 contains the base64 lookup table
5289     __ movb(rax, Address(r11, rax, Address::times_4));
5290     // Write the encoded byte to destination
5291     __ movb(Address(dest, dp, Address::times_1, 0), rax);
5292     __ movl(rax, r15);
5293     __ shrl(rax, 0xc);
5294     __ andl(rax, 0x3f);
5295     __ movb(rax, Address(r11, rax, Address::times_4));
5296     __ movb(Address(dest, dp, Address::times_1, 1), rax);
5297     __ movl(rax, r15);
5298     __ shrl(rax, 0x6);
5299     __ andl(rax, 0x3f);
5300     __ movb(rax, Address(r11, rax, Address::times_4));
5301     __ movb(Address(dest, dp, Address::times_1, 2), rax);
5302     __ movl(rax, r15);
5303     __ andl(rax, 0x3f);
5304     __ movb(rax, Address(r11, rax, Address::times_4));
5305     __ movb(Address(dest, dp, Address::times_1, 3), rax);
5306     __ subl(length, 3);
5307     __ addq(dest, 4);
5308     __ addq(source, 3);
5309     __ jmp(L_process3);
5310     __ BIND(L_exit);
5311     __ pop(r15);
5312     __ pop(r14);
5313     __ pop(r13);
5314     __ pop(r12);
5315     __ leave();
5316     __ ret(0);
5317     return start;
5318   }
5319 
5320   /**
5321    *  Arguments:
5322    *
5323    * Inputs:
5324    *   c_rarg0   - int crc
5325    *   c_rarg1   - byte* buf
5326    *   c_rarg2   - int length
5327    *
5328    * Output:
5329    *       rax   - int crc result
5330    */
5331   address generate_updateBytesCRC32() {
5332     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
5333 
5334     __ align(CodeEntryAlignment);
5335     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
5336 
5337     address start = __ pc();
5338     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5339     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5340     // rscratch1: r10
5341     const Register crc   = c_rarg0;  // crc
5342     const Register buf   = c_rarg1;  // source java byte array address
5343     const Register len   = c_rarg2;  // length
5344     const Register table = c_rarg3;  // crc_table address (reuse register)
5345     const Register tmp   = r11;
5346     assert_different_registers(crc, buf, len, table, tmp, rax);
5347 
5348     BLOCK_COMMENT("Entry:");
5349     __ enter(); // required for proper stackwalking of RuntimeStub frame
5350 
5351     __ kernel_crc32(crc, buf, len, table, tmp);
5352 
5353     __ movl(rax, crc);
5354     __ vzeroupper();
5355     __ leave(); // required for proper stackwalking of RuntimeStub frame
5356     __ ret(0);
5357 
5358     return start;
5359   }
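
       // For reference, the stub above computes the standard reflected CRC-32
       // (polynomial 0xEDB88320) used by java.util.zip.CRC32. A minimal
       // bitwise sketch that is result-equivalent (though far slower) to the
       // table/CLMUL-based kernel_crc32, assuming the usual zlib-style
       // pre/post complement inside the kernel:
       //
       //   uint32_t crc32_ref(uint32_t crc, const uint8_t* buf, size_t len) {
       //     crc = ~crc;
       //     for (size_t i = 0; i < len; i++) {
       //       crc ^= buf[i];
       //       for (int b = 0; b < 8; b++)
       //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
       //     }
       //     return ~crc;
       //   }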
5360 
5361   /**
5362   *  Arguments:
5363   *
5364   * Inputs:
5365   *   c_rarg0   - int crc
5366   *   c_rarg1   - byte* buf
5367   *   c_rarg2   - long length
5368   *   c_rarg3   - table_start - optional (present only when doing a library_call,
5369   *              not used by x86 algorithm)
5370   *
5371   * Output:
5372   *       rax   - int crc result
5373   */
5374   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
5375       assert(UseCRC32CIntrinsics, "need SSE4_2");
5376       __ align(CodeEntryAlignment);
5377       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
5378       address start = __ pc();
5379       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
5380       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
5381       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
5382       const Register crc = c_rarg0;  // crc
5383       const Register buf = c_rarg1;  // source java byte array address
5384       const Register len = c_rarg2;  // length
5385       const Register a = rax;
5386       const Register j = r9;
5387       const Register k = r10;
5388       const Register l = r11;
5389 #ifdef _WIN64
5390       const Register y = rdi;
5391       const Register z = rsi;
5392 #else
5393       const Register y = rcx;
5394       const Register z = r8;
5395 #endif
5396       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
5397 
5398       BLOCK_COMMENT("Entry:");
5399       __ enter(); // required for proper stackwalking of RuntimeStub frame
5400 #ifdef _WIN64
5401       __ push(y);
5402       __ push(z);
5403 #endif
5404       __ crc32c_ipl_alg2_alt2(crc, buf, len,
5405                               a, j, k,
5406                               l, y, z,
5407                               c_farg0, c_farg1, c_farg2,
5408                               is_pclmulqdq_supported);
5409       __ movl(rax, crc);
5410 #ifdef _WIN64
5411       __ pop(z);
5412       __ pop(y);
5413 #endif
5414       __ vzeroupper();
5415       __ leave(); // required for proper stackwalking of RuntimeStub frame
5416       __ ret(0);
5417 
5418       return start;
5419   }
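
       // For reference: CRC-32C uses the Castagnoli polynomial 0x1EDC6F41
       // (0x82F63B78 bit-reflected). On SSE4.2 hardware a scalar sketch of
       // the same computation is just the dedicated crc32 instruction applied
       // byte-wise (any initial/final complement is handled on the Java side):
       //
       //   uint32_t crc32c_ref(uint32_t crc, const uint8_t* buf, size_t len) {
       //     for (size_t i = 0; i < len; i++)
       //       crc = _mm_crc32_u8(crc, buf[i]);  // from <nmmintrin.h>
       //     return crc;
       //   }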
5420 
5421   /**
5422    *  Arguments:
5423    *
5424    *  Input:
5425    *    c_rarg0   - x address
5426    *    c_rarg1   - x length
5427    *    c_rarg2   - y address
5428    *    c_rarg3   - y length
5429    * not Win64
5430    *    c_rarg4   - z address
5431    *    c_rarg5   - z length
5432    * Win64
5433    *    rsp+40    - z address
5434    *    rsp+48    - z length
5435    */
5436   address generate_multiplyToLen() {
5437     __ align(CodeEntryAlignment);
5438     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
5439 
5440     address start = __ pc();
5441     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5442     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5443     const Register x     = rdi;
5444     const Register xlen  = rax;
5445     const Register y     = rsi;
5446     const Register ylen  = rcx;
5447     const Register z     = r8;
5448     const Register zlen  = r11;
5449 
5450     // The following registers will be saved on the stack in multiply_to_len().
5451     const Register tmp1  = r12;
5452     const Register tmp2  = r13;
5453     const Register tmp3  = r14;
5454     const Register tmp4  = r15;
5455     const Register tmp5  = rbx;
5456 
5457     BLOCK_COMMENT("Entry:");
5458     __ enter(); // required for proper stackwalking of RuntimeStub frame
5459 
5460 #ifndef _WIN64
5461     __ movptr(zlen, r9); // Save r9 in r11 - zlen
5462 #endif
5463     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
5464                        // ylen => rcx, z => r8, zlen => r11
5465                        // r9 and r10 may be used to save non-volatile registers
5466 #ifdef _WIN64
5467     // last 2 arguments (#4, #5) are on stack on Win64
5468     __ movptr(z, Address(rsp, 6 * wordSize));
5469     __ movptr(zlen, Address(rsp, 7 * wordSize));
5470 #endif
5471 
5472     __ movptr(xlen, rsi);
5473     __ movptr(y,    rdx);
5474     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
5475 
5476     restore_arg_regs();
5477 
5478     __ leave(); // required for proper stackwalking of RuntimeStub frame
5479     __ ret(0);
5480 
5481     return start;
5482   }
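
       // For reference, the intrinsic above implements BigInteger::multiplyToLen.
       // A schoolbook sketch with 32-bit limbs (ignoring BigInteger's reversed,
       // big-endian limb order; z must be zeroed first):
       //
       //   for (int i = 0; i < xlen; i++) {
       //     uint64_t carry = 0;
       //     for (int j = 0; j < ylen; j++) {
       //       uint64_t p = (uint64_t)x[i] * y[j] + z[i + j] + carry;
       //       z[i + j] = (uint32_t)p;
       //       carry = p >> 32;
       //     }
       //     z[i + ylen] = (uint32_t)carry;
       //   }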
5483 
5484   /**
5485   *  Arguments:
5486   *
5487   *  Input:
5488   *    c_rarg0   - obja     address
5489   *    c_rarg1   - objb     address
5490   *    c_rarg2   - length   length
5491   *    c_rarg3   - scale    log2_array_indxscale
5492   *
5493   *  Output:
5494   *        rax   - int >= 0: index of the first mismatch, < 0: bitwise complement of tail
5495   */
5496   address generate_vectorizedMismatch() {
5497     __ align(CodeEntryAlignment);
5498     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
5499     address start = __ pc();
5500 
5501     BLOCK_COMMENT("Entry:");
5502     __ enter();
5503 
5504 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5505     const Register scale = c_rarg0;  //rcx, will exchange with r9
5506     const Register objb = c_rarg1;   //rdx
5507     const Register length = c_rarg2; //r8
5508     const Register obja = c_rarg3;   //r9
5509     __ xchgq(obja, scale);  // now obja and scale contain the correct contents
5510 
5511     const Register tmp1 = r10;
5512     const Register tmp2 = r11;
5513 #endif
5514 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5515     const Register obja = c_rarg0;   //U:rdi
5516     const Register objb = c_rarg1;   //U:rsi
5517     const Register length = c_rarg2; //U:rdx
5518     const Register scale = c_rarg3;  //U:rcx
5519     const Register tmp1 = r8;
5520     const Register tmp2 = r9;
5521 #endif
5522     const Register result = rax; //return value
5523     const XMMRegister vec0 = xmm0;
5524     const XMMRegister vec1 = xmm1;
5525     const XMMRegister vec2 = xmm2;
5526 
5527     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
5528 
5529     __ vzeroupper();
5530     __ leave();
5531     __ ret(0);
5532 
5533     return start;
5534   }
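
       // Scalar semantics of the stub above (a sketch), comparing 'length'
       // elements of (1 << scale) bytes each:
       //
       //   for (int i = 0; i < length; i++)
       //     if (memcmp(obja + (i << scale), objb + (i << scale), 1 << scale) != 0)
       //       return i;  // index of the first mismatching element
       //
       // When the vector loop finds no mismatch, the (negative) result is the
       // bitwise complement of the number of tail elements left for the caller
       // to check.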
5535 
5536   /**
5537    *  Arguments:
5538    *
5539    *  Input:
5540    *    c_rarg0   - x address
5541    *    c_rarg1   - x length
5542    *    c_rarg2   - z address
5543    *    c_rarg3   - z length
5544    *
5545    */
5546   address generate_squareToLen() {
5547 
5548     __ align(CodeEntryAlignment);
5549     StubCodeMark mark(this, "StubRoutines", "squareToLen");
5550 
5551     address start = __ pc();
5552     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5553     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
5554     const Register x      = rdi;
5555     const Register len    = rsi;
5556     const Register z      = r8;
5557     const Register zlen   = rcx;
5558 
5559     const Register tmp1      = r12;
5560     const Register tmp2      = r13;
5561     const Register tmp3      = r14;
5562     const Register tmp4      = r15;
5563     const Register tmp5      = rbx;
5564 
5565     BLOCK_COMMENT("Entry:");
5566     __ enter(); // required for proper stackwalking of RuntimeStub frame
5567 
5568     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
5569                        // zlen => rcx
5570                        // r9 and r10 may be used to save non-volatile registers
5571     __ movptr(r8, rdx);
5572     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5573 
5574     restore_arg_regs();
5575 
5576     __ leave(); // required for proper stackwalking of RuntimeStub frame
5577     __ ret(0);
5578 
5579     return start;
5580   }
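
       // Note: squaring is the x == y special case of multiplyToLen;
       // square_to_len exploits the symmetry of the partial products
       // (x[i] * x[j] == x[j] * x[i]), so it needs only about half the
       // multiplies of the general case.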
5581 
5582   address generate_method_entry_barrier() {
5583     __ align(CodeEntryAlignment);
5584     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5585 
5586     Label deoptimize_label;
5587 
5588     address start = __ pc();
5589 
5590     __ push(-1); // cookie; this is used for writing the new rsp when deoptimizing
5591 
5592     BLOCK_COMMENT("Entry:");
5593     __ enter(); // save rbp
5594 
5595     // save c_rarg0, because we want to use that value.
5596     // We could do without it, but then we would depend on the number of slots used by pusha
5597     __ push(c_rarg0);
5598 
5599     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
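         // Stack layout at this point (one word per slot):
         //   [rsp + 3*wordSize]  return address into the nmethod prologue
         //   [rsp + 2*wordSize]  cookie (-1); the runtime writes the new rsp
         //                       here when deoptimizing
         //   [rsp + 1*wordSize]  saved rbp (pushed by enter())
         //   [rsp + 0*wordSize]  saved c_rarg0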
5600 
5601     __ pusha();
5602 
5603     // The method may have floats as arguments, and we must spill them before calling
5604     // the VM runtime.
5605     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
5606     const int xmm_size = wordSize * 2;
5607     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
5608     __ subptr(rsp, xmm_spill_size);
5609     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
5610     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
5611     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
5612     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
5613     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
5614     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
5615     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
5616     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
5617 
5618     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
5619 
5620     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
5621     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
5622     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
5623     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
5624     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
5625     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
5626     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
5627     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
5628     __ addptr(rsp, xmm_spill_size);
5629 
5630     __ cmpl(rax, 1); // 1 means deoptimize
5631     __ jcc(Assembler::equal, deoptimize_label);
5632 
5633     __ popa();
5634     __ pop(c_rarg0);
5635 
5636     __ leave();
5637 
5638     __ addptr(rsp, 1 * wordSize); // cookie
5639     __ ret(0);
5640 
5641 
5642     __ BIND(deoptimize_label);
5643 
5644     __ popa();
5645     __ pop(c_rarg0);
5646 
5647     __ leave();
5648 
5649     // This can be taken out, but is good for verification purposes: getting a SIGSEGV
5650     // here while still having a correct stack is valuable
5651     __ testptr(rsp, Address(rsp, 0));
5652 
5653     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
5654     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be the caller's verified_entry_point
5655 
5656     return start;
5657   }
5658 
5659    /**
5660    *  Arguments:
5661    *
5662    *  Input:
5663    *    c_rarg0   - out address
5664    *    c_rarg1   - in address
5665    *    c_rarg2   - offset
5666    *    c_rarg3   - len
5667    * not Win64
5668    *    c_rarg4   - k
5669    * Win64
5670    *    rsp+40    - k
5671    */
5672   address generate_mulAdd() {
5673     __ align(CodeEntryAlignment);
5674     StubCodeMark mark(this, "StubRoutines", "mulAdd");
5675 
5676     address start = __ pc();
5677     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5678     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5679     const Register out     = rdi;
5680     const Register in      = rsi;
5681     const Register offset  = r11;
5682     const Register len     = rcx;
5683     const Register k       = r8;
5684 
5685     // The following registers will be saved on the stack in mul_add().
5686     const Register tmp1  = r12;
5687     const Register tmp2  = r13;
5688     const Register tmp3  = r14;
5689     const Register tmp4  = r15;
5690     const Register tmp5  = rbx;
5691 
5692     BLOCK_COMMENT("Entry:");
5693     __ enter(); // required for proper stackwalking of RuntimeStub frame
5694 
5695     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
5696                        // len => rcx, k => r8
5697                        // r9 and r10 may be used to save non-volatile registers
5698 #ifdef _WIN64
5699     // last argument is on stack on Win64
5700     __ movl(k, Address(rsp, 6 * wordSize));
5701 #endif
5702     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
5703     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5704 
5705     restore_arg_regs();
5706 
5707     __ leave(); // required for proper stackwalking of RuntimeStub frame
5708     __ ret(0);
5709 
5710     return start;
5711   }
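
       // Scalar semantics of the stub above (a sketch, again ignoring
       // BigInteger's reversed limb indexing): accumulate in[0..len) * k into
       // out starting at 'offset' and return the final carry:
       //
       //   uint64_t carry = 0;
       //   for (int j = 0; j < len; j++) {
       //     uint64_t p = (uint64_t)in[j] * k + out[offset + j] + carry;
       //     out[offset + j] = (uint32_t)p;
       //     carry = p >> 32;
       //   }
       //   return (uint32_t)carry;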
5712 
5713   address generate_libmExp() {
5714     StubCodeMark mark(this, "StubRoutines", "libmExp");
5715 
5716     address start = __ pc();
5717 
5718     const XMMRegister x0  = xmm0;
5719     const XMMRegister x1  = xmm1;
5720     const XMMRegister x2  = xmm2;
5721     const XMMRegister x3  = xmm3;
5722 
5723     const XMMRegister x4  = xmm4;
5724     const XMMRegister x5  = xmm5;
5725     const XMMRegister x6  = xmm6;
5726     const XMMRegister x7  = xmm7;
5727 
5728     const Register tmp   = r11;
5729 
5730     BLOCK_COMMENT("Entry:");
5731     __ enter(); // required for proper stackwalking of RuntimeStub frame
5732 
5733     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5734 
5735     __ leave(); // required for proper stackwalking of RuntimeStub frame
5736     __ ret(0);
5737 
5738     return start;
5739 
5740   }
5741 
5742   address generate_libmLog() {
5743     StubCodeMark mark(this, "StubRoutines", "libmLog");
5744 
5745     address start = __ pc();
5746 
5747     const XMMRegister x0 = xmm0;
5748     const XMMRegister x1 = xmm1;
5749     const XMMRegister x2 = xmm2;
5750     const XMMRegister x3 = xmm3;
5751 
5752     const XMMRegister x4 = xmm4;
5753     const XMMRegister x5 = xmm5;
5754     const XMMRegister x6 = xmm6;
5755     const XMMRegister x7 = xmm7;
5756 
5757     const Register tmp1 = r11;
5758     const Register tmp2 = r8;
5759 
5760     BLOCK_COMMENT("Entry:");
5761     __ enter(); // required for proper stackwalking of RuntimeStub frame
5762 
5763     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
5764 
5765     __ leave(); // required for proper stackwalking of RuntimeStub frame
5766     __ ret(0);
5767 
5768     return start;
5769 
5770   }
5771 
5772   address generate_libmLog10() {
5773     StubCodeMark mark(this, "StubRoutines", "libmLog10");
5774 
5775     address start = __ pc();
5776 
5777     const XMMRegister x0 = xmm0;
5778     const XMMRegister x1 = xmm1;
5779     const XMMRegister x2 = xmm2;
5780     const XMMRegister x3 = xmm3;
5781 
5782     const XMMRegister x4 = xmm4;
5783     const XMMRegister x5 = xmm5;
5784     const XMMRegister x6 = xmm6;
5785     const XMMRegister x7 = xmm7;
5786 
5787     const Register tmp = r11;
5788 
5789     BLOCK_COMMENT("Entry:");
5790     __ enter(); // required for proper stackwalking of RuntimeStub frame
5791 
5792     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5793 
5794     __ leave(); // required for proper stackwalking of RuntimeStub frame
5795     __ ret(0);
5796 
5797     return start;
5798 
5799   }
5800 
5801   address generate_libmPow() {
5802     StubCodeMark mark(this, "StubRoutines", "libmPow");
5803 
5804     address start = __ pc();
5805 
5806     const XMMRegister x0 = xmm0;
5807     const XMMRegister x1 = xmm1;
5808     const XMMRegister x2 = xmm2;
5809     const XMMRegister x3 = xmm3;
5810 
5811     const XMMRegister x4 = xmm4;
5812     const XMMRegister x5 = xmm5;
5813     const XMMRegister x6 = xmm6;
5814     const XMMRegister x7 = xmm7;
5815 
5816     const Register tmp1 = r8;
5817     const Register tmp2 = r9;
5818     const Register tmp3 = r10;
5819     const Register tmp4 = r11;
5820 
5821     BLOCK_COMMENT("Entry:");
5822     __ enter(); // required for proper stackwalking of RuntimeStub frame
5823 
5824     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5825 
5826     __ leave(); // required for proper stackwalking of RuntimeStub frame
5827     __ ret(0);
5828 
5829     return start;
5830 
5831   }
5832 
5833   address generate_libmSin() {
5834     StubCodeMark mark(this, "StubRoutines", "libmSin");
5835 
5836     address start = __ pc();
5837 
5838     const XMMRegister x0 = xmm0;
5839     const XMMRegister x1 = xmm1;
5840     const XMMRegister x2 = xmm2;
5841     const XMMRegister x3 = xmm3;
5842 
5843     const XMMRegister x4 = xmm4;
5844     const XMMRegister x5 = xmm5;
5845     const XMMRegister x6 = xmm6;
5846     const XMMRegister x7 = xmm7;
5847 
5848     const Register tmp1 = r8;
5849     const Register tmp2 = r9;
5850     const Register tmp3 = r10;
5851     const Register tmp4 = r11;
5852 
5853     BLOCK_COMMENT("Entry:");
5854     __ enter(); // required for proper stackwalking of RuntimeStub frame
5855 
5856 #ifdef _WIN64
5857     __ push(rsi);
5858     __ push(rdi);
5859 #endif
5860     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5861 
5862 #ifdef _WIN64
5863     __ pop(rdi);
5864     __ pop(rsi);
5865 #endif
5866 
5867     __ leave(); // required for proper stackwalking of RuntimeStub frame
5868     __ ret(0);
5869 
5870     return start;
5871 
5872   }
5873 
5874   address generate_libmCos() {
5875     StubCodeMark mark(this, "StubRoutines", "libmCos");
5876 
5877     address start = __ pc();
5878 
5879     const XMMRegister x0 = xmm0;
5880     const XMMRegister x1 = xmm1;
5881     const XMMRegister x2 = xmm2;
5882     const XMMRegister x3 = xmm3;
5883 
5884     const XMMRegister x4 = xmm4;
5885     const XMMRegister x5 = xmm5;
5886     const XMMRegister x6 = xmm6;
5887     const XMMRegister x7 = xmm7;
5888 
5889     const Register tmp1 = r8;
5890     const Register tmp2 = r9;
5891     const Register tmp3 = r10;
5892     const Register tmp4 = r11;
5893 
5894     BLOCK_COMMENT("Entry:");
5895     __ enter(); // required for proper stackwalking of RuntimeStub frame
5896 
5897 #ifdef _WIN64
5898     __ push(rsi);
5899     __ push(rdi);
5900 #endif
5901     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5902 
5903 #ifdef _WIN64
5904     __ pop(rdi);
5905     __ pop(rsi);
5906 #endif
5907 
5908     __ leave(); // required for proper stackwalking of RuntimeStub frame
5909     __ ret(0);
5910 
5911     return start;
5912 
5913   }
5914 
5915   address generate_libmTan() {
5916     StubCodeMark mark(this, "StubRoutines", "libmTan");
5917 
5918     address start = __ pc();
5919 
5920     const XMMRegister x0 = xmm0;
5921     const XMMRegister x1 = xmm1;
5922     const XMMRegister x2 = xmm2;
5923     const XMMRegister x3 = xmm3;
5924 
5925     const XMMRegister x4 = xmm4;
5926     const XMMRegister x5 = xmm5;
5927     const XMMRegister x6 = xmm6;
5928     const XMMRegister x7 = xmm7;
5929 
5930     const Register tmp1 = r8;
5931     const Register tmp2 = r9;
5932     const Register tmp3 = r10;
5933     const Register tmp4 = r11;
5934 
5935     BLOCK_COMMENT("Entry:");
5936     __ enter(); // required for proper stackwalking of RuntimeStub frame
5937 
5938 #ifdef _WIN64
5939     __ push(rsi);
5940     __ push(rdi);
5941 #endif
5942     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5943 
5944 #ifdef _WIN64
5945     __ pop(rdi);
5946     __ pop(rsi);
5947 #endif
5948 
5949     __ leave(); // required for proper stackwalking of RuntimeStub frame
5950     __ ret(0);
5951 
5952     return start;
5953 
5954   }
5955 
5956 #undef __
5957 #define __ masm->
5958 
5959   // Continuation point for throwing of implicit exceptions that are
5960   // not handled in the current activation. Fabricates an exception
5961   // oop and initiates normal exception dispatching in this
5962   // frame. Since we need to preserve callee-saved values (currently
5963   // only for C2, but done for C1 as well) we need a callee-saved oop
5964   // map and therefore have to make these stubs into RuntimeStubs
5965   // rather than BufferBlobs.  If the compiler needs all registers to
5966   // be preserved between the fault point and the exception handler
5967   // then it must assume responsibility for that in
5968   // AbstractCompiler::continuation_for_implicit_null_exception or
5969   // continuation_for_implicit_division_by_zero_exception. All other
5970   // implicit exceptions (e.g., NullPointerException or
5971   // AbstractMethodError on entry) are either at call sites or
5972   // otherwise assume that stack unwinding will be initiated, so
5973   // caller saved registers were assumed volatile in the compiler.
5974   address generate_throw_exception(const char* name,
5975                                    address runtime_entry,
5976                                    Register arg1 = noreg,
5977                                    Register arg2 = noreg) {
5978     // Information about frame layout at time of blocking runtime call.
5979     // Note that we only have to preserve callee-saved registers since
5980     // the compilers are responsible for supplying a continuation point
5981     // if they expect all registers to be preserved.
5982     enum layout {
5983       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5984       rbp_off2,
5985       return_off,
5986       return_off2,
5987       framesize // inclusive of return address
5988     };
5989 
5990     int insts_size = 512;
5991     int locs_size  = 64;
5992 
5993     CodeBuffer code(name, insts_size, locs_size);
5994     OopMapSet* oop_maps  = new OopMapSet();
5995     MacroAssembler* masm = new MacroAssembler(&code);
5996 
5997     address start = __ pc();
5998 
5999     // This is an inlined and slightly modified version of call_VM
6000     // which has the ability to fetch the return PC out of
6001     // thread-local storage and also sets up last_Java_sp slightly
6002     // differently than the real call_VM
6003 
6004     __ enter(); // required for proper stackwalking of RuntimeStub frame
6005 
6006     assert(is_even(framesize/2), "sp not 16-byte aligned");
6007 
6008     // return address and rbp are already in place
6009     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
6010 
6011     int frame_complete = __ pc() - start;
6012 
6013     // Set up last_Java_sp and last_Java_fp
6014     address the_pc = __ pc();
6015     __ set_last_Java_frame(rsp, rbp, the_pc);
6016     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
6017 
6018     // Call runtime
6019     if (arg1 != noreg) {
6020       assert(arg2 != c_rarg1, "clobbered");
6021       __ movptr(c_rarg1, arg1);
6022     }
6023     if (arg2 != noreg) {
6024       __ movptr(c_rarg2, arg2);
6025     }
6026     __ movptr(c_rarg0, r15_thread);
6027     BLOCK_COMMENT("call runtime_entry");
6028     __ call(RuntimeAddress(runtime_entry));
6029 
6030     // Generate oop map
6031     OopMap* map = new OopMap(framesize, 0);
6032 
6033     oop_maps->add_gc_map(the_pc - start, map);
6034 
6035     __ reset_last_Java_frame(true);
6036 
6037     __ leave(); // required for proper stackwalking of RuntimeStub frame
6038 
6039     // check for pending exceptions
6040 #ifdef ASSERT
6041     Label L;
6042     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
6043             (int32_t) NULL_WORD);
6044     __ jcc(Assembler::notEqual, L);
6045     __ should_not_reach_here();
6046     __ bind(L);
6047 #endif // ASSERT
6048     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6049 
6050 
6051     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6052     RuntimeStub* stub =
6053       RuntimeStub::new_runtime_stub(name,
6054                                     &code,
6055                                     frame_complete,
6056                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6057                                     oop_maps, false);
6058     return stub->entry_point();
6059   }
6060 
6061   void create_control_words() {
6062     // Round to nearest, 53-bit mode, exceptions masked
6063     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
6064     // Round to zero, 53-bit mode, exceptions masked
6065     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
6066     // Round to nearest, 24-bit mode, exceptions masked
6067     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
6068     // Round to nearest, 64-bit mode, exceptions masked
6069     StubRoutines::_mxcsr_std           = 0x1F80;
6070     // Note: the following two constants are 80-bit values;
6071     //       their layout is critical for correct loading by the FPU.
6072     // Bias for strict fp multiply/divide
6073     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
6074     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
6075     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
6076     // Un-Bias for strict fp multiply/divide
6077     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
6078     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
6079     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
6080   }
6081 
6082   // Initialization
6083   void generate_initial() {
6084     // Generates all stubs and initializes the entry points
6085 
6086     // These platform-specific settings are needed by generate_call_stub()
6087     create_control_words();
6088 
6089     // Entry points that exist on all platforms. Note: This is code
6090     // that could be shared among different platforms - however the
6091     // benefit seems to be smaller than the disadvantage of having a
6092     // much more complicated generator structure. See also the comment
6093     // in stubRoutines.hpp.
6094 
6095     StubRoutines::_forward_exception_entry = generate_forward_exception();
6096 
6097     StubRoutines::_call_stub_entry =
6098       generate_call_stub(StubRoutines::_call_stub_return_address);
6099 
6100     // is referenced by megamorphic call
6101     StubRoutines::_catch_exception_entry = generate_catch_exception();
6102 
6103     // atomic calls
6104     StubRoutines::_atomic_xchg_entry          = generate_atomic_xchg();
6105     StubRoutines::_atomic_xchg_long_entry     = generate_atomic_xchg_long();
6106     StubRoutines::_atomic_cmpxchg_entry       = generate_atomic_cmpxchg();
6107     StubRoutines::_atomic_cmpxchg_byte_entry  = generate_atomic_cmpxchg_byte();
6108     StubRoutines::_atomic_cmpxchg_long_entry  = generate_atomic_cmpxchg_long();
6109     StubRoutines::_atomic_add_entry           = generate_atomic_add();
6110     StubRoutines::_atomic_add_long_entry      = generate_atomic_add_long();
6111     StubRoutines::_fence_entry                = generate_orderaccess_fence();
6112 
6113     // platform dependent
6114     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
6115     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
6116 
6117     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
6118 
6119     // Build this early so it's available for the interpreter.
6120     StubRoutines::_throw_StackOverflowError_entry =
6121       generate_throw_exception("StackOverflowError throw_exception",
6122                                CAST_FROM_FN_PTR(address,
6123                                                 SharedRuntime::
6124                                                 throw_StackOverflowError));
6125     StubRoutines::_throw_delayed_StackOverflowError_entry =
6126       generate_throw_exception("delayed StackOverflowError throw_exception",
6127                                CAST_FROM_FN_PTR(address,
6128                                                 SharedRuntime::
6129                                                 throw_delayed_StackOverflowError));
6130     if (UseCRC32Intrinsics) {
6131       // set table address before stub generation, which uses it
6132       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
6133       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6134     }
6135 
6136     if (UseCRC32CIntrinsics) {
6137       bool supports_clmul = VM_Version::supports_clmul();
6138       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
6139       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
6140       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
6141     }
6142     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
6143       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
6144           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
6145           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
6146         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
6147         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
6148         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
6149         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
6150         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
6151         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
6152         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
6153         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
6154         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
6155         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
6156         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
6157         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
6158         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
6159         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
6160       }
6161       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
6162         StubRoutines::_dexp = generate_libmExp();
6163       }
6164       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
6165         StubRoutines::_dlog = generate_libmLog();
6166       }
6167       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
6168         StubRoutines::_dlog10 = generate_libmLog10();
6169       }
6170       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
6171         StubRoutines::_dpow = generate_libmPow();
6172       }
6173       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
6174         StubRoutines::_dsin = generate_libmSin();
6175       }
6176       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
6177         StubRoutines::_dcos = generate_libmCos();
6178       }
6179       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
6180         StubRoutines::_dtan = generate_libmTan();
6181       }
6182     }
6183   }
6184 
6185   void generate_all() {
6186     // Generates all stubs and initializes the entry points
6187 
6188     // These entry points require SharedInfo::stack0 to be set up in
6189     // non-core builds and need to be relocatable, so they each
6190     // fabricate a RuntimeStub internally.
6191     StubRoutines::_throw_AbstractMethodError_entry =
6192       generate_throw_exception("AbstractMethodError throw_exception",
6193                                CAST_FROM_FN_PTR(address,
6194                                                 SharedRuntime::
6195                                                 throw_AbstractMethodError));
6196 
6197     StubRoutines::_throw_IncompatibleClassChangeError_entry =
6198       generate_throw_exception("IncompatibleClassChangeError throw_exception",
6199                                CAST_FROM_FN_PTR(address,
6200                                                 SharedRuntime::
6201                                                 throw_IncompatibleClassChangeError));
6202 
6203     StubRoutines::_throw_NullPointerException_at_call_entry =
6204       generate_throw_exception("NullPointerException at call throw_exception",
6205                                CAST_FROM_FN_PTR(address,
6206                                                 SharedRuntime::
6207                                                 throw_NullPointerException_at_call));
6208 
6209     // entry points that are platform specific
6210     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
6211     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
6212     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
6213     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
6214 
6215     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
6216     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
6217     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
6218     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
6219     StubRoutines::x86::_vector_float_sign_mask = generate_vector_fp_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
6220     StubRoutines::x86::_vector_float_sign_flip = generate_vector_fp_mask("vector_float_sign_flip", 0x8000000080000000);
6221     StubRoutines::x86::_vector_double_sign_mask = generate_vector_fp_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
6222     StubRoutines::x86::_vector_double_sign_flip = generate_vector_fp_mask("vector_double_sign_flip", 0x8000000000000000);
6223     StubRoutines::x86::_vector_all_bits_set = generate_vector_fp_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
6224     StubRoutines::x86::_vector_byte_bitset = generate_vector_fp_mask("vector_byte_bitset", 0x0101010101010101);
6225     StubRoutines::x86::_vector_long_perm_mask = generate_vector_custom_i32("vector_long_perm_mask", Assembler::AVX_512bit,
6226                                                                            0, 2, 4, 6, 8, 10, 12, 14);
6227     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_fp_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
6228     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
6229     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_fp_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
6230     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_fp_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
6231     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
6232                                                                         0xFFFFFFFF, 0, 0, 0);
6233     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
6234                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
6235     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_fp_mask("vector_int_shuffle_mask", 0x0302010003020100);
6236     StubRoutines::x86::_vector_int_size_mask = generate_vector_fp_mask("vector_int_size_mask", 0x0000000400000004);
6237     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_fp_mask("vector_short_shuffle_mask", 0x0100010001000100);
6238     StubRoutines::x86::_vector_short_size_mask = generate_vector_fp_mask("vector_short_size_mask", 0x0002000200020002);
6239     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_fp_mask("vector_long_shuffle_mask", 0x0000000100000000);
6240     StubRoutines::x86::_vector_long_size_mask = generate_vector_fp_mask("vector_long_size_mask", 0x0000000200000002);
6241 
6242     // support for verify_oop (must happen after universe_init)
6243     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
6244 
6245     // arraycopy stubs used by compilers
6246     generate_arraycopy_stubs();
6247 
6248     // don't bother generating these AES intrinsic stubs unless global flag is set
6249     if (UseAESIntrinsics) {
6250       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
6251       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6252       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6253       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
6254       if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
6255         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
6256       } else {
6257         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
6258       }
6259     }
6260     if (UseAESCTRIntrinsics) {
6261       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
6262       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
6263     }
6264 
6265     if (UseSHA1Intrinsics) {
6266       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
6267       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
6268       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
6269       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
6270     }
6271     if (UseSHA256Intrinsics) {
6272       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
6273       char* dst = (char*)StubRoutines::x86::_k256_W;
6274       char* src = (char*)StubRoutines::x86::_k256;
6275       for (int ii = 0; ii < 16; ++ii) {
6276         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
6277         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
6278       }
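           // _k256_W now holds each 16-byte group of round constants twice in
           // a row, so a single 256-bit load sees the same four constants in
           // both 128-bit lanes, as the AVX2 SHA-256 compression code expects.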
6279       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
6280       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
6281       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
6282       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
6283     }
6284     if (UseSHA512Intrinsics) {
6285       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
6286       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
6287       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
6288       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
6289     }
6290 
6291     // Generate GHASH intrinsics code
6292     if (UseGHASHIntrinsics) {
6293       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
6294       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
6295       if (VM_Version::supports_avx()) {
6296         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
6297         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
6298         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
6299       } else {
6300         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
6301       }
6302     }
6303 
6304     if (UseBASE64Intrinsics) {
6305       StubRoutines::x86::_and_mask = base64_and_mask_addr();
6306       StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr();
6307       StubRoutines::x86::_base64_charset = base64_charset_addr();
6308       StubRoutines::x86::_url_charset = base64url_charset_addr();
6309       StubRoutines::x86::_gather_mask = base64_gather_mask_addr();
6310       StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr();
6311       StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr();
6312       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
6313     }
6314 
6315     // Safefetch stubs.
6316     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
6317                                                        &StubRoutines::_safefetch32_fault_pc,
6318                                                        &StubRoutines::_safefetch32_continuation_pc);
6319     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
6320                                                        &StubRoutines::_safefetchN_fault_pc,
6321                                                        &StubRoutines::_safefetchN_continuation_pc);
6322 
6323     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
6324     if (bs_nm != NULL) {
6325       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
6326     }
6327 #ifdef COMPILER2
6328     if (UseMultiplyToLenIntrinsic) {
6329       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6330     }
6331     if (UseSquareToLenIntrinsic) {
6332       StubRoutines::_squareToLen = generate_squareToLen();
6333     }
6334     if (UseMulAddIntrinsic) {
6335       StubRoutines::_mulAdd = generate_mulAdd();
6336     }
6337 #ifndef _WINDOWS
6338     if (UseMontgomeryMultiplyIntrinsic) {
6339       StubRoutines::_montgomeryMultiply
6340         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
6341     }
6342     if (UseMontgomerySquareIntrinsic) {
6343       StubRoutines::_montgomerySquare
6344         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
6345     }
6346 #endif // !_WINDOWS
6347 #endif // COMPILER2
6348 
6349     if (UseVectorizedMismatchIntrinsic) {
6350       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
6351     }
6352 
6353 #ifdef __VECTOR_API_MATH_INTRINSICS_COMMON
6354     if (UseVectorApiIntrinsics) {
6355       if (UseAVX >= 1) {
6356           #if defined(__VECTOR_API_MATH_INTRINSICS_LINUX)
6357           if (UseAVX > 2) {
6358               StubRoutines::_vector_float512_exp = CAST_FROM_FN_PTR(address, __svml_expf16_ha_z0);
6359               StubRoutines::_vector_double512_exp = CAST_FROM_FN_PTR(address, __svml_exp8_ha_z0); 
6360               StubRoutines::_vector_float512_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f16_ha_z0);
6361               StubRoutines::_vector_double512_expm1 = CAST_FROM_FN_PTR(address, __svml_expm18_ha_z0);
6362               StubRoutines::_vector_float512_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf16_ha_z0);
6363               StubRoutines::_vector_double512_log1p = CAST_FROM_FN_PTR(address, __svml_log1p8_ha_z0);
6364               StubRoutines::_vector_float512_log = CAST_FROM_FN_PTR(address, __svml_logf16_ha_z0);
6365               StubRoutines::_vector_double512_log = CAST_FROM_FN_PTR(address, __svml_log8_ha_z0);
6366               StubRoutines::_vector_float512_log10 = CAST_FROM_FN_PTR(address, __svml_log10f16_ha_z0);
6367               StubRoutines::_vector_double512_log10 = CAST_FROM_FN_PTR(address, __svml_log108_ha_z0);
6368               StubRoutines::_vector_float512_sin = CAST_FROM_FN_PTR(address, __svml_sinf16_ha_z0);      
6369               StubRoutines::_vector_double512_sin = CAST_FROM_FN_PTR(address, __svml_sin8_ha_z0);
6370               StubRoutines::_vector_float512_cos = CAST_FROM_FN_PTR(address, __svml_cosf16_ha_z0);      
6371               StubRoutines::_vector_double512_cos = CAST_FROM_FN_PTR(address, __svml_cos8_ha_z0);
6372               StubRoutines::_vector_float512_tan = CAST_FROM_FN_PTR(address, __svml_tanf16_ha_z0);
6373               StubRoutines::_vector_double512_tan = CAST_FROM_FN_PTR(address, __svml_tan8_ha_z0);      
6374               StubRoutines::_vector_float512_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf16_ha_z0);
6375               StubRoutines::_vector_double512_sinh = CAST_FROM_FN_PTR(address, __svml_sinh8_ha_z0);
6376               StubRoutines::_vector_float512_cosh = CAST_FROM_FN_PTR(address, __svml_coshf16_ha_z0);
6377               StubRoutines::_vector_double512_cosh = CAST_FROM_FN_PTR(address, __svml_cosh8_ha_z0);
6378               StubRoutines::_vector_float512_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf16_ha_z0);
6379               StubRoutines::_vector_double512_tanh = CAST_FROM_FN_PTR(address, __svml_tanh8_ha_z0);
6380               StubRoutines::_vector_float512_acos = CAST_FROM_FN_PTR(address, __svml_acosf16_ha_z0);
6381               StubRoutines::_vector_double512_acos = CAST_FROM_FN_PTR(address, __svml_acos8_ha_z0);
6382               StubRoutines::_vector_float512_asin = CAST_FROM_FN_PTR(address, __svml_asinf16_ha_z0);
6383               StubRoutines::_vector_double512_asin = CAST_FROM_FN_PTR(address, __svml_asin8_ha_z0);
6384               StubRoutines::_vector_float512_atan = CAST_FROM_FN_PTR(address, __svml_atanf16_ha_z0);
6385               StubRoutines::_vector_double512_atan = CAST_FROM_FN_PTR(address, __svml_atan8_ha_z0);
6386               StubRoutines::_vector_float512_pow = CAST_FROM_FN_PTR(address, __svml_powf16_ha_z0);
6387               StubRoutines::_vector_double512_pow = CAST_FROM_FN_PTR(address, __svml_pow8_ha_z0);
6388               StubRoutines::_vector_float512_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf16_ha_z0);
6389               StubRoutines::_vector_double512_hypot = CAST_FROM_FN_PTR(address, __svml_hypot8_ha_z0);
6390               StubRoutines::_vector_float512_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf16_ha_z0);
6391               StubRoutines::_vector_double512_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt8_ha_z0);
6392               StubRoutines::_vector_float512_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f16_ha_z0);
6393               StubRoutines::_vector_double512_atan2 = CAST_FROM_FN_PTR(address, __svml_atan28_ha_z0);
6394           }
6395           #endif
6396         if (UseAVX == 1) {
6397           StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_e9);  
6398           StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_e9);
6399           StubRoutines::_vector_float256_exp = CAST_FROM_FN_PTR(address, __svml_expf8_ha_e9); 
6400           StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_e9);  
6401           StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_e9); 
6402           StubRoutines::_vector_double256_exp = CAST_FROM_FN_PTR(address, __svml_exp4_ha_e9);
6403           StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_e9);
6404           StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_e9);
6405           StubRoutines::_vector_float256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f8_ha_e9);
6406           StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_e9);
6407           StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_e9);
6408           StubRoutines::_vector_double256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm14_ha_e9);
6409           StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_e9);
6410           StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_e9);
6411           StubRoutines::_vector_float256_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf8_ha_e9);
6412           StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_e9);
6413           StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_e9);
6414           StubRoutines::_vector_double256_log1p = CAST_FROM_FN_PTR(address, __svml_log1p4_ha_e9);
6415           StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_e9);
6416           StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_e9);
6417           StubRoutines::_vector_float256_log = CAST_FROM_FN_PTR(address, __svml_logf8_ha_e9);
6418           StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_e9);
6419           StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_e9);
6420           StubRoutines::_vector_double256_log = CAST_FROM_FN_PTR(address, __svml_log4_ha_e9);
6421           StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_e9);
6422           StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_e9);
6423           StubRoutines::_vector_float256_log10 = CAST_FROM_FN_PTR(address, __svml_log10f8_ha_e9);
6424           StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_e9);
6425           StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_e9);
6426           StubRoutines::_vector_double256_log10 = CAST_FROM_FN_PTR(address, __svml_log104_ha_e9);
6427           StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_e9);
6428           StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_e9);
6429           StubRoutines::_vector_float256_sin = CAST_FROM_FN_PTR(address, __svml_sinf8_ha_e9);
6430           StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_e9);
6431           StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_e9);
6432           StubRoutines::_vector_double256_sin = CAST_FROM_FN_PTR(address, __svml_sin4_ha_e9);
6433           StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_e9);
6434           StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_e9);
6435           StubRoutines::_vector_float256_cos = CAST_FROM_FN_PTR(address, __svml_cosf8_ha_e9);
6436           StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_e9);
6437           StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_e9);
6438           StubRoutines::_vector_double256_cos = CAST_FROM_FN_PTR(address, __svml_cos4_ha_e9);
6439           StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_e9);
6440           StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_e9);
6441           StubRoutines::_vector_float256_tan = CAST_FROM_FN_PTR(address, __svml_tanf8_ha_e9);
6442           StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_e9);
6443           StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_e9);
6444           StubRoutines::_vector_double256_tan = CAST_FROM_FN_PTR(address, __svml_tan4_ha_e9);
6445           StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_e9);
6446           StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_e9);
6447           StubRoutines::_vector_float256_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf8_ha_e9);
6448           StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_e9);
6449           StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_e9);
6450           StubRoutines::_vector_double256_sinh = CAST_FROM_FN_PTR(address, __svml_sinh4_ha_e9);
6451           StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_e9);
6452           StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_e9);
6453           StubRoutines::_vector_float256_cosh = CAST_FROM_FN_PTR(address, __svml_coshf8_ha_e9);
6454           StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_e9);
6455           StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_e9);
6456           StubRoutines::_vector_double256_cosh = CAST_FROM_FN_PTR(address, __svml_cosh4_ha_e9);
6457           StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_e9);
6458           StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_e9);
6459           StubRoutines::_vector_float256_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf8_ha_e9);
6460           StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_e9);
6461           StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_e9);
6462           StubRoutines::_vector_double256_tanh = CAST_FROM_FN_PTR(address, __svml_tanh4_ha_e9);
6463           StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_e9);
6464           StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_e9);
6465           StubRoutines::_vector_float256_acos = CAST_FROM_FN_PTR(address, __svml_acosf8_ha_e9);
6466           StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_e9);
6467           StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_e9);
6468           StubRoutines::_vector_double256_acos = CAST_FROM_FN_PTR(address, __svml_acos4_ha_e9);
6469           StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_e9);
6470           StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_e9);
6471           StubRoutines::_vector_float256_asin = CAST_FROM_FN_PTR(address, __svml_asinf8_ha_e9);
6472           StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_e9);
6473           StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_e9);
6474           StubRoutines::_vector_double256_asin = CAST_FROM_FN_PTR(address, __svml_asin4_ha_e9);
6475           StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_e9);
6476           StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_e9);
6477           StubRoutines::_vector_float256_atan = CAST_FROM_FN_PTR(address, __svml_atanf8_ha_e9);
6478           StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_e9);
6479           StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_e9);
6480           StubRoutines::_vector_double256_atan = CAST_FROM_FN_PTR(address, __svml_atan4_ha_e9);
6481           StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
6482           StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
6483           StubRoutines::_vector_float256_pow = CAST_FROM_FN_PTR(address, __svml_powf8_ha_e9);
6484           StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_e9);
6485           StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_e9);
6486           StubRoutines::_vector_double256_pow = CAST_FROM_FN_PTR(address, __svml_pow4_ha_e9);
6487           StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
6488           StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
6489           StubRoutines::_vector_float256_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_e9);
6490           StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_e9);
6491           StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_e9);
6492           StubRoutines::_vector_double256_hypot = CAST_FROM_FN_PTR(address, __svml_hypot4_ha_e9);
6493           StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_e9);
6494           StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_e9);
6495           StubRoutines::_vector_float256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf8_ha_e9);
6496           StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_e9);
6497           StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_e9);
6498           StubRoutines::_vector_double256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt4_ha_e9);
6499           StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_e9);
6500           StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_e9);
6501           StubRoutines::_vector_float256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f8_ha_e9);
6502           StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_e9);
6503           StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_e9);
6504           StubRoutines::_vector_double256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan24_ha_e9);  
6505         } else {
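               // AVX2 ("_l9") SVML variants. By the SVML CPU-dispatch naming
               // convention, "_ex" targets SSE, "_e9" AVX, "_l9" AVX2 and
               // "_z0" AVX-512.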
6507           StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_l9);  
6508           StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_l9);
6509           StubRoutines::_vector_float256_exp = CAST_FROM_FN_PTR(address, __svml_expf8_ha_l9); 
6510           StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_l9);  
6511           StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_l9); 
6512           StubRoutines::_vector_double256_exp = CAST_FROM_FN_PTR(address, __svml_exp4_ha_l9);
6513           StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_l9);
6514           StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_l9);
6515           StubRoutines::_vector_float256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f8_ha_l9);
6516           StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_l9);
6517           StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_l9);
6518           StubRoutines::_vector_double256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm14_ha_l9);
6519           StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_l9);
6520           StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_l9);
6521           StubRoutines::_vector_float256_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf8_ha_l9);
6522           StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_l9);
6523           StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_l9);
6524           StubRoutines::_vector_double256_log1p = CAST_FROM_FN_PTR(address, __svml_log1p4_ha_l9);
6525           StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_l9);
6526           StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_l9);
6527           StubRoutines::_vector_float256_log = CAST_FROM_FN_PTR(address, __svml_logf8_ha_l9);
6528           StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_l9);
6529           StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_l9);
6530           StubRoutines::_vector_double256_log = CAST_FROM_FN_PTR(address, __svml_log4_ha_l9);
6531           StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_l9);
6532           StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_l9);
6533           StubRoutines::_vector_float256_log10 = CAST_FROM_FN_PTR(address, __svml_log10f8_ha_l9);
6534           StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_l9);
6535           StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_l9);
6536           StubRoutines::_vector_double256_log10 = CAST_FROM_FN_PTR(address, __svml_log104_ha_l9);
6537           StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_l9);
6538           StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_l9);
6539           StubRoutines::_vector_float256_sin = CAST_FROM_FN_PTR(address, __svml_sinf8_ha_l9);
6540           StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_l9);
6541           StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_l9);
6542           StubRoutines::_vector_double256_sin = CAST_FROM_FN_PTR(address, __svml_sin4_ha_l9);
6543           StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_l9);
6544           StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_l9);
6545           StubRoutines::_vector_float256_cos = CAST_FROM_FN_PTR(address, __svml_cosf8_ha_l9);
6546           StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_l9);
6547           StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_l9);
6548           StubRoutines::_vector_double256_cos = CAST_FROM_FN_PTR(address, __svml_cos4_ha_l9);
6549           StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_l9);
6550           StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_l9);
6551           StubRoutines::_vector_float256_tan = CAST_FROM_FN_PTR(address, __svml_tanf8_ha_l9);
6552           StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_l9);
6553           StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_l9);
6554           StubRoutines::_vector_double256_tan = CAST_FROM_FN_PTR(address, __svml_tan4_ha_l9);
6555           StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_l9);
6556           StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_l9);
6557           StubRoutines::_vector_float256_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf8_ha_l9);
6558           StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_l9);
6559           StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_l9);
6560           StubRoutines::_vector_double256_sinh = CAST_FROM_FN_PTR(address, __svml_sinh4_ha_l9);
6561           StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_l9);
6562           StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_l9);
6563           StubRoutines::_vector_float256_cosh = CAST_FROM_FN_PTR(address, __svml_coshf8_ha_l9);
6564           StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_l9);
6565           StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_l9);
6566           StubRoutines::_vector_double256_cosh = CAST_FROM_FN_PTR(address, __svml_cosh4_ha_l9);
6567           StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_l9);
6568           StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_l9);
6569           StubRoutines::_vector_float256_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf8_ha_l9);
6570           StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_l9);
6571           StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_l9);
6572           StubRoutines::_vector_double256_tanh = CAST_FROM_FN_PTR(address, __svml_tanh4_ha_l9);
6573           StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_l9);
6574           StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_l9);
6575           StubRoutines::_vector_float256_acos = CAST_FROM_FN_PTR(address, __svml_acosf8_ha_l9);
6576           StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_l9);
6577           StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_l9);
6578           StubRoutines::_vector_double256_acos = CAST_FROM_FN_PTR(address, __svml_acos4_ha_l9);
6579           StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_l9);
6580           StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_l9);
6581           StubRoutines::_vector_float256_asin = CAST_FROM_FN_PTR(address, __svml_asinf8_ha_l9);
6582           StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_l9);
6583           StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_l9);
6584           StubRoutines::_vector_double256_asin = CAST_FROM_FN_PTR(address, __svml_asin4_ha_l9);
6585           StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_l9);
6586           StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_l9);
6587           StubRoutines::_vector_float256_atan = CAST_FROM_FN_PTR(address, __svml_atanf8_ha_l9);
6588           StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_l9);
6589           StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_l9);
6590           StubRoutines::_vector_double256_atan = CAST_FROM_FN_PTR(address, __svml_atan4_ha_l9);
6591           StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
6592           StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
6593           StubRoutines::_vector_float256_pow = CAST_FROM_FN_PTR(address, __svml_powf8_ha_l9);
6594           StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_l9);
6595           StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_l9);
6596           StubRoutines::_vector_double256_pow = CAST_FROM_FN_PTR(address, __svml_pow4_ha_l9);
6597           StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
6598           StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
6599           StubRoutines::_vector_float256_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_l9);
6600           StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_l9);
6601           StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_l9);
6602           StubRoutines::_vector_double256_hypot = CAST_FROM_FN_PTR(address, __svml_hypot4_ha_l9);
6603           StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_l9);
6604           StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_l9);
6605           StubRoutines::_vector_float256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf8_ha_l9);
6606           StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_l9);
6607           StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_l9);
6608           StubRoutines::_vector_double256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt4_ha_l9);
6609           StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_l9);
6610           StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_l9);
6611           StubRoutines::_vector_float256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f8_ha_l9);
6612           StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_l9);
6613           StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_l9);
6614           StubRoutines::_vector_double256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan24_ha_l9);
6615         }
6618       } else if (UseSSE >= 2) {
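             // SSE2-compatible "_ex" SVML variants. Only the 64- and 128-bit
             // vector stubs are registered here, since SSE registers are at
             // most 128 bits wide.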
6619         StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_ex);  
6620         StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_ex);  
6621         StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_ex);  
6622         StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_ex);  
6623         StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_ex);
6624         StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_ex);  
6625         StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_ex);
6626         StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_ex);   
6627         StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_ex);   
6628         StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_ex);  
6629         StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_ex);
6630         StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_ex);      
6631         StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_ex);
6632         StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_ex);  
6633         StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_ex);
6634         StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_ex);      
6635         StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_ex);
6636         StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_ex);  
6637         StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_ex);
6638         StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_ex);      
6639         StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_ex);
6640         StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_ex);  
6641         StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_ex);
6642         StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_ex);      
6643         StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_ex);
6644         StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_ex);  
6645         StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_ex);
6646         StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_ex);      
6647         StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_ex);
6648         StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_ex);  
6649         StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_ex);
6650         StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_ex);      
6651         StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_ex);
6652         StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_ex);  
6653         StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_ex);
6654         StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_ex);      
6655         StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_ex);
6656         StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_ex);
6657         StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_ex);  
6658         StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_ex);      
6659         StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_ex);
6660         StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_ex);  
6661         StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_ex);
6662         StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_ex);      
6663         StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_ex);
6664         StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_ex);  
6665         StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_ex);
6666         StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_ex);      
6667         StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_ex);
6668         StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_ex);  
6669         StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_ex);
6670         StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_ex);      
6671         StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_ex);
6672         StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_ex);  
6673         StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_ex);
6674         StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_ex);      
6675         StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_ex);
6676         StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_ex); 
6677         StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_ex);
6678         StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_ex);      
6679         StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_ex);
6680         StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_ex);  
6681         StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_ex);
6682         StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_ex);      
6683         StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);
6684         StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);  
6685         StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_ex);
6686         StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_ex);      
6687         StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);
6688         StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);  
6689         StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_ex);
6690         StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_ex);      
6691       }
6692     }
6693 #endif // __VECTOR_API_MATH_INTRINSICS_COMMON
6694   }
6695 
6696  public:
6697   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6698     if (all) {
6699       generate_all();
6700     } else {
6701       generate_initial();
6702     }
6703   }
6704 }; // end class declaration
6705 
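     // Entry point, called during VM startup (StubRoutines initialization)
     // to generate the platform stub routines into the given code buffer.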
6706 void StubGenerator_generate(CodeBuffer* code, bool all) {
6707   StubGenerator g(code, all);
6708 }