1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "ci/ciUtilities.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "gc/shared/barrierSetNMethod.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "nativeInst_x86.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 #if INCLUDE_ZGC
  49 #include "gc/z/zThreadLocalData.hpp"
  50 #endif
  51 
  52 #ifdef __VECTOR_API_MATH_INTRINSICS_COMMON
  53 // Vector API SVML routines written in assembly
  54 extern "C"
  55 {
  56    float __svml_expf4_ha_ex(float a);
  57    double __svml_exp1_ha_ex(double a); 
  58    double __svml_exp2_ha_ex(double a);     
  59    float __svml_expf4_ha_l9(float a);
  60    float __svml_expf8_ha_l9(float a);
  61    float __svml_expf4_ha_e9(float a);
  62    float __svml_expf8_ha_e9(float a);
  63    float __svml_expf16_ha_z0(float a);
  64    double __svml_exp1_ha_l9(double a);    
  65    double __svml_exp2_ha_l9(double a);
  66    double __svml_exp4_ha_l9(double a);
  67    double __svml_exp1_ha_e9(double a);    
  68    double __svml_exp2_ha_e9(double a);
  69    double __svml_exp4_ha_e9(double a);
  70    double __svml_exp8_ha_z0(double a); 
  71    float  __svml_expm1f4_ha_ex(float a);
  72    double __svml_expm11_ha_ex(double a);
  73    double __svml_expm12_ha_ex(double a);
  74    float  __svml_expm1f4_ha_l9(float a);
  75    float  __svml_expm1f8_ha_l9(float a);
  76    float  __svml_expm1f4_ha_e9(float a);
  77    float  __svml_expm1f8_ha_e9(float a);
  78    float __svml_expm1f16_ha_z0(float a);
  79    double __svml_expm11_ha_l9(double a); 
  80    double __svml_expm12_ha_l9(double a);
  81    double __svml_expm14_ha_l9(double a);
  82    double __svml_expm11_ha_e9(double a); 
  83    double __svml_expm12_ha_e9(double a);
  84    double __svml_expm14_ha_e9(double a);
  85    double __svml_expm18_ha_z0(double a); 
  86    float __svml_log1pf4_ha_l9(float a);
  87    float __svml_log1pf8_ha_l9(float a);
  88    float __svml_log1pf4_ha_e9(float a);
  89    float __svml_log1pf8_ha_e9(float a);
  90    float __svml_log1pf16_ha_z0(float a);
  91    double __svml_log1p1_ha_l9(double a);
  92    double __svml_log1p2_ha_l9(double a);
  93    double __svml_log1p4_ha_l9(double a);
  94    double __svml_log1p1_ha_e9(double a);
  95    double __svml_log1p2_ha_e9(double a);
  96    double __svml_log1p4_ha_e9(double a);
  97    double __svml_log1p8_ha_z0(double a);
  98    float __svml_logf4_ha_l9(float a);
  99    float __svml_logf8_ha_l9(float a);
 100    float __svml_logf4_ha_e9(float a);
 101    float __svml_logf8_ha_e9(float a);
 102    float __svml_logf16_ha_z0(float a);
 103    double __svml_log1_ha_l9(double a);
 104    double __svml_log2_ha_l9(double a);
 105    double __svml_log4_ha_l9(double a);
 106    double __svml_log1_ha_e9(double a);
 107    double __svml_log2_ha_e9(double a);
 108    double __svml_log4_ha_e9(double a);
 109    double __svml_log8_ha_z0(double a);
 110    float __svml_log10f4_ha_l9(float a);
 111    float __svml_log10f8_ha_l9(float a);
 112    float __svml_log10f4_ha_e9(float a);
 113    float __svml_log10f8_ha_e9(float a);
 114    float __svml_log10f16_ha_z0(float a);
 115    double __svml_log101_ha_l9(double a);
 116    double __svml_log102_ha_l9(double a);
 117    double __svml_log104_ha_l9(double a); 
 118    double __svml_log101_ha_e9(double a);
 119    double __svml_log102_ha_e9(double a);
 120    double __svml_log104_ha_e9(double a);
 121    double __svml_log108_ha_z0(double a);
 122    float __svml_sinf4_ha_l9(float a);
 123    float __svml_sinf8_ha_l9(float a);
 124    float __svml_sinf4_ha_e9(float a);
 125    float __svml_sinf8_ha_e9(float a);
 126    float __svml_sinf16_ha_z0(float a);
 127    double __svml_sin1_ha_l9(double a); 
 128    double __svml_sin2_ha_l9(double a);
 129    double __svml_sin4_ha_l9(double a); 
 130    double __svml_sin1_ha_e9(double a); 
 131    double __svml_sin2_ha_e9(double a);
 132    double __svml_sin4_ha_e9(double a); 
 133    double __svml_sin8_ha_z0(double a);
 134    float __svml_cosf4_ha_l9(float a);
 135    float __svml_cosf8_ha_l9(float a);
 136    float __svml_cosf4_ha_e9(float a);
 137    float __svml_cosf8_ha_e9(float a);
 138    float __svml_cosf16_ha_z0(float a);
 139    double  __svml_cos1_ha_l9(double a);
 140    double  __svml_cos2_ha_l9(double a);
 141    double __svml_cos4_ha_l9(double a);
 142    double  __svml_cos1_ha_e9(double a);
 143    double  __svml_cos2_ha_e9(double a);
 144    double __svml_cos4_ha_e9(double a);
 145    double  __svml_cos8_ha_z0(double a);
 146    float __svml_tanf4_ha_l9(float a);
 147    float __svml_tanf8_ha_l9(float a);
 148    float __svml_tanf4_ha_e9(float a);
 149    float __svml_tanf8_ha_e9(float a);
 150    float __svml_tanf16_ha_z0(float a);
 151    double __svml_tan1_ha_l9(double a);
 152    double __svml_tan2_ha_l9(double a);
 153    double __svml_tan4_ha_l9(double a);
 154    double __svml_tan1_ha_e9(double a);
 155    double __svml_tan2_ha_e9(double a);
 156    double __svml_tan4_ha_e9(double a);
 157    double __svml_tan8_ha_z0(double a);
 158    double __svml_sinh1_ha_l9(double a);
 159    double __svml_sinh2_ha_l9(double a);
 160    double __svml_sinh4_ha_l9(double a);
 161    double __svml_sinh1_ha_e9(double a);
 162    double __svml_sinh2_ha_e9(double a);
 163    double __svml_sinh4_ha_e9(double a);
 164    double __svml_sinh8_ha_z0(double a);
 165    float __svml_sinhf4_ha_l9(float a);
 166    float __svml_sinhf8_ha_l9(float a);
 167    float __svml_sinhf4_ha_e9(float a);
 168    float __svml_sinhf8_ha_e9(float a);
 169    float __svml_sinhf16_ha_z0(float a);
 170    double __svml_cosh1_ha_l9(double a);
 171    double __svml_cosh2_ha_l9(double a);
 172    double __svml_cosh4_ha_l9(double a);
 173    double __svml_cosh1_ha_e9(double a);
 174    double __svml_cosh2_ha_e9(double a);
 175    double __svml_cosh4_ha_e9(double a);
 176    double __svml_cosh8_ha_z0(double a);
 177    float __svml_coshf4_ha_l9(float a);
 178    float __svml_coshf8_ha_l9(float a);
 179    float __svml_coshf4_ha_e9(float a);
 180    float __svml_coshf8_ha_e9(float a);
 181    float __svml_coshf16_ha_z0(float a); 
 182    double __svml_tanh1_ha_l9(double a);
 183    double __svml_tanh2_ha_l9(double a);
 184    double __svml_tanh4_ha_l9(double a);
 185    double __svml_tanh1_ha_e9(double a);
 186    double __svml_tanh2_ha_e9(double a);
 187    double __svml_tanh4_ha_e9(double a);
 188    double __svml_tanh8_ha_z0(double a);
 189    float __svml_tanhf4_ha_l9(float a);
 190    float __svml_tanhf8_ha_l9(float a);
 191    float __svml_tanhf4_ha_e9(float a);
 192    float __svml_tanhf8_ha_e9(float a);
 193    float __svml_tanhf16_ha_z0(float a);
 194    float __svml_acosf4_ha_ex(float a);
 195    float __svml_acosf4_ha_l9(float a);
 196    float __svml_acosf8_ha_l9(float a);
 197    float __svml_acosf4_ha_e9(float a);
 198    float __svml_acosf8_ha_e9(float a);
 199    float __svml_acosf16_ha_z0(float a);
 200    double __svml_acos1_ha_ex(double a);
 201    double __svml_acos2_ha_ex(double a);
 202    double __svml_acos1_ha_l9(double a);
 203    double __svml_acos2_ha_l9(double a);
 204    double __svml_acos4_ha_l9(double a);
 205    double __svml_acos1_ha_e9(double a);
 206    double __svml_acos2_ha_e9(double a);
 207    double __svml_acos4_ha_e9(double a);
 208    double __svml_acos8_ha_z0(double a);
 209    float __svml_asinf4_ha_ex(float a);
 210    double __svml_asin1_ha_ex(double a);
 211    double __svml_asin2_ha_ex(double a);
 212    double __svml_asin1_ha_l9(double a);
 213    double __svml_asin2_ha_l9(double a);
 214    double __svml_asin4_ha_l9(double a);
 215    double __svml_asin1_ha_e9(double a);
 216    double __svml_asin2_ha_e9(double a);
 217    double __svml_asin4_ha_e9(double a);
 218    double __svml_asin8_ha_z0(double a);
 219    float __svml_asinf4_ha_l9(float a);
 220    float __svml_asinf8_ha_l9(float a);
 221    float __svml_asinf4_ha_e9(float a);
 222    float __svml_asinf8_ha_e9(float a);
 223    float __svml_asinf16_ha_z0(float a);
 224    float __svml_atanf4_ha_ex(float a);
 225    double __svml_atan1_ha_ex(double a);
 226    double __svml_atan2_ha_ex(double a);
 227    double __svml_atan1_ha_l9(double a);
 228    double __svml_atan2_ha_l9(double a);
 229    double __svml_atan4_ha_l9(double a);
 230    double __svml_atan1_ha_e9(double a);
 231    double __svml_atan2_ha_e9(double a);
 232    double __svml_atan4_ha_e9(double a);
 233    double __svml_atan8_ha_z0(double a);
 234    float __svml_atanf4_ha_l9(float a);
 235    float __svml_atanf8_ha_l9(float a);
 236    float __svml_atanf4_ha_e9(float a);
 237    float __svml_atanf8_ha_e9(float a);
 238    float __svml_atanf16_ha_z0(float a);
 239    float __svml_powf4_ha_l9(float a, float b);
 240    float __svml_powf8_ha_l9(float a, float b);
 241    float __svml_powf4_ha_e9(float a, float b);
 242    float __svml_powf8_ha_e9(float a, float b);
 243    float __svml_powf16_ha_z0(float a, float b);
 244    double __svml_pow1_ha_l9(double a, double b);
 245    double __svml_pow2_ha_l9(double a, double b);
 246    double __svml_pow4_ha_l9(double a, double b);
 247    double __svml_pow1_ha_e9(double a, double b);
 248    double __svml_pow2_ha_e9(double a, double b);
 249    double __svml_pow4_ha_e9(double a, double b);
 250    double __svml_pow8_ha_z0(double a, double b);
 251    float __svml_hypotf4_ha_l9(float a, float b);
 252    float __svml_hypotf8_ha_l9(float a, float b);
 253    float __svml_hypotf4_ha_e9(float a, float b);
 254    float __svml_hypotf8_ha_e9(float a, float b);
 255    float __svml_hypotf16_ha_z0(float a, float b);
 256    double __svml_hypot1_ha_l9(double a, double b);
 257    double __svml_hypot2_ha_l9(double a, double b);
 258    double __svml_hypot4_ha_l9(double a, double b);
 259    double __svml_hypot1_ha_e9(double a, double b);
 260    double __svml_hypot2_ha_e9(double a, double b);
 261    double __svml_hypot4_ha_e9(double a, double b);
 262    double __svml_hypot8_ha_z0(double a, double b);
 263    float __svml_cbrtf4_ha_l9(float a);
 264    float __svml_cbrtf8_ha_l9(float a);
 265    float __svml_cbrtf4_ha_e9(float a);
 266    float __svml_cbrtf8_ha_e9(float a);
 267    float __svml_cbrtf16_ha_z0(float a);
 268    double __svml_cbrt1_ha_l9(double a);
 269    double __svml_cbrt2_ha_l9(double a);
 270    double __svml_cbrt4_ha_l9(double a);
 271    double __svml_cbrt1_ha_e9(double a);
 272    double __svml_cbrt2_ha_e9(double a);
 273    double __svml_cbrt4_ha_e9(double a);
 274    double __svml_cbrt8_ha_z0(double a);
 275    float __svml_atan2f4_ha_l9(float a, float b);
 276    float __svml_atan2f8_ha_l9(float a, float b);
 277    float __svml_atan2f4_ha_e9(float a, float b);
 278    float __svml_atan2f8_ha_e9(float a, float b);
 279    float __svml_atan2f16_ha_z0(float a, float b);
 280    double __svml_atan21_ha_l9(double a, double b);
 281    double __svml_atan22_ha_l9(double a, double b);
 282    double __svml_atan24_ha_l9(double a, double b);
 283    double __svml_atan28_ha_z0(double a, double b);
 284    double __svml_atan21_ha_e9(double a, double b);
 285    double __svml_atan22_ha_e9(double a, double b);
 286    double __svml_atan24_ha_e9(double a, double b);
 287    float __svml_sinf4_ha_ex(float a);
 288    double __svml_sin1_ha_ex(double a);
 289    double __svml_sin2_ha_ex(double a);
 290    float __svml_cosf4_ha_ex(float a);
 291    double __svml_cos1_ha_ex(double a);
 292    double __svml_cos2_ha_ex(double a);
 293    float __svml_tanf4_ha_ex(float a);
 294    double __svml_tan1_ha_ex(double a);
 295    double __svml_tan2_ha_ex(double a);
 296    float __svml_sinhf4_ha_ex(float a);
 297    double __svml_sinh1_ha_ex(double a);
 298    double __svml_sinh2_ha_ex(double a);
 299    float __svml_coshf4_ha_ex(float a);
 300    double __svml_cosh1_ha_ex(double a);
 301    double __svml_cosh2_ha_ex(double a);
 302    float __svml_tanhf4_ha_ex(float a);
 303    double __svml_tanh1_ha_ex(double a);
 304    double __svml_tanh2_ha_ex(double a);
 305    double __svml_log1_ha_ex(double a);
 306    double __svml_log2_ha_ex(double a);
 307    double __svml_log1p1_ha_ex(double a);
 308    double __svml_log1p2_ha_ex(double a);
 309    double __svml_log101_ha_ex(double a);
 310    double __svml_log102_ha_ex(double a);
 311    float __svml_logf4_ha_ex(float a);
 312    float __svml_log1pf4_ha_ex(float a);
 313    float __svml_log10f4_ha_ex(float a);
 314    double __svml_atan21_ha_ex(double a); 
 315    double __svml_atan22_ha_ex(double a); 
 316    float __svml_atan2f4_ha_ex(float a);
 317    float __svml_hypotf4_ha_ex(float a);
 318    double __svml_hypot1_ha_ex(double a);
 319    double __svml_hypot2_ha_ex(double a);
 320    double __svml_pow1_ha_ex(double a);
 321    double __svml_pow2_ha_ex(double a);
 322    float __svml_powf4_ha_ex(float a);
 323    double __svml_cbrt1_ha_ex(double a);
 324    double __svml_cbrt2_ha_ex(double a);
 325    float __svml_cbrtf4_ha_ex(float a);
 326 }
 327 #endif
 328 
 329 // Declaration and definition of StubGenerator (no .hpp file).
 330 // For a more detailed description of the stub routine structure
 331 // see the comment in stubRoutines.hpp
 332 
 333 #define __ _masm->
 334 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
 335 #define a__ ((Assembler*)_masm)->
 336 
 337 #ifdef PRODUCT
 338 #define BLOCK_COMMENT(str) /* nothing */
 339 #else
 340 #define BLOCK_COMMENT(str) __ block_comment(str)
 341 #endif
 342 
 343 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
 344 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
 345 
 346 // Stub Code definitions
 347 
 348 class StubGenerator: public StubCodeGenerator {
 349  private:
 350 
 351 #ifdef PRODUCT
 352 #define inc_counter_np(counter) ((void)0)
 353 #else
 354   void inc_counter_np_(int& counter) {
 355     // This can destroy rscratch1 if counter is far from the code cache
 356     __ incrementl(ExternalAddress((address)&counter));
 357   }
 358 #define inc_counter_np(counter) \
 359   BLOCK_COMMENT("inc_counter " #counter); \
 360   inc_counter_np_(counter);
 361 #endif
 362 
 363   // Call stubs are used to call Java from C
 364   //
 365   // Linux Arguments:
 366   //    c_rarg0:   call wrapper address                   address
 367   //    c_rarg1:   result                                 address
 368   //    c_rarg2:   result type                            BasicType
 369   //    c_rarg3:   method                                 Method*
 370   //    c_rarg4:   (interpreter) entry point              address
 371   //    c_rarg5:   parameters                             intptr_t*
 372   //    16(rbp): parameter size (in words)              int
 373   //    24(rbp): thread                                 Thread*
 374   //
 375   //     [ return_from_Java     ] <--- rsp
 376   //     [ argument word n      ]
 377   //      ...
 378   // -12 [ argument word 1      ]
 379   // -11 [ saved r15            ] <--- rsp_after_call
 380   // -10 [ saved r14            ]
 381   //  -9 [ saved r13            ]
 382   //  -8 [ saved r12            ]
 383   //  -7 [ saved rbx            ]
 384   //  -6 [ call wrapper         ]
 385   //  -5 [ result               ]
 386   //  -4 [ result type          ]
 387   //  -3 [ method               ]
 388   //  -2 [ entry point          ]
 389   //  -1 [ parameters           ]
 390   //   0 [ saved rbp            ] <--- rbp
 391   //   1 [ return address       ]
 392   //   2 [ parameter size       ]
 393   //   3 [ thread               ]
 394   //
 395   // Windows Arguments:
 396   //    c_rarg0:   call wrapper address                   address
 397   //    c_rarg1:   result                                 address
 398   //    c_rarg2:   result type                            BasicType
 399   //    c_rarg3:   method                                 Method*
 400   //    48(rbp): (interpreter) entry point              address
 401   //    56(rbp): parameters                             intptr_t*
 402   //    64(rbp): parameter size (in words)              int
 403   //    72(rbp): thread                                 Thread*
 404   //
 405   //     [ return_from_Java     ] <--- rsp
 406   //     [ argument word n      ]
 407   //      ...
 408   // -60 [ argument word 1      ]
 409   // -59 [ saved xmm31          ] <--- rsp after_call
 410   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 411   // -27 [ saved xmm15          ]
 412   //     [ saved xmm7-xmm14     ]
 413   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 414   //  -7 [ saved r15            ]
 415   //  -6 [ saved r14            ]
 416   //  -5 [ saved r13            ]
 417   //  -4 [ saved r12            ]
 418   //  -3 [ saved rdi            ]
 419   //  -2 [ saved rsi            ]
 420   //  -1 [ saved rbx            ]
 421   //   0 [ saved rbp            ] <--- rbp
 422   //   1 [ return address       ]
 423   //   2 [ call wrapper         ]
 424   //   3 [ result               ]
 425   //   4 [ result type          ]
 426   //   5 [ method               ]
 427   //   6 [ entry point          ]
 428   //   7 [ parameters           ]
 429   //   8 [ parameter size       ]
 430   //   9 [ thread               ]
 431   //
 432   //    Windows reserves the callers stack space for arguments 1-4.
 433   //    We spill c_rarg0-c_rarg3 to this space.
 434 
 435   // Call stub stack layout word offsets from rbp
 436   enum call_stub_layout {
 437 #ifdef _WIN64
 438     xmm_save_first     = 6,  // save from xmm6
 439     xmm_save_last      = 31, // to xmm31
 440     xmm_save_base      = -9,
 441     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -27
 442     r15_off            = -7,
 443     r14_off            = -6,
 444     r13_off            = -5,
 445     r12_off            = -4,
 446     rdi_off            = -3,
 447     rsi_off            = -2,
 448     rbx_off            = -1,
 449     rbp_off            =  0,
 450     retaddr_off        =  1,
 451     call_wrapper_off   =  2,
 452     result_off         =  3,
 453     result_type_off    =  4,
 454     method_off         =  5,
 455     entry_point_off    =  6,
 456     parameters_off     =  7,
 457     parameter_size_off =  8,
 458     thread_off         =  9
 459 #else
 460     rsp_after_call_off = -12,
 461     mxcsr_off          = rsp_after_call_off,
 462     r15_off            = -11,
 463     r14_off            = -10,
 464     r13_off            = -9,
 465     r12_off            = -8,
 466     rbx_off            = -7,
 467     call_wrapper_off   = -6,
 468     result_off         = -5,
 469     result_type_off    = -4,
 470     method_off         = -3,
 471     entry_point_off    = -2,
 472     parameters_off     = -1,
 473     rbp_off            =  0,
 474     retaddr_off        =  1,
 475     parameter_size_off =  2,
 476     thread_off         =  3
 477 #endif
 478   };
 479 
 480 #ifdef _WIN64
 481   Address xmm_save(int reg) {
 482     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 483     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 484   }
 485 #endif
 486 
 487   address generate_call_stub(address& return_address) {
 488     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 489            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 490            "adjust this code");
 491     StubCodeMark mark(this, "StubRoutines", "call_stub");
 492     address start = __ pc();
 493 
 494     // same as in generate_catch_exception()!
 495     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 496 
 497     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 498     const Address result        (rbp, result_off         * wordSize);
 499     const Address result_type   (rbp, result_type_off    * wordSize);
 500     const Address method        (rbp, method_off         * wordSize);
 501     const Address entry_point   (rbp, entry_point_off    * wordSize);
 502     const Address parameters    (rbp, parameters_off     * wordSize);
 503     const Address parameter_size(rbp, parameter_size_off * wordSize);
 504 
 505     // same as in generate_catch_exception()!
 506     const Address thread        (rbp, thread_off         * wordSize);
 507 
 508     const Address r15_save(rbp, r15_off * wordSize);
 509     const Address r14_save(rbp, r14_off * wordSize);
 510     const Address r13_save(rbp, r13_off * wordSize);
 511     const Address r12_save(rbp, r12_off * wordSize);
 512     const Address rbx_save(rbp, rbx_off * wordSize);
 513 
 514     // stub code
 515     __ enter();
 516     __ subptr(rsp, -rsp_after_call_off * wordSize);
 517 
 518     // save register parameters
 519 #ifndef _WIN64
 520     __ movptr(parameters,   c_rarg5); // parameters
 521     __ movptr(entry_point,  c_rarg4); // entry_point
 522 #endif
 523 
 524     __ movptr(method,       c_rarg3); // method
 525     __ movl(result_type,  c_rarg2);   // result type
 526     __ movptr(result,       c_rarg1); // result
 527     __ movptr(call_wrapper, c_rarg0); // call wrapper
 528 
 529     // save regs belonging to calling function
 530     __ movptr(rbx_save, rbx);
 531     __ movptr(r12_save, r12);
 532     __ movptr(r13_save, r13);
 533     __ movptr(r14_save, r14);
 534     __ movptr(r15_save, r15);
 535 
 536 #ifdef _WIN64
 537     int last_reg = 15;
 538     if (UseAVX > 2) {
 539       last_reg = 31;
 540     }
 541     if (VM_Version::supports_evex()) {
 542       for (int i = xmm_save_first; i <= last_reg; i++) {
 543         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 544       }
 545     } else {
 546       for (int i = xmm_save_first; i <= last_reg; i++) {
 547         __ movdqu(xmm_save(i), as_XMMRegister(i));
 548       }
 549     }
 550 
 551     const Address rdi_save(rbp, rdi_off * wordSize);
 552     const Address rsi_save(rbp, rsi_off * wordSize);
 553 
 554     __ movptr(rsi_save, rsi);
 555     __ movptr(rdi_save, rdi);
 556 #else
 557     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 558     {
 559       Label skip_ldmx;
 560       __ stmxcsr(mxcsr_save);
 561       __ movl(rax, mxcsr_save);
 562       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 563       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 564       __ cmp32(rax, mxcsr_std);
 565       __ jcc(Assembler::equal, skip_ldmx);
 566       __ ldmxcsr(mxcsr_std);
 567       __ bind(skip_ldmx);
 568     }
 569 #endif
 570 
 571     // Load up thread register
 572     __ movptr(r15_thread, thread);
 573     __ reinit_heapbase();
 574 
 575 #ifdef ASSERT
 576     // make sure we have no pending exceptions
 577     {
 578       Label L;
 579       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 580       __ jcc(Assembler::equal, L);
 581       __ stop("StubRoutines::call_stub: entered with pending exception");
 582       __ bind(L);
 583     }
 584 #endif
 585 
 586     // pass parameters if any
 587     BLOCK_COMMENT("pass parameters if any");
 588     Label parameters_done;
 589     __ movl(c_rarg3, parameter_size);
 590     __ testl(c_rarg3, c_rarg3);
 591     __ jcc(Assembler::zero, parameters_done);
 592 
 593     Label loop;
 594     __ movptr(c_rarg2, parameters);       // parameter pointer
 595     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 596     __ BIND(loop);
 597     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 598     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 599     __ decrementl(c_rarg1);             // decrement counter
 600     __ push(rax);                       // pass parameter
 601     __ jcc(Assembler::notZero, loop);
 602 
 603     // call Java function
 604     __ BIND(parameters_done);
 605     __ movptr(rbx, method);             // get Method*
 606     __ movptr(c_rarg1, entry_point);    // get entry_point
 607     __ mov(r13, rsp);                   // set sender sp
 608     BLOCK_COMMENT("call Java function");
 609     __ call(c_rarg1);
 610 
 611     BLOCK_COMMENT("call_stub_return_address:");
 612     return_address = __ pc();
 613 
 614     // store result depending on type (everything that is not
 615     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 616     __ movptr(c_rarg0, result);
 617     Label is_long, is_float, is_double, exit;
 618     __ movl(c_rarg1, result_type);
 619     __ cmpl(c_rarg1, T_OBJECT);
 620     __ jcc(Assembler::equal, is_long);
 621     __ cmpl(c_rarg1, T_LONG);
 622     __ jcc(Assembler::equal, is_long);
 623     __ cmpl(c_rarg1, T_FLOAT);
 624     __ jcc(Assembler::equal, is_float);
 625     __ cmpl(c_rarg1, T_DOUBLE);
 626     __ jcc(Assembler::equal, is_double);
 627 
 628     // handle T_INT case
 629     __ movl(Address(c_rarg0, 0), rax);
 630 
 631     __ BIND(exit);
 632 
 633     // pop parameters
 634     __ lea(rsp, rsp_after_call);
 635 
 636 #ifdef ASSERT
 637     // verify that threads correspond
 638     {
 639      Label L1, L2, L3;
 640       __ cmpptr(r15_thread, thread);
 641       __ jcc(Assembler::equal, L1);
 642       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 643       __ bind(L1);
 644       __ get_thread(rbx);
 645       __ cmpptr(r15_thread, thread);
 646       __ jcc(Assembler::equal, L2);
 647       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 648       __ bind(L2);
 649       __ cmpptr(r15_thread, rbx);
 650       __ jcc(Assembler::equal, L3);
 651       __ stop("StubRoutines::call_stub: threads must correspond");
 652       __ bind(L3);
 653     }
 654 #endif
 655 
 656     // restore regs belonging to calling function
 657 #ifdef _WIN64
 658     // emit the restores for xmm regs
 659     if (VM_Version::supports_evex()) {
 660       for (int i = xmm_save_first; i <= last_reg; i++) {
 661         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 662       }
 663     } else {
 664       for (int i = xmm_save_first; i <= last_reg; i++) {
 665         __ movdqu(as_XMMRegister(i), xmm_save(i));
 666       }
 667     }
 668 #endif
 669     __ movptr(r15, r15_save);
 670     __ movptr(r14, r14_save);
 671     __ movptr(r13, r13_save);
 672     __ movptr(r12, r12_save);
 673     __ movptr(rbx, rbx_save);
 674 
 675 #ifdef _WIN64
 676     __ movptr(rdi, rdi_save);
 677     __ movptr(rsi, rsi_save);
 678 #else
 679     __ ldmxcsr(mxcsr_save);
 680 #endif
 681 
 682     // restore rsp
 683     __ addptr(rsp, -rsp_after_call_off * wordSize);
 684 
 685     // return
 686     __ vzeroupper();
 687     __ pop(rbp);
 688     __ ret(0);
 689 
 690     // handle return types different from T_INT
 691     __ BIND(is_long);
 692     __ movq(Address(c_rarg0, 0), rax);
 693     __ jmp(exit);
 694 
 695     __ BIND(is_float);
 696     __ movflt(Address(c_rarg0, 0), xmm0);
 697     __ jmp(exit);
 698 
 699     __ BIND(is_double);
 700     __ movdbl(Address(c_rarg0, 0), xmm0);
 701     __ jmp(exit);
 702 
 703     return start;
 704   }
 705 
 706   // Return point for a Java call if there's an exception thrown in
 707   // Java code.  The exception is caught and transformed into a
 708   // pending exception stored in JavaThread that can be tested from
 709   // within the VM.
 710   //
 711   // Note: Usually the parameters are removed by the callee. In case
 712   // of an exception crossing an activation frame boundary, that is
 713   // not the case if the callee is compiled code => need to setup the
 714   // rsp.
 715   //
 716   // rax: exception oop
 717 
 718   address generate_catch_exception() {
 719     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 720     address start = __ pc();
 721 
 722     // same as in generate_call_stub():
 723     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 724     const Address thread        (rbp, thread_off         * wordSize);
 725 
 726 #ifdef ASSERT
 727     // verify that threads correspond
 728     {
 729       Label L1, L2, L3;
 730       __ cmpptr(r15_thread, thread);
 731       __ jcc(Assembler::equal, L1);
 732       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 733       __ bind(L1);
 734       __ get_thread(rbx);
 735       __ cmpptr(r15_thread, thread);
 736       __ jcc(Assembler::equal, L2);
 737       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 738       __ bind(L2);
 739       __ cmpptr(r15_thread, rbx);
 740       __ jcc(Assembler::equal, L3);
 741       __ stop("StubRoutines::catch_exception: threads must correspond");
 742       __ bind(L3);
 743     }
 744 #endif
 745 
 746     // set pending exception
 747     __ verify_oop(rax);
 748 
 749     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 750     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 751     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 752     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 753 
 754     // complete return to VM
 755     assert(StubRoutines::_call_stub_return_address != NULL,
 756            "_call_stub_return_address must have been generated before");
 757     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 758 
 759     return start;
 760   }
 761 
 762   // Continuation point for runtime calls returning with a pending
 763   // exception.  The pending exception check happened in the runtime
 764   // or native call stub.  The pending exception in Thread is
 765   // converted into a Java-level exception.
 766   //
 767   // Contract with Java-level exception handlers:
 768   // rax: exception
 769   // rdx: throwing pc
 770   //
 771   // NOTE: At entry of this stub, exception-pc must be on stack !!
 772 
 773   address generate_forward_exception() {
 774     StubCodeMark mark(this, "StubRoutines", "forward exception");
 775     address start = __ pc();
 776 
 777     // Upon entry, the sp points to the return address returning into
 778     // Java (interpreted or compiled) code; i.e., the return address
 779     // becomes the throwing pc.
 780     //
 781     // Arguments pushed before the runtime call are still on the stack
 782     // but the exception handler will reset the stack pointer ->
 783     // ignore them.  A potential result in registers can be ignored as
 784     // well.
 785 
 786 #ifdef ASSERT
 787     // make sure this code is only executed if there is a pending exception
 788     {
 789       Label L;
 790       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
 791       __ jcc(Assembler::notEqual, L);
 792       __ stop("StubRoutines::forward exception: no pending exception (1)");
 793       __ bind(L);
 794     }
 795 #endif
 796 
 797     // compute exception handler into rbx
 798     __ movptr(c_rarg0, Address(rsp, 0));
 799     BLOCK_COMMENT("call exception_handler_for_return_address");
 800     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 801                          SharedRuntime::exception_handler_for_return_address),
 802                     r15_thread, c_rarg0);
 803     __ mov(rbx, rax);
 804 
 805     // setup rax & rdx, remove return address & clear pending exception
 806     __ pop(rdx);
 807     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 808     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 809 
 810 #ifdef ASSERT
 811     // make sure exception is set
 812     {
 813       Label L;
 814       __ testptr(rax, rax);
 815       __ jcc(Assembler::notEqual, L);
 816       __ stop("StubRoutines::forward exception: no pending exception (2)");
 817       __ bind(L);
 818     }
 819 #endif
 820 
 821     // continue at exception handler (return address removed)
 822     // rax: exception
 823     // rbx: exception handler
 824     // rdx: throwing pc
 825     __ verify_oop(rax);
 826     __ jmp(rbx);
 827 
 828     return start;
 829   }
 830 
 831   // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
 832   //
 833   // Arguments :
 834   //    c_rarg0: exchange_value
 835   //    c_rarg0: dest
 836   //
 837   // Result:
 838   //    *dest <- ex, return (orig *dest)
 839   address generate_atomic_xchg() {
 840     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 841     address start = __ pc();
 842 
 843     __ movl(rax, c_rarg0); // Copy to eax we need a return value anyhow
 844     __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
 845     __ ret(0);
 846 
 847     return start;
 848   }
 849 
 850   // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
 851   //
 852   // Arguments :
 853   //    c_rarg0: exchange_value
 854   //    c_rarg1: dest
 855   //
 856   // Result:
 857   //    *dest <- ex, return (orig *dest)
 858   address generate_atomic_xchg_long() {
 859     StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
 860     address start = __ pc();
 861 
 862     __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow
 863     __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
 864     __ ret(0);
 865 
 866     return start;
 867   }
 868 
 869   // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
 870   //                                         jint compare_value)
 871   //
 872   // Arguments :
 873   //    c_rarg0: exchange_value
 874   //    c_rarg1: dest
 875   //    c_rarg2: compare_value
 876   //
 877   // Result:
 878   //    if ( compare_value == *dest ) {
 879   //       *dest = exchange_value
 880   //       return compare_value;
 881   //    else
 882   //       return *dest;
 883   address generate_atomic_cmpxchg() {
 884     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 885     address start = __ pc();
 886 
 887     __ movl(rax, c_rarg2);
 888     __ lock();
 889     __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
 890     __ ret(0);
 891 
 892     return start;
 893   }
 894 
 895   // Support for int8_t atomic::atomic_cmpxchg(int8_t exchange_value, volatile int8_t* dest,
 896   //                                           int8_t compare_value)
 897   //
 898   // Arguments :
 899   //    c_rarg0: exchange_value
 900   //    c_rarg1: dest
 901   //    c_rarg2: compare_value
 902   //
 903   // Result:
 904   //    if ( compare_value == *dest ) {
 905   //       *dest = exchange_value
 906   //       return compare_value;
 907   //    else
 908   //       return *dest;
 909   address generate_atomic_cmpxchg_byte() {
 910     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
 911     address start = __ pc();
 912 
 913     __ movsbq(rax, c_rarg2);
 914     __ lock();
 915     __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
 916     __ ret(0);
 917 
 918     return start;
 919   }
 920 
 921   // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value,
 922   //                                            volatile int64_t* dest,
 923   //                                            int64_t compare_value)
 924   // Arguments :
 925   //    c_rarg0: exchange_value
 926   //    c_rarg1: dest
 927   //    c_rarg2: compare_value
 928   //
 929   // Result:
 930   //    if ( compare_value == *dest ) {
 931   //       *dest = exchange_value
 932   //       return compare_value;
 933   //    else
 934   //       return *dest;
 935   address generate_atomic_cmpxchg_long() {
 936     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 937     address start = __ pc();
 938 
 939     __ movq(rax, c_rarg2);
 940     __ lock();
 941     __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
 942     __ ret(0);
 943 
 944     return start;
 945   }
 946 
 947   // Support for jint atomic::add(jint add_value, volatile jint* dest)
 948   //
 949   // Arguments :
 950   //    c_rarg0: add_value
 951   //    c_rarg1: dest
 952   //
 953   // Result:
 954   //    *dest += add_value
 955   //    return *dest;
 956   address generate_atomic_add() {
 957     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 958     address start = __ pc();
 959 
 960     __ movl(rax, c_rarg0);
 961     __ lock();
 962     __ xaddl(Address(c_rarg1, 0), c_rarg0);
 963     __ addl(rax, c_rarg0);
 964     __ ret(0);
 965 
 966     return start;
 967   }
 968 
 969   // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
 970   //
 971   // Arguments :
 972   //    c_rarg0: add_value
 973   //    c_rarg1: dest
 974   //
 975   // Result:
 976   //    *dest += add_value
 977   //    return *dest;
 978   address generate_atomic_add_long() {
 979     StubCodeMark mark(this, "StubRoutines", "atomic_add_long");
 980     address start = __ pc();
 981 
 982     __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow
 983     __ lock();
 984     __ xaddptr(Address(c_rarg1, 0), c_rarg0);
 985     __ addptr(rax, c_rarg0);
 986     __ ret(0);
 987 
 988     return start;
 989   }
 990 
 991   // Support for intptr_t OrderAccess::fence()
 992   //
 993   // Arguments :
 994   //
 995   // Result:
 996   address generate_orderaccess_fence() {
 997     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 998     address start = __ pc();
 999     __ membar(Assembler::StoreLoad);
1000     __ ret(0);
1001 
1002     return start;
1003   }
1004 
1005   // Support for intptr_t get_previous_fp()
1006   //
1007   // This routine is used to find the previous frame pointer for the
1008   // caller (current_frame_guess). This is used as part of debugging
1009   // ps() is seemingly lost trying to find frames.
1010   // This code assumes that caller current_frame_guess) has a frame.
1011   address generate_get_previous_fp() {
1012     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
1013     const Address old_fp(rbp, 0);
1014     const Address older_fp(rax, 0);
1015     address start = __ pc();
1016 
1017     __ enter();
1018     __ movptr(rax, old_fp); // callers fp
1019     __ movptr(rax, older_fp); // the frame for ps()
1020     __ pop(rbp);
1021     __ ret(0);
1022 
1023     return start;
1024   }
1025 
1026   // Support for intptr_t get_previous_sp()
1027   //
1028   // This routine is used to find the previous stack pointer for the
1029   // caller.
1030   address generate_get_previous_sp() {
1031     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
1032     address start = __ pc();
1033 
1034     __ movptr(rax, rsp);
1035     __ addptr(rax, 8); // return address is at the top of the stack.
1036     __ ret(0);
1037 
1038     return start;
1039   }
1040 
1041   //----------------------------------------------------------------------------------------------------
1042   // Support for void verify_mxcsr()
1043   //
1044   // This routine is used with -Xcheck:jni to verify that native
1045   // JNI code does not return to Java code without restoring the
1046   // MXCSR register to our expected state.
1047 
1048   address generate_verify_mxcsr() {
1049     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
1050     address start = __ pc();
1051 
1052     const Address mxcsr_save(rsp, 0);
1053 
1054     if (CheckJNICalls) {
1055       Label ok_ret;
1056       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
1057       __ push(rax);
1058       __ subptr(rsp, wordSize);      // allocate a temp location
1059       __ stmxcsr(mxcsr_save);
1060       __ movl(rax, mxcsr_save);
1061       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
1062       __ cmp32(rax, mxcsr_std);
1063       __ jcc(Assembler::equal, ok_ret);
1064 
1065       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
1066 
1067       __ ldmxcsr(mxcsr_std);
1068 
1069       __ bind(ok_ret);
1070       __ addptr(rsp, wordSize);
1071       __ pop(rax);
1072     }
1073 
1074     __ ret(0);
1075 
1076     return start;
1077   }
1078 
1079   address generate_f2i_fixup() {
1080     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
1081     Address inout(rsp, 5 * wordSize); // return address + 4 saves
1082 
1083     address start = __ pc();
1084 
1085     Label L;
1086 
1087     __ push(rax);
1088     __ push(c_rarg3);
1089     __ push(c_rarg2);
1090     __ push(c_rarg1);
1091 
1092     __ movl(rax, 0x7f800000);
1093     __ xorl(c_rarg3, c_rarg3);
1094     __ movl(c_rarg2, inout);
1095     __ movl(c_rarg1, c_rarg2);
1096     __ andl(c_rarg1, 0x7fffffff);
1097     __ cmpl(rax, c_rarg1); // NaN? -> 0
1098     __ jcc(Assembler::negative, L);
1099     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
1100     __ movl(c_rarg3, 0x80000000);
1101     __ movl(rax, 0x7fffffff);
1102     __ cmovl(Assembler::positive, c_rarg3, rax);
1103 
1104     __ bind(L);
1105     __ movptr(inout, c_rarg3);
1106 
1107     __ pop(c_rarg1);
1108     __ pop(c_rarg2);
1109     __ pop(c_rarg3);
1110     __ pop(rax);
1111 
1112     __ ret(0);
1113 
1114     return start;
1115   }
1116 
1117   address generate_f2l_fixup() {
1118     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
1119     Address inout(rsp, 5 * wordSize); // return address + 4 saves
1120     address start = __ pc();
1121 
1122     Label L;
1123 
1124     __ push(rax);
1125     __ push(c_rarg3);
1126     __ push(c_rarg2);
1127     __ push(c_rarg1);
1128 
1129     __ movl(rax, 0x7f800000);
1130     __ xorl(c_rarg3, c_rarg3);
1131     __ movl(c_rarg2, inout);
1132     __ movl(c_rarg1, c_rarg2);
1133     __ andl(c_rarg1, 0x7fffffff);
1134     __ cmpl(rax, c_rarg1); // NaN? -> 0
1135     __ jcc(Assembler::negative, L);
1136     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
1137     __ mov64(c_rarg3, 0x8000000000000000);
1138     __ mov64(rax, 0x7fffffffffffffff);
1139     __ cmov(Assembler::positive, c_rarg3, rax);
1140 
1141     __ bind(L);
1142     __ movptr(inout, c_rarg3);
1143 
1144     __ pop(c_rarg1);
1145     __ pop(c_rarg2);
1146     __ pop(c_rarg3);
1147     __ pop(rax);
1148 
1149     __ ret(0);
1150 
1151     return start;
1152   }
1153 
1154   address generate_d2i_fixup() {
1155     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
1156     Address inout(rsp, 6 * wordSize); // return address + 5 saves
1157 
1158     address start = __ pc();
1159 
1160     Label L;
1161 
1162     __ push(rax);
1163     __ push(c_rarg3);
1164     __ push(c_rarg2);
1165     __ push(c_rarg1);
1166     __ push(c_rarg0);
1167 
1168     __ movl(rax, 0x7ff00000);
1169     __ movq(c_rarg2, inout);
1170     __ movl(c_rarg3, c_rarg2);
1171     __ mov(c_rarg1, c_rarg2);
1172     __ mov(c_rarg0, c_rarg2);
1173     __ negl(c_rarg3);
1174     __ shrptr(c_rarg1, 0x20);
1175     __ orl(c_rarg3, c_rarg2);
1176     __ andl(c_rarg1, 0x7fffffff);
1177     __ xorl(c_rarg2, c_rarg2);
1178     __ shrl(c_rarg3, 0x1f);
1179     __ orl(c_rarg1, c_rarg3);
1180     __ cmpl(rax, c_rarg1);
1181     __ jcc(Assembler::negative, L); // NaN -> 0
1182     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
1183     __ movl(c_rarg2, 0x80000000);
1184     __ movl(rax, 0x7fffffff);
1185     __ cmov(Assembler::positive, c_rarg2, rax);
1186 
1187     __ bind(L);
1188     __ movptr(inout, c_rarg2);
1189 
1190     __ pop(c_rarg0);
1191     __ pop(c_rarg1);
1192     __ pop(c_rarg2);
1193     __ pop(c_rarg3);
1194     __ pop(rax);
1195 
1196     __ ret(0);
1197 
1198     return start;
1199   }
1200 
1201   address generate_d2l_fixup() {
1202     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
1203     Address inout(rsp, 6 * wordSize); // return address + 5 saves
1204 
1205     address start = __ pc();
1206 
1207     Label L;
1208 
1209     __ push(rax);
1210     __ push(c_rarg3);
1211     __ push(c_rarg2);
1212     __ push(c_rarg1);
1213     __ push(c_rarg0);
1214 
1215     __ movl(rax, 0x7ff00000);
1216     __ movq(c_rarg2, inout);
1217     __ movl(c_rarg3, c_rarg2);
1218     __ mov(c_rarg1, c_rarg2);
1219     __ mov(c_rarg0, c_rarg2);
1220     __ negl(c_rarg3);
1221     __ shrptr(c_rarg1, 0x20);
1222     __ orl(c_rarg3, c_rarg2);
1223     __ andl(c_rarg1, 0x7fffffff);
1224     __ xorl(c_rarg2, c_rarg2);
1225     __ shrl(c_rarg3, 0x1f);
1226     __ orl(c_rarg1, c_rarg3);
1227     __ cmpl(rax, c_rarg1);
1228     __ jcc(Assembler::negative, L); // NaN -> 0
1229     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
1230     __ mov64(c_rarg2, 0x8000000000000000);
1231     __ mov64(rax, 0x7fffffffffffffff);
1232     __ cmovq(Assembler::positive, c_rarg2, rax);
1233 
1234     __ bind(L);
1235     __ movq(inout, c_rarg2);
1236 
1237     __ pop(c_rarg0);
1238     __ pop(c_rarg1);
1239     __ pop(c_rarg2);
1240     __ pop(c_rarg3);
1241     __ pop(rax);
1242 
1243     __ ret(0);
1244 
1245     return start;
1246   }
1247 
1248   address generate_iota_indices(const char *stub_name) {
1249     __ align(CodeEntryAlignment);
1250     StubCodeMark mark(this, "StubRoutines", stub_name);
1251     address start = __ pc();
1252     __ emit_data64(0x0706050403020100, relocInfo::none);
1253     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
1254     __ emit_data64(0x1716151413121110, relocInfo::none);
1255     __ emit_data64(0x1F1E1D1C1B1A1918, relocInfo::none);
1256     __ emit_data64(0x2726252423222120, relocInfo::none);
1257     __ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
1258     __ emit_data64(0x3736353433323130, relocInfo::none);
1259     __ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
1260     return start;
1261   }
1262 
1263   address generate_fp_mask(const char *stub_name, int64_t mask) {
1264     __ align(CodeEntryAlignment);
1265     StubCodeMark mark(this, "StubRoutines", stub_name);
1266     address start = __ pc();
1267 
1268     __ emit_data64( mask, relocInfo::none );
1269     __ emit_data64( mask, relocInfo::none );
1270 
1271     return start;
1272   }
1273 
1274   address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
1275     __ align(CodeEntryAlignment);
1276     StubCodeMark mark(this, "StubRoutines", stub_name);
1277     address start = __ pc();
1278 
1279     __ emit_data64(mask, relocInfo::none);
1280     __ emit_data64(mask, relocInfo::none);
1281     __ emit_data64(mask, relocInfo::none);
1282     __ emit_data64(mask, relocInfo::none);
1283     __ emit_data64(mask, relocInfo::none);
1284     __ emit_data64(mask, relocInfo::none);
1285     __ emit_data64(mask, relocInfo::none);
1286     __ emit_data64(mask, relocInfo::none);
1287 
1288     return start;
1289   }
1290 
1291   address generate_vector_byte_perm_mask(const char *stub_name) {
1292     __ align(CodeEntryAlignment);
1293     StubCodeMark mark(this, "StubRoutines", stub_name);
1294     address start = __ pc();
1295 
1296     __ emit_data64(0x0000000000000001, relocInfo::none);
1297     __ emit_data64(0x0000000000000003, relocInfo::none);
1298     __ emit_data64(0x0000000000000005, relocInfo::none);
1299     __ emit_data64(0x0000000000000007, relocInfo::none);
1300     __ emit_data64(0x0000000000000000, relocInfo::none);
1301     __ emit_data64(0x0000000000000002, relocInfo::none);
1302     __ emit_data64(0x0000000000000004, relocInfo::none);
1303     __ emit_data64(0x0000000000000006, relocInfo::none);
1304 
1305     return start;
1306   }
1307 
1308   address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
1309                                      int32_t val0, int32_t val1, int32_t val2, int32_t val3,
1310                                      int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
1311                                      int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
1312                                      int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
1313     __ align(CodeEntryAlignment);
1314     StubCodeMark mark(this, "StubRoutines", stub_name);
1315     address start = __ pc();
1316 
1317     assert(len != Assembler::AVX_NoVec, "vector len must be specified");
1318     __ emit_data(val0, relocInfo::none, 0);
1319     __ emit_data(val1, relocInfo::none, 0);
1320     __ emit_data(val2, relocInfo::none, 0);
1321     __ emit_data(val3, relocInfo::none, 0);
1322     if (len >= Assembler::AVX_256bit) {
1323       __ emit_data(val4, relocInfo::none, 0);
1324       __ emit_data(val5, relocInfo::none, 0);
1325       __ emit_data(val6, relocInfo::none, 0);
1326       __ emit_data(val7, relocInfo::none, 0);
1327       if (len >= Assembler::AVX_512bit) {
1328         __ emit_data(val8, relocInfo::none, 0);
1329         __ emit_data(val9, relocInfo::none, 0);
1330         __ emit_data(val10, relocInfo::none, 0);
1331         __ emit_data(val11, relocInfo::none, 0);
1332         __ emit_data(val12, relocInfo::none, 0);
1333         __ emit_data(val13, relocInfo::none, 0);
1334         __ emit_data(val14, relocInfo::none, 0);
1335         __ emit_data(val15, relocInfo::none, 0);
1336       }
1337     }
1338 
1339     return start;
1340   }
1341 
1342   // Non-destructive plausibility checks for oops
1343   //
1344   // Arguments:
1345   //    all args on stack!
1346   //
1347   // Stack after saving c_rarg3:
1348   //    [tos + 0]: saved c_rarg3
1349   //    [tos + 1]: saved c_rarg2
1350   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
1351   //    [tos + 3]: saved flags
1352   //    [tos + 4]: return address
1353   //  * [tos + 5]: error message (char*)
1354   //  * [tos + 6]: object to verify (oop)
1355   //  * [tos + 7]: saved rax - saved by caller and bashed
1356   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
1357   //  * = popped on exit
1358   address generate_verify_oop() {
1359     StubCodeMark mark(this, "StubRoutines", "verify_oop");
1360     address start = __ pc();
1361 
1362     Label exit, error;
1363 
1364     __ pushf();
1365     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
1366 
1367     __ push(r12);
1368 
1369     // save c_rarg2 and c_rarg3
1370     __ push(c_rarg2);
1371     __ push(c_rarg3);
1372 
1373     enum {
1374            // After previous pushes.
1375            oop_to_verify = 6 * wordSize,
1376            saved_rax     = 7 * wordSize,
1377            saved_r10     = 8 * wordSize,
1378 
1379            // Before the call to MacroAssembler::debug(), see below.
1380            return_addr   = 16 * wordSize,
1381            error_msg     = 17 * wordSize
1382     };
1383 
1384     // get object
1385     __ movptr(rax, Address(rsp, oop_to_verify));
1386 
1387     // make sure object is 'reasonable'
1388     __ testptr(rax, rax);
1389     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1390 
1391 #if INCLUDE_ZGC
1392     if (UseZGC) {
1393       // Check if metadata bits indicate a bad oop
1394       __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
1395       __ jcc(Assembler::notZero, error);
1396     }
1397 #endif
1398 
1399     // Check if the oop is in the right area of memory
1400     __ movptr(c_rarg2, rax);
1401     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1402     __ andptr(c_rarg2, c_rarg3);
1403     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1404     __ cmpptr(c_rarg2, c_rarg3);
1405     __ jcc(Assembler::notZero, error);
1406 
1407     // set r12 to heapbase for load_klass()
1408     __ reinit_heapbase();
1409 
1410     // make sure klass is 'reasonable', which is not zero.
1411     __ load_klass(rax, rax);  // get klass
1412     __ testptr(rax, rax);
1413     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1414 
1415     // return if everything seems ok
1416     __ bind(exit);
1417     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1418     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1419     __ pop(c_rarg3);                             // restore c_rarg3
1420     __ pop(c_rarg2);                             // restore c_rarg2
1421     __ pop(r12);                                 // restore r12
1422     __ popf();                                   // restore flags
1423     __ ret(4 * wordSize);                        // pop caller saved stuff
1424 
1425     // handle errors
1426     __ bind(error);
1427     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1428     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1429     __ pop(c_rarg3);                             // get saved c_rarg3 back
1430     __ pop(c_rarg2);                             // get saved c_rarg2 back
1431     __ pop(r12);                                 // get saved r12 back
1432     __ popf();                                   // get saved flags off stack --
1433                                                  // will be ignored
1434 
1435     __ pusha();                                  // push registers
1436                                                  // (rip is already
1437                                                  // already pushed)
1438     // debug(char* msg, int64_t pc, int64_t regs[])
1439     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1440     // pushed all the registers, so now the stack looks like:
1441     //     [tos +  0] 16 saved registers
1442     //     [tos + 16] return address
1443     //   * [tos + 17] error message (char*)
1444     //   * [tos + 18] object to verify (oop)
1445     //   * [tos + 19] saved rax - saved by caller and bashed
1446     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1447     //   * = popped on exit
1448 
1449     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1450     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1451     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1452     __ mov(r12, rsp);                               // remember rsp
1453     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1454     __ andptr(rsp, -16);                            // align stack as required by ABI
1455     BLOCK_COMMENT("call MacroAssembler::debug");
1456     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1457     __ mov(rsp, r12);                               // restore rsp
1458     __ popa();                                      // pop registers (includes r12)
1459     __ ret(4 * wordSize);                           // pop caller saved stuff
1460 
1461     return start;
1462   }
1463 
1464   //
1465   // Verify that a register contains clean 32-bits positive value
1466   // (high 32-bits are 0) so it could be used in 64-bits shifts.
1467   //
1468   //  Input:
1469   //    Rint  -  32-bits value
1470   //    Rtmp  -  scratch
1471   //
1472   void assert_clean_int(Register Rint, Register Rtmp) {
1473 #ifdef ASSERT
1474     Label L;
1475     assert_different_registers(Rtmp, Rint);
1476     __ movslq(Rtmp, Rint);
1477     __ cmpq(Rtmp, Rint);
1478     __ jcc(Assembler::equal, L);
1479     __ stop("high 32-bits of int value are not 0");
1480     __ bind(L);
1481 #endif
1482   }
1483 
1484   //  Generate overlap test for array copy stubs
1485   //
1486   //  Input:
1487   //     c_rarg0 - from
1488   //     c_rarg1 - to
1489   //     c_rarg2 - element count
1490   //
1491   //  Output:
1492   //     rax   - &from[element count - 1]
1493   //
1494   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1495     assert(no_overlap_target != NULL, "must be generated");
1496     array_overlap_test(no_overlap_target, NULL, sf);
1497   }
1498   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1499     array_overlap_test(NULL, &L_no_overlap, sf);
1500   }
1501   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1502     const Register from     = c_rarg0;
1503     const Register to       = c_rarg1;
1504     const Register count    = c_rarg2;
1505     const Register end_from = rax;
1506 
1507     __ cmpptr(to, from);
1508     __ lea(end_from, Address(from, count, sf, 0));
1509     if (NOLp == NULL) {
1510       ExternalAddress no_overlap(no_overlap_target);
1511       __ jump_cc(Assembler::belowEqual, no_overlap);
1512       __ cmpptr(to, end_from);
1513       __ jump_cc(Assembler::aboveEqual, no_overlap);
1514     } else {
1515       __ jcc(Assembler::belowEqual, (*NOLp));
1516       __ cmpptr(to, end_from);
1517       __ jcc(Assembler::aboveEqual, (*NOLp));
1518     }
1519   }
1520 
1521   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1522   //
1523   // Outputs:
1524   //    rdi - rcx
1525   //    rsi - rdx
1526   //    rdx - r8
1527   //    rcx - r9
1528   //
1529   // Registers r9 and r10 are used to save rdi and rsi on Windows, which latter
1530   // are non-volatile.  r9 and r10 should not be used by the caller.
1531   //
1532   DEBUG_ONLY(bool regs_in_thread;)
1533 
1534   void setup_arg_regs(int nargs = 3) {
1535     const Register saved_rdi = r9;
1536     const Register saved_rsi = r10;
1537     assert(nargs == 3 || nargs == 4, "else fix");
1538 #ifdef _WIN64
1539     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1540            "unexpected argument registers");
1541     if (nargs >= 4)
1542       __ mov(rax, r9);  // r9 is also saved_rdi
1543     __ movptr(saved_rdi, rdi);
1544     __ movptr(saved_rsi, rsi);
1545     __ mov(rdi, rcx); // c_rarg0
1546     __ mov(rsi, rdx); // c_rarg1
1547     __ mov(rdx, r8);  // c_rarg2
1548     if (nargs >= 4)
1549       __ mov(rcx, rax); // c_rarg3 (via rax)
1550 #else
1551     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1552            "unexpected argument registers");
1553 #endif
1554     DEBUG_ONLY(regs_in_thread = false;)
1555   }
1556 
1557   void restore_arg_regs() {
1558     assert(!regs_in_thread, "wrong call to restore_arg_regs");
1559     const Register saved_rdi = r9;
1560     const Register saved_rsi = r10;
1561 #ifdef _WIN64
1562     __ movptr(rdi, saved_rdi);
1563     __ movptr(rsi, saved_rsi);
1564 #endif
1565   }
1566 
1567   // This is used in places where r10 is a scratch register, and can
1568   // be adapted if r9 is needed also.
1569   void setup_arg_regs_using_thread() {
1570     const Register saved_r15 = r9;
1571 #ifdef _WIN64
1572     __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
1573     __ get_thread(r15_thread);
1574     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1575            "unexpected argument registers");
1576     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
1577     __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
1578 
1579     __ mov(rdi, rcx); // c_rarg0
1580     __ mov(rsi, rdx); // c_rarg1
1581     __ mov(rdx, r8);  // c_rarg2
1582 #else
1583     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1584            "unexpected argument registers");
1585 #endif
1586     DEBUG_ONLY(regs_in_thread = true;)
1587   }
1588 
1589   void restore_arg_regs_using_thread() {
1590     assert(regs_in_thread, "wrong call to restore_arg_regs");
1591     const Register saved_r15 = r9;
1592 #ifdef _WIN64
1593     __ get_thread(r15_thread);
1594     __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
1595     __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
1596     __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
1597 #endif
1598   }
1599 
1600   // Copy big chunks forward
1601   //
1602   // Inputs:
1603   //   end_from     - source arrays end address
1604   //   end_to       - destination array end address
1605   //   qword_count  - 64-bits element count, negative
1606   //   to           - scratch
1607   //   L_copy_bytes - entry label
1608   //   L_copy_8_bytes  - exit  label
1609   //
1610   void copy_bytes_forward(Register end_from, Register end_to,
1611                              Register qword_count, Register to,
1612                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1613     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1614     Label L_loop;
1615     __ align(OptoLoopAlignment);
1616     if (UseUnalignedLoadStores) {
1617       Label L_end;
1618       // Copy 64-bytes per iteration
1619       __ BIND(L_loop);
1620       if (UseAVX > 2) {
1621         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
1622         __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
1623       } else if (UseAVX == 2) {
1624         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1625         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1626         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1627         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1628       } else {
1629         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1630         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1631         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1632         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1633         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1634         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1635         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1636         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1637       }
1638       __ BIND(L_copy_bytes);
1639       __ addptr(qword_count, 8);
1640       __ jcc(Assembler::lessEqual, L_loop);
1641       __ subptr(qword_count, 4);  // sub(8) and add(4)
1642       __ jccb(Assembler::greater, L_end);
1643       // Copy trailing 32 bytes
1644       if (UseAVX >= 2) {
1645         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1646         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1647       } else {
1648         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1649         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1650         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1651         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1652       }
1653       __ addptr(qword_count, 4);
1654       __ BIND(L_end);
1655       if (UseAVX >= 2) {
1656         // clean upper bits of YMM registers
1657         __ vpxor(xmm0, xmm0);
1658         __ vpxor(xmm1, xmm1);
1659       }
1660     } else {
1661       // Copy 32-bytes per iteration
1662       __ BIND(L_loop);
1663       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1664       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1665       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1666       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1667       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1668       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1669       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1670       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1671 
1672       __ BIND(L_copy_bytes);
1673       __ addptr(qword_count, 4);
1674       __ jcc(Assembler::lessEqual, L_loop);
1675     }
1676     __ subptr(qword_count, 4);
1677     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1678   }
1679 
1680   // Copy big chunks backward
1681   //
1682   // Inputs:
1683   //   from         - source arrays address
1684   //   dest         - destination array address
1685   //   qword_count  - 64-bits element count
1686   //   to           - scratch
1687   //   L_copy_bytes - entry label
1688   //   L_copy_8_bytes  - exit  label
1689   //
1690   void copy_bytes_backward(Register from, Register dest,
1691                               Register qword_count, Register to,
1692                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1693     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1694     Label L_loop;
1695     __ align(OptoLoopAlignment);
1696     if (UseUnalignedLoadStores) {
1697       Label L_end;
1698       // Copy 64-bytes per iteration
1699       __ BIND(L_loop);
1700       if (UseAVX > 2) {
1701         __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
1702         __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
1703       } else if (UseAVX == 2) {
1704         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1705         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1706         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1707         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1708       } else {
1709         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1710         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1711         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1712         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1713         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1714         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1715         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1716         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1717       }
1718       __ BIND(L_copy_bytes);
1719       __ subptr(qword_count, 8);
1720       __ jcc(Assembler::greaterEqual, L_loop);
1721 
1722       __ addptr(qword_count, 4);  // add(8) and sub(4)
1723       __ jccb(Assembler::less, L_end);
1724       // Copy trailing 32 bytes
1725       if (UseAVX >= 2) {
1726         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1727         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1728       } else {
1729         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1730         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1731         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1732         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1733       }
1734       __ subptr(qword_count, 4);
1735       __ BIND(L_end);
1736       if (UseAVX >= 2) {
1737         // clean upper bits of YMM registers
1738         __ vpxor(xmm0, xmm0);
1739         __ vpxor(xmm1, xmm1);
1740       }
1741     } else {
1742       // Copy 32-bytes per iteration
1743       __ BIND(L_loop);
1744       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1745       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1746       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1747       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1748       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1749       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1750       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1751       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1752 
1753       __ BIND(L_copy_bytes);
1754       __ subptr(qword_count, 4);
1755       __ jcc(Assembler::greaterEqual, L_loop);
1756     }
1757     __ addptr(qword_count, 4);
1758     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1759   }
1760 
1761 
1762   // Arguments:
1763   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1764   //             ignored
1765   //   name    - stub name string
1766   //
1767   // Inputs:
1768   //   c_rarg0   - source array address
1769   //   c_rarg1   - destination array address
1770   //   c_rarg2   - element count, treated as ssize_t, can be zero
1771   //
1772   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1773   // we let the hardware handle it.  The one to eight bytes within words,
1774   // dwords or qwords that span cache line boundaries will still be loaded
1775   // and stored atomically.
1776   //
1777   // Side Effects:
1778   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1779   //   used by generate_conjoint_byte_copy().
1780   //
1781   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1782     __ align(CodeEntryAlignment);
1783     StubCodeMark mark(this, "StubRoutines", name);
1784     address start = __ pc();
1785 
1786     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1787     Label L_copy_byte, L_exit;
1788     const Register from        = rdi;  // source array address
1789     const Register to          = rsi;  // destination array address
1790     const Register count       = rdx;  // elements count
1791     const Register byte_count  = rcx;
1792     const Register qword_count = count;
1793     const Register end_from    = from; // source array end address
1794     const Register end_to      = to;   // destination array end address
1795     // End pointers are inclusive, and if count is not zero they point
1796     // to the last unit copied:  end_to[0] := end_from[0]
1797 
1798     __ enter(); // required for proper stackwalking of RuntimeStub frame
1799     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1800 
1801     if (entry != NULL) {
1802       *entry = __ pc();
1803        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1804       BLOCK_COMMENT("Entry:");
1805     }
1806 
1807     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1808                       // r9 and r10 may be used to save non-volatile registers
1809 
1810     // 'from', 'to' and 'count' are now valid
1811     __ movptr(byte_count, count);
1812     __ shrptr(count, 3); // count => qword_count
1813 
1814     // Copy from low to high addresses.  Use 'to' as scratch.
1815     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1816     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1817     __ negptr(qword_count); // make the count negative
1818     __ jmp(L_copy_bytes);
1819 
1820     // Copy trailing qwords
1821   __ BIND(L_copy_8_bytes);
1822     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1823     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1824     __ increment(qword_count);
1825     __ jcc(Assembler::notZero, L_copy_8_bytes);
1826 
1827     // Check for and copy trailing dword
1828   __ BIND(L_copy_4_bytes);
1829     __ testl(byte_count, 4);
1830     __ jccb(Assembler::zero, L_copy_2_bytes);
1831     __ movl(rax, Address(end_from, 8));
1832     __ movl(Address(end_to, 8), rax);
1833 
1834     __ addptr(end_from, 4);
1835     __ addptr(end_to, 4);
1836 
1837     // Check for and copy trailing word
1838   __ BIND(L_copy_2_bytes);
1839     __ testl(byte_count, 2);
1840     __ jccb(Assembler::zero, L_copy_byte);
1841     __ movw(rax, Address(end_from, 8));
1842     __ movw(Address(end_to, 8), rax);
1843 
1844     __ addptr(end_from, 2);
1845     __ addptr(end_to, 2);
1846 
1847     // Check for and copy trailing byte
1848   __ BIND(L_copy_byte);
1849     __ testl(byte_count, 1);
1850     __ jccb(Assembler::zero, L_exit);
1851     __ movb(rax, Address(end_from, 8));
1852     __ movb(Address(end_to, 8), rax);
1853 
1854   __ BIND(L_exit);
1855     restore_arg_regs();
1856     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1857     __ xorptr(rax, rax); // return 0
1858     __ vzeroupper();
1859     __ leave(); // required for proper stackwalking of RuntimeStub frame
1860     __ ret(0);
1861 
1862     // Copy in multi-bytes chunks
1863     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1864     __ jmp(L_copy_4_bytes);
1865 
1866     return start;
1867   }
1868 
1869   // Arguments:
1870   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1871   //             ignored
1872   //   name    - stub name string
1873   //
1874   // Inputs:
1875   //   c_rarg0   - source array address
1876   //   c_rarg1   - destination array address
1877   //   c_rarg2   - element count, treated as ssize_t, can be zero
1878   //
1879   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1880   // we let the hardware handle it.  The one to eight bytes within words,
1881   // dwords or qwords that span cache line boundaries will still be loaded
1882   // and stored atomically.
1883   //
1884   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1885                                       address* entry, const char *name) {
1886     __ align(CodeEntryAlignment);
1887     StubCodeMark mark(this, "StubRoutines", name);
1888     address start = __ pc();
1889 
1890     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1891     const Register from        = rdi;  // source array address
1892     const Register to          = rsi;  // destination array address
1893     const Register count       = rdx;  // elements count
1894     const Register byte_count  = rcx;
1895     const Register qword_count = count;
1896 
1897     __ enter(); // required for proper stackwalking of RuntimeStub frame
1898     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1899 
1900     if (entry != NULL) {
1901       *entry = __ pc();
1902       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1903       BLOCK_COMMENT("Entry:");
1904     }
1905 
1906     array_overlap_test(nooverlap_target, Address::times_1);
1907     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1908                       // r9 and r10 may be used to save non-volatile registers
1909 
1910     // 'from', 'to' and 'count' are now valid
1911     __ movptr(byte_count, count);
1912     __ shrptr(count, 3);   // count => qword_count
1913 
1914     // Copy from high to low addresses.
1915 
1916     // Check for and copy trailing byte
1917     __ testl(byte_count, 1);
1918     __ jcc(Assembler::zero, L_copy_2_bytes);
1919     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1920     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1921     __ decrement(byte_count); // Adjust for possible trailing word
1922 
1923     // Check for and copy trailing word
1924   __ BIND(L_copy_2_bytes);
1925     __ testl(byte_count, 2);
1926     __ jcc(Assembler::zero, L_copy_4_bytes);
1927     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1928     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1929 
1930     // Check for and copy trailing dword
1931   __ BIND(L_copy_4_bytes);
1932     __ testl(byte_count, 4);
1933     __ jcc(Assembler::zero, L_copy_bytes);
1934     __ movl(rax, Address(from, qword_count, Address::times_8));
1935     __ movl(Address(to, qword_count, Address::times_8), rax);
1936     __ jmp(L_copy_bytes);
1937 
1938     // Copy trailing qwords
1939   __ BIND(L_copy_8_bytes);
1940     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1941     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1942     __ decrement(qword_count);
1943     __ jcc(Assembler::notZero, L_copy_8_bytes);
1944 
1945     restore_arg_regs();
1946     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1947     __ xorptr(rax, rax); // return 0
1948     __ vzeroupper();
1949     __ leave(); // required for proper stackwalking of RuntimeStub frame
1950     __ ret(0);
1951 
1952     // Copy in multi-bytes chunks
1953     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1954 
1955     restore_arg_regs();
1956     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1957     __ xorptr(rax, rax); // return 0
1958     __ vzeroupper();
1959     __ leave(); // required for proper stackwalking of RuntimeStub frame
1960     __ ret(0);
1961 
1962     return start;
1963   }
1964 
1965   // Arguments:
1966   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1967   //             ignored
1968   //   name    - stub name string
1969   //
1970   // Inputs:
1971   //   c_rarg0   - source array address
1972   //   c_rarg1   - destination array address
1973   //   c_rarg2   - element count, treated as ssize_t, can be zero
1974   //
1975   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1976   // let the hardware handle it.  The two or four words within dwords
1977   // or qwords that span cache line boundaries will still be loaded
1978   // and stored atomically.
1979   //
1980   // Side Effects:
1981   //   disjoint_short_copy_entry is set to the no-overlap entry point
1982   //   used by generate_conjoint_short_copy().
1983   //
1984   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1985     __ align(CodeEntryAlignment);
1986     StubCodeMark mark(this, "StubRoutines", name);
1987     address start = __ pc();
1988 
1989     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
1990     const Register from        = rdi;  // source array address
1991     const Register to          = rsi;  // destination array address
1992     const Register count       = rdx;  // elements count
1993     const Register word_count  = rcx;
1994     const Register qword_count = count;
1995     const Register end_from    = from; // source array end address
1996     const Register end_to      = to;   // destination array end address
1997     // End pointers are inclusive, and if count is not zero they point
1998     // to the last unit copied:  end_to[0] := end_from[0]
1999 
2000     __ enter(); // required for proper stackwalking of RuntimeStub frame
2001     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2002 
2003     if (entry != NULL) {
2004       *entry = __ pc();
2005       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2006       BLOCK_COMMENT("Entry:");
2007     }
2008 
2009     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2010                       // r9 and r10 may be used to save non-volatile registers
2011 
2012     // 'from', 'to' and 'count' are now valid
2013     __ movptr(word_count, count);
2014     __ shrptr(count, 2); // count => qword_count
2015 
2016     // Copy from low to high addresses.  Use 'to' as scratch.
2017     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2018     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2019     __ negptr(qword_count);
2020     __ jmp(L_copy_bytes);
2021 
2022     // Copy trailing qwords
2023   __ BIND(L_copy_8_bytes);
2024     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2025     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2026     __ increment(qword_count);
2027     __ jcc(Assembler::notZero, L_copy_8_bytes);
2028 
2029     // Original 'dest' is trashed, so we can't use it as a
2030     // base register for a possible trailing word copy
2031 
2032     // Check for and copy trailing dword
2033   __ BIND(L_copy_4_bytes);
2034     __ testl(word_count, 2);
2035     __ jccb(Assembler::zero, L_copy_2_bytes);
2036     __ movl(rax, Address(end_from, 8));
2037     __ movl(Address(end_to, 8), rax);
2038 
2039     __ addptr(end_from, 4);
2040     __ addptr(end_to, 4);
2041 
2042     // Check for and copy trailing word
2043   __ BIND(L_copy_2_bytes);
2044     __ testl(word_count, 1);
2045     __ jccb(Assembler::zero, L_exit);
2046     __ movw(rax, Address(end_from, 8));
2047     __ movw(Address(end_to, 8), rax);
2048 
2049   __ BIND(L_exit);
2050     restore_arg_regs();
2051     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2052     __ xorptr(rax, rax); // return 0
2053     __ vzeroupper();
2054     __ leave(); // required for proper stackwalking of RuntimeStub frame
2055     __ ret(0);
2056 
2057     // Copy in multi-bytes chunks
2058     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2059     __ jmp(L_copy_4_bytes);
2060 
2061     return start;
2062   }
2063 
2064   address generate_fill(BasicType t, bool aligned, const char *name) {
2065     __ align(CodeEntryAlignment);
2066     StubCodeMark mark(this, "StubRoutines", name);
2067     address start = __ pc();
2068 
2069     BLOCK_COMMENT("Entry:");
2070 
2071     const Register to       = c_rarg0;  // source array address
2072     const Register value    = c_rarg1;  // value
2073     const Register count    = c_rarg2;  // elements count
2074 
2075     __ enter(); // required for proper stackwalking of RuntimeStub frame
2076 
2077     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
2078 
2079     __ vzeroupper();
2080     __ leave(); // required for proper stackwalking of RuntimeStub frame
2081     __ ret(0);
2082     return start;
2083   }
2084 
2085   // Arguments:
2086   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2087   //             ignored
2088   //   name    - stub name string
2089   //
2090   // Inputs:
2091   //   c_rarg0   - source array address
2092   //   c_rarg1   - destination array address
2093   //   c_rarg2   - element count, treated as ssize_t, can be zero
2094   //
2095   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
2096   // let the hardware handle it.  The two or four words within dwords
2097   // or qwords that span cache line boundaries will still be loaded
2098   // and stored atomically.
2099   //
2100   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
2101                                        address *entry, const char *name) {
2102     __ align(CodeEntryAlignment);
2103     StubCodeMark mark(this, "StubRoutines", name);
2104     address start = __ pc();
2105 
2106     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
2107     const Register from        = rdi;  // source array address
2108     const Register to          = rsi;  // destination array address
2109     const Register count       = rdx;  // elements count
2110     const Register word_count  = rcx;
2111     const Register qword_count = count;
2112 
2113     __ enter(); // required for proper stackwalking of RuntimeStub frame
2114     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2115 
2116     if (entry != NULL) {
2117       *entry = __ pc();
2118       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2119       BLOCK_COMMENT("Entry:");
2120     }
2121 
2122     array_overlap_test(nooverlap_target, Address::times_2);
2123     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2124                       // r9 and r10 may be used to save non-volatile registers
2125 
2126     // 'from', 'to' and 'count' are now valid
2127     __ movptr(word_count, count);
2128     __ shrptr(count, 2); // count => qword_count
2129 
2130     // Copy from high to low addresses.  Use 'to' as scratch.
2131 
2132     // Check for and copy trailing word
2133     __ testl(word_count, 1);
2134     __ jccb(Assembler::zero, L_copy_4_bytes);
2135     __ movw(rax, Address(from, word_count, Address::times_2, -2));
2136     __ movw(Address(to, word_count, Address::times_2, -2), rax);
2137 
2138     // Check for and copy trailing dword
2139   __ BIND(L_copy_4_bytes);
2140     __ testl(word_count, 2);
2141     __ jcc(Assembler::zero, L_copy_bytes);
2142     __ movl(rax, Address(from, qword_count, Address::times_8));
2143     __ movl(Address(to, qword_count, Address::times_8), rax);
2144     __ jmp(L_copy_bytes);
2145 
2146     // Copy trailing qwords
2147   __ BIND(L_copy_8_bytes);
2148     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2149     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2150     __ decrement(qword_count);
2151     __ jcc(Assembler::notZero, L_copy_8_bytes);
2152 
2153     restore_arg_regs();
2154     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2155     __ xorptr(rax, rax); // return 0
2156     __ vzeroupper();
2157     __ leave(); // required for proper stackwalking of RuntimeStub frame
2158     __ ret(0);
2159 
2160     // Copy in multi-bytes chunks
2161     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2162 
2163     restore_arg_regs();
2164     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
2165     __ xorptr(rax, rax); // return 0
2166     __ vzeroupper();
2167     __ leave(); // required for proper stackwalking of RuntimeStub frame
2168     __ ret(0);
2169 
2170     return start;
2171   }
2172 
2173   // Arguments:
2174   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2175   //             ignored
2176   //   is_oop  - true => oop array, so generate store check code
2177   //   name    - stub name string
2178   //
2179   // Inputs:
2180   //   c_rarg0   - source array address
2181   //   c_rarg1   - destination array address
2182   //   c_rarg2   - element count, treated as ssize_t, can be zero
2183   //
2184   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2185   // the hardware handle it.  The two dwords within qwords that span
2186   // cache line boundaries will still be loaded and stored atomicly.
2187   //
2188   // Side Effects:
2189   //   disjoint_int_copy_entry is set to the no-overlap entry point
2190   //   used by generate_conjoint_int_oop_copy().
2191   //
2192   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
2193                                          const char *name, bool dest_uninitialized = false) {
2194     __ align(CodeEntryAlignment);
2195     StubCodeMark mark(this, "StubRoutines", name);
2196     address start = __ pc();
2197 
2198     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
2199     const Register from        = rdi;  // source array address
2200     const Register to          = rsi;  // destination array address
2201     const Register count       = rdx;  // elements count
2202     const Register dword_count = rcx;
2203     const Register qword_count = count;
2204     const Register end_from    = from; // source array end address
2205     const Register end_to      = to;   // destination array end address
2206     // End pointers are inclusive, and if count is not zero they point
2207     // to the last unit copied:  end_to[0] := end_from[0]
2208 
2209     __ enter(); // required for proper stackwalking of RuntimeStub frame
2210     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2211 
2212     if (entry != NULL) {
2213       *entry = __ pc();
2214       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2215       BLOCK_COMMENT("Entry:");
2216     }
2217 
2218     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2219                                    // r9 is used to save r15_thread
2220 
2221     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2222     if (dest_uninitialized) {
2223       decorators |= IS_DEST_UNINITIALIZED;
2224     }
2225     if (aligned) {
2226       decorators |= ARRAYCOPY_ALIGNED;
2227     }
2228 
2229     BasicType type = is_oop ? T_OBJECT : T_INT;
2230     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2231     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2232 
2233     // 'from', 'to' and 'count' are now valid
2234     __ movptr(dword_count, count);
2235     __ shrptr(count, 1); // count => qword_count
2236 
2237     // Copy from low to high addresses.  Use 'to' as scratch.
2238     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2239     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2240     __ negptr(qword_count);
2241     __ jmp(L_copy_bytes);
2242 
2243     // Copy trailing qwords
2244   __ BIND(L_copy_8_bytes);
2245     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2246     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2247     __ increment(qword_count);
2248     __ jcc(Assembler::notZero, L_copy_8_bytes);
2249 
2250     // Check for and copy trailing dword
2251   __ BIND(L_copy_4_bytes);
2252     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
2253     __ jccb(Assembler::zero, L_exit);
2254     __ movl(rax, Address(end_from, 8));
2255     __ movl(Address(end_to, 8), rax);
2256 
2257   __ BIND(L_exit);
2258     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2259     restore_arg_regs_using_thread();
2260     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2261     __ vzeroupper();
2262     __ xorptr(rax, rax); // return 0
2263     __ leave(); // required for proper stackwalking of RuntimeStub frame
2264     __ ret(0);
2265 
2266     // Copy in multi-bytes chunks
2267     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2268     __ jmp(L_copy_4_bytes);
2269 
2270     return start;
2271   }
2272 
2273   // Arguments:
2274   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
2275   //             ignored
2276   //   is_oop  - true => oop array, so generate store check code
2277   //   name    - stub name string
2278   //
2279   // Inputs:
2280   //   c_rarg0   - source array address
2281   //   c_rarg1   - destination array address
2282   //   c_rarg2   - element count, treated as ssize_t, can be zero
2283   //
2284   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
2285   // the hardware handle it.  The two dwords within qwords that span
2286   // cache line boundaries will still be loaded and stored atomicly.
2287   //
2288   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
2289                                          address *entry, const char *name,
2290                                          bool dest_uninitialized = false) {
2291     __ align(CodeEntryAlignment);
2292     StubCodeMark mark(this, "StubRoutines", name);
2293     address start = __ pc();
2294 
2295     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2296     const Register from        = rdi;  // source array address
2297     const Register to          = rsi;  // destination array address
2298     const Register count       = rdx;  // elements count
2299     const Register dword_count = rcx;
2300     const Register qword_count = count;
2301 
2302     __ enter(); // required for proper stackwalking of RuntimeStub frame
2303     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2304 
2305     if (entry != NULL) {
2306       *entry = __ pc();
2307        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2308       BLOCK_COMMENT("Entry:");
2309     }
2310 
2311     array_overlap_test(nooverlap_target, Address::times_4);
2312     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2313                                    // r9 is used to save r15_thread
2314 
2315     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2316     if (dest_uninitialized) {
2317       decorators |= IS_DEST_UNINITIALIZED;
2318     }
2319     if (aligned) {
2320       decorators |= ARRAYCOPY_ALIGNED;
2321     }
2322 
2323     BasicType type = is_oop ? T_OBJECT : T_INT;
2324     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2325     // no registers are destroyed by this call
2326     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2327 
2328     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2329     // 'from', 'to' and 'count' are now valid
2330     __ movptr(dword_count, count);
2331     __ shrptr(count, 1); // count => qword_count
2332 
2333     // Copy from high to low addresses.  Use 'to' as scratch.
2334 
2335     // Check for and copy trailing dword
2336     __ testl(dword_count, 1);
2337     __ jcc(Assembler::zero, L_copy_bytes);
2338     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2339     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2340     __ jmp(L_copy_bytes);
2341 
2342     // Copy trailing qwords
2343   __ BIND(L_copy_8_bytes);
2344     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2345     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2346     __ decrement(qword_count);
2347     __ jcc(Assembler::notZero, L_copy_8_bytes);
2348 
2349     if (is_oop) {
2350       __ jmp(L_exit);
2351     }
2352     restore_arg_regs_using_thread();
2353     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2354     __ xorptr(rax, rax); // return 0
2355     __ vzeroupper();
2356     __ leave(); // required for proper stackwalking of RuntimeStub frame
2357     __ ret(0);
2358 
2359     // Copy in multi-bytes chunks
2360     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2361 
2362   __ BIND(L_exit);
2363     bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count);
2364     restore_arg_regs_using_thread();
2365     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2366     __ xorptr(rax, rax); // return 0
2367     __ vzeroupper();
2368     __ leave(); // required for proper stackwalking of RuntimeStub frame
2369     __ ret(0);
2370 
2371     return start;
2372   }
2373 
2374   // Arguments:
2375   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2376   //             ignored
2377   //   is_oop  - true => oop array, so generate store check code
2378   //   name    - stub name string
2379   //
2380   // Inputs:
2381   //   c_rarg0   - source array address
2382   //   c_rarg1   - destination array address
2383   //   c_rarg2   - element count, treated as ssize_t, can be zero
2384   //
2385  // Side Effects:
2386   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2387   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2388   //
2389   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2390                                           const char *name, bool dest_uninitialized = false) {
2391     __ align(CodeEntryAlignment);
2392     StubCodeMark mark(this, "StubRoutines", name);
2393     address start = __ pc();
2394 
2395     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2396     const Register from        = rdi;  // source array address
2397     const Register to          = rsi;  // destination array address
2398     const Register qword_count = rdx;  // elements count
2399     const Register end_from    = from; // source array end address
2400     const Register end_to      = rcx;  // destination array end address
2401     const Register saved_count = r11;
2402     // End pointers are inclusive, and if count is not zero they point
2403     // to the last unit copied:  end_to[0] := end_from[0]
2404 
2405     __ enter(); // required for proper stackwalking of RuntimeStub frame
2406     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2407     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2408 
2409     if (entry != NULL) {
2410       *entry = __ pc();
2411       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2412       BLOCK_COMMENT("Entry:");
2413     }
2414 
2415     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2416                                      // r9 is used to save r15_thread
2417     // 'from', 'to' and 'qword_count' are now valid
2418 
2419     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2420     if (dest_uninitialized) {
2421       decorators |= IS_DEST_UNINITIALIZED;
2422     }
2423     if (aligned) {
2424       decorators |= ARRAYCOPY_ALIGNED;
2425     }
2426 
2427     BasicType type = is_oop ? T_OBJECT : T_LONG;
2428     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2429     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2430 
2431     // Copy from low to high addresses.  Use 'to' as scratch.
2432     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2433     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2434     __ negptr(qword_count);
2435     __ jmp(L_copy_bytes);
2436 
2437     // Copy trailing qwords
2438   __ BIND(L_copy_8_bytes);
2439     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2440     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2441     __ increment(qword_count);
2442     __ jcc(Assembler::notZero, L_copy_8_bytes);
2443 
2444     if (is_oop) {
2445       __ jmp(L_exit);
2446     } else {
2447       restore_arg_regs_using_thread();
2448       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2449       __ xorptr(rax, rax); // return 0
2450       __ vzeroupper();
2451       __ leave(); // required for proper stackwalking of RuntimeStub frame
2452       __ ret(0);
2453     }
2454 
2455     // Copy in multi-bytes chunks
2456     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2457 
2458     __ BIND(L_exit);
2459     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2460     restore_arg_regs_using_thread();
2461     if (is_oop) {
2462       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2463     } else {
2464       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2465     }
2466     __ vzeroupper();
2467     __ xorptr(rax, rax); // return 0
2468     __ leave(); // required for proper stackwalking of RuntimeStub frame
2469     __ ret(0);
2470 
2471     return start;
2472   }
2473 
2474   // Arguments:
2475   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2476   //             ignored
2477   //   is_oop  - true => oop array, so generate store check code
2478   //   name    - stub name string
2479   //
2480   // Inputs:
2481   //   c_rarg0   - source array address
2482   //   c_rarg1   - destination array address
2483   //   c_rarg2   - element count, treated as ssize_t, can be zero
2484   //
2485   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2486                                           address nooverlap_target, address *entry,
2487                                           const char *name, bool dest_uninitialized = false) {
2488     __ align(CodeEntryAlignment);
2489     StubCodeMark mark(this, "StubRoutines", name);
2490     address start = __ pc();
2491 
2492     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2493     const Register from        = rdi;  // source array address
2494     const Register to          = rsi;  // destination array address
2495     const Register qword_count = rdx;  // elements count
2496     const Register saved_count = rcx;
2497 
2498     __ enter(); // required for proper stackwalking of RuntimeStub frame
2499     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2500 
2501     if (entry != NULL) {
2502       *entry = __ pc();
2503       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2504       BLOCK_COMMENT("Entry:");
2505     }
2506 
2507     array_overlap_test(nooverlap_target, Address::times_8);
2508     setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx
2509                                    // r9 is used to save r15_thread
2510     // 'from', 'to' and 'qword_count' are now valid
2511 
2512     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2513     if (dest_uninitialized) {
2514       decorators |= IS_DEST_UNINITIALIZED;
2515     }
2516     if (aligned) {
2517       decorators |= ARRAYCOPY_ALIGNED;
2518     }
2519 
2520     BasicType type = is_oop ? T_OBJECT : T_LONG;
2521     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2522     bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count);
2523 
2524     __ jmp(L_copy_bytes);
2525 
2526     // Copy trailing qwords
2527   __ BIND(L_copy_8_bytes);
2528     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2529     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2530     __ decrement(qword_count);
2531     __ jcc(Assembler::notZero, L_copy_8_bytes);
2532 
2533     if (is_oop) {
2534       __ jmp(L_exit);
2535     } else {
2536       restore_arg_regs_using_thread();
2537       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2538       __ xorptr(rax, rax); // return 0
2539       __ vzeroupper();
2540       __ leave(); // required for proper stackwalking of RuntimeStub frame
2541       __ ret(0);
2542     }
2543 
2544     // Copy in multi-bytes chunks
2545     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2546 
2547     __ BIND(L_exit);
2548     bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count);
2549     restore_arg_regs_using_thread();
2550     if (is_oop) {
2551       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2552     } else {
2553       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2554     }
2555     __ vzeroupper();
2556     __ xorptr(rax, rax); // return 0
2557     __ leave(); // required for proper stackwalking of RuntimeStub frame
2558     __ ret(0);
2559 
2560     return start;
2561   }
2562 
2563 
2564   // Helper for generating a dynamic type check.
2565   // Smashes no registers.
2566   void generate_type_check(Register sub_klass,
2567                            Register super_check_offset,
2568                            Register super_klass,
2569                            Label& L_success) {
2570     assert_different_registers(sub_klass, super_check_offset, super_klass);
2571 
2572     BLOCK_COMMENT("type_check:");
2573 
2574     Label L_miss;
2575 
2576     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2577                                      super_check_offset);
2578     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2579 
2580     // Fall through on failure!
2581     __ BIND(L_miss);
2582   }
2583 
2584   //
2585   //  Generate checkcasting array copy stub
2586   //
2587   //  Input:
2588   //    c_rarg0   - source array address
2589   //    c_rarg1   - destination array address
2590   //    c_rarg2   - element count, treated as ssize_t, can be zero
2591   //    c_rarg3   - size_t ckoff (super_check_offset)
2592   // not Win64
2593   //    c_rarg4   - oop ckval (super_klass)
2594   // Win64
2595   //    rsp+40    - oop ckval (super_klass)
2596   //
2597   //  Output:
2598   //    rax ==  0  -  success
2599   //    rax == -1^K - failure, where K is partial transfer count
2600   //
2601   address generate_checkcast_copy(const char *name, address *entry,
2602                                   bool dest_uninitialized = false) {
2603 
2604     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2605 
2606     // Input registers (after setup_arg_regs)
2607     const Register from        = rdi;   // source array address
2608     const Register to          = rsi;   // destination array address
2609     const Register length      = rdx;   // elements count
2610     const Register ckoff       = rcx;   // super_check_offset
2611     const Register ckval       = r8;    // super_klass
2612 
2613     // Registers used as temps (r13, r14 are save-on-entry)
2614     const Register end_from    = from;  // source array end address
2615     const Register end_to      = r13;   // destination array end address
2616     const Register count       = rdx;   // -(count_remaining)
2617     const Register r14_length  = r14;   // saved copy of length
2618     // End pointers are inclusive, and if length is not zero they point
2619     // to the last unit copied:  end_to[0] := end_from[0]
2620 
2621     const Register rax_oop    = rax;    // actual oop copied
2622     const Register r11_klass  = r11;    // oop._klass
2623 
2624     //---------------------------------------------------------------
2625     // Assembler stub will be used for this call to arraycopy
2626     // if the two arrays are subtypes of Object[] but the
2627     // destination array type is not equal to or a supertype
2628     // of the source type.  Each element must be separately
2629     // checked.
2630 
2631     __ align(CodeEntryAlignment);
2632     StubCodeMark mark(this, "StubRoutines", name);
2633     address start = __ pc();
2634 
2635     __ enter(); // required for proper stackwalking of RuntimeStub frame
2636 
2637 #ifdef ASSERT
2638     // caller guarantees that the arrays really are different
2639     // otherwise, we would have to make conjoint checks
2640     { Label L;
2641       array_overlap_test(L, TIMES_OOP);
2642       __ stop("checkcast_copy within a single array");
2643       __ bind(L);
2644     }
2645 #endif //ASSERT
2646 
2647     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2648                        // ckoff => rcx, ckval => r8
2649                        // r9 and r10 may be used to save non-volatile registers
2650 #ifdef _WIN64
2651     // last argument (#4) is on stack on Win64
2652     __ movptr(ckval, Address(rsp, 6 * wordSize));
2653 #endif
2654 
2655     // Caller of this entry point must set up the argument registers.
2656     if (entry != NULL) {
2657       *entry = __ pc();
2658       BLOCK_COMMENT("Entry:");
2659     }
2660 
2661     // allocate spill slots for r13, r14
2662     enum {
2663       saved_r13_offset,
2664       saved_r14_offset,
2665       saved_r10_offset,
2666       saved_rbp_offset
2667     };
2668     __ subptr(rsp, saved_rbp_offset * wordSize);
2669     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2670     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2671     __ movptr(Address(rsp, saved_r10_offset * wordSize), r10);
2672 
2673 #ifdef ASSERT
2674       Label L2;
2675       __ get_thread(r14);
2676       __ cmpptr(r15_thread, r14);
2677       __ jcc(Assembler::equal, L2);
2678       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
2679       __ bind(L2);
2680 #endif // ASSERT
2681 
2682     // check that int operands are properly extended to size_t
2683     assert_clean_int(length, rax);
2684     assert_clean_int(ckoff, rax);
2685 
2686 #ifdef ASSERT
2687     BLOCK_COMMENT("assert consistent ckoff/ckval");
2688     // The ckoff and ckval must be mutually consistent,
2689     // even though caller generates both.
2690     { Label L;
2691       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2692       __ cmpl(ckoff, Address(ckval, sco_offset));
2693       __ jcc(Assembler::equal, L);
2694       __ stop("super_check_offset inconsistent");
2695       __ bind(L);
2696     }
2697 #endif //ASSERT
2698 
2699     // Loop-invariant addresses.  They are exclusive end pointers.
2700     Address end_from_addr(from, length, TIMES_OOP, 0);
2701     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2702     // Loop-variant addresses.  They assume post-incremented count < 0.
2703     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2704     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2705 
2706     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2707     if (dest_uninitialized) {
2708       decorators |= IS_DEST_UNINITIALIZED;
2709     }
2710 
2711     BasicType type = T_OBJECT;
2712     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2713     bs->arraycopy_prologue(_masm, decorators, type, from, to, count);
2714 
2715     // Copy from low to high addresses, indexed from the end of each array.
2716     __ lea(end_from, end_from_addr);
2717     __ lea(end_to,   end_to_addr);
2718     __ movptr(r14_length, length);        // save a copy of the length
2719     assert(length == count, "");          // else fix next line:
2720     __ negptr(count);                     // negate and test the length
2721     __ jcc(Assembler::notZero, L_load_element);
2722 
2723     // Empty array:  Nothing to do.
2724     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2725     __ jmp(L_done);
2726 
2727     // ======== begin loop ========
2728     // (Loop is rotated; its entry is L_load_element.)
2729     // Loop control:
2730     //   for (count = -count; count != 0; count++)
2731     // Base pointers src, dst are biased by 8*(count-1),to last element.
2732     __ align(OptoLoopAlignment);
2733 
2734     __ BIND(L_store_element);
2735     __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW);  // store the oop
2736     __ increment(count);               // increment the count toward zero
2737     __ jcc(Assembler::zero, L_do_card_marks);
2738 
2739     // ======== loop entry is here ========
2740     __ BIND(L_load_element);
2741     __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop
2742     __ testptr(rax_oop, rax_oop);
2743     __ jcc(Assembler::zero, L_store_element);
2744 
2745     __ load_klass(r11_klass, rax_oop);// query the object klass
2746     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2747     // ======== end loop ========
2748 
2749     // It was a real error; we must depend on the caller to finish the job.
2750     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2751     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2752     // and report their number to the caller.
2753     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2754     Label L_post_barrier;
2755     __ addptr(r14_length, count);     // K = (original - remaining) oops
2756     __ movptr(rax, r14_length);       // save the value
2757     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2758     __ jccb(Assembler::notZero, L_post_barrier);
2759     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2760 
2761     // Come here on success only.
2762     __ BIND(L_do_card_marks);
2763     __ xorptr(rax, rax);              // return 0 on success
2764 
2765     __ BIND(L_post_barrier);
2766     bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length);
2767 
2768     // Common exit point (success or failure).
2769     __ BIND(L_done);
2770     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2771     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2772     __ movptr(r10, Address(rsp, saved_r10_offset * wordSize));
2773     restore_arg_regs();
2774     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2775     __ leave(); // required for proper stackwalking of RuntimeStub frame
2776     __ ret(0);
2777 
2778     return start;
2779   }
2780 
2781   //
2782   //  Generate 'unsafe' array copy stub
2783   //  Though just as safe as the other stubs, it takes an unscaled
2784   //  size_t argument instead of an element count.
2785   //
2786   //  Input:
2787   //    c_rarg0   - source array address
2788   //    c_rarg1   - destination array address
2789   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2790   //
2791   // Examines the alignment of the operands and dispatches
2792   // to a long, int, short, or byte copy loop.
2793   //
2794   address generate_unsafe_copy(const char *name,
2795                                address byte_copy_entry, address short_copy_entry,
2796                                address int_copy_entry, address long_copy_entry) {
2797 
2798     Label L_long_aligned, L_int_aligned, L_short_aligned;
2799 
2800     // Input registers (before setup_arg_regs)
2801     const Register from        = c_rarg0;  // source array address
2802     const Register to          = c_rarg1;  // destination array address
2803     const Register size        = c_rarg2;  // byte count (size_t)
2804 
2805     // Register used as a temp
2806     const Register bits        = rax;      // test copy of low bits
2807 
2808     __ align(CodeEntryAlignment);
2809     StubCodeMark mark(this, "StubRoutines", name);
2810     address start = __ pc();
2811 
2812     __ enter(); // required for proper stackwalking of RuntimeStub frame
2813 
2814     // bump this on entry, not on exit:
2815     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2816 
2817     __ mov(bits, from);
2818     __ orptr(bits, to);
2819     __ orptr(bits, size);
2820 
2821     __ testb(bits, BytesPerLong-1);
2822     __ jccb(Assembler::zero, L_long_aligned);
2823 
2824     __ testb(bits, BytesPerInt-1);
2825     __ jccb(Assembler::zero, L_int_aligned);
2826 
2827     __ testb(bits, BytesPerShort-1);
2828     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2829 
2830     __ BIND(L_short_aligned);
2831     __ shrptr(size, LogBytesPerShort); // size => short_count
2832     __ jump(RuntimeAddress(short_copy_entry));
2833 
2834     __ BIND(L_int_aligned);
2835     __ shrptr(size, LogBytesPerInt); // size => int_count
2836     __ jump(RuntimeAddress(int_copy_entry));
2837 
2838     __ BIND(L_long_aligned);
2839     __ shrptr(size, LogBytesPerLong); // size => qword_count
2840     __ jump(RuntimeAddress(long_copy_entry));
2841 
2842     return start;
2843   }
2844 
2845   // Perform range checks on the proposed arraycopy.
2846   // Kills temp, but nothing else.
2847   // Also, clean the sign bits of src_pos and dst_pos.
2848   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2849                               Register src_pos, // source position (c_rarg1)
2850                               Register dst,     // destination array oo (c_rarg2)
2851                               Register dst_pos, // destination position (c_rarg3)
2852                               Register length,
2853                               Register temp,
2854                               Label& L_failed) {
2855     BLOCK_COMMENT("arraycopy_range_checks:");
2856 
2857     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2858     __ movl(temp, length);
2859     __ addl(temp, src_pos);             // src_pos + length
2860     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2861     __ jcc(Assembler::above, L_failed);
2862 
2863     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2864     __ movl(temp, length);
2865     __ addl(temp, dst_pos);             // dst_pos + length
2866     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2867     __ jcc(Assembler::above, L_failed);
2868 
2869     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2870     // Move with sign extension can be used since they are positive.
2871     __ movslq(src_pos, src_pos);
2872     __ movslq(dst_pos, dst_pos);
2873 
2874     BLOCK_COMMENT("arraycopy_range_checks done");
2875   }
2876 
2877   //
2878   //  Generate generic array copy stubs
2879   //
2880   //  Input:
2881   //    c_rarg0    -  src oop
2882   //    c_rarg1    -  src_pos (32-bits)
2883   //    c_rarg2    -  dst oop
2884   //    c_rarg3    -  dst_pos (32-bits)
2885   // not Win64
2886   //    c_rarg4    -  element count (32-bits)
2887   // Win64
2888   //    rsp+40     -  element count (32-bits)
2889   //
2890   //  Output:
2891   //    rax ==  0  -  success
2892   //    rax == -1^K - failure, where K is partial transfer count
2893   //
2894   address generate_generic_copy(const char *name,
2895                                 address byte_copy_entry, address short_copy_entry,
2896                                 address int_copy_entry, address oop_copy_entry,
2897                                 address long_copy_entry, address checkcast_copy_entry) {
2898 
2899     Label L_failed, L_failed_0, L_objArray;
2900     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2901 
2902     // Input registers
2903     const Register src        = c_rarg0;  // source array oop
2904     const Register src_pos    = c_rarg1;  // source position
2905     const Register dst        = c_rarg2;  // destination array oop
2906     const Register dst_pos    = c_rarg3;  // destination position
2907 #ifndef _WIN64
2908     const Register length     = c_rarg4;
2909 #else
2910     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2911 #endif
2912 
2913     { int modulus = CodeEntryAlignment;
2914       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2915       int advance = target - (__ offset() % modulus);
2916       if (advance < 0)  advance += modulus;
2917       if (advance > 0)  __ nop(advance);
2918     }
2919     StubCodeMark mark(this, "StubRoutines", name);
2920 
2921     // Short-hop target to L_failed.  Makes for denser prologue code.
2922     __ BIND(L_failed_0);
2923     __ jmp(L_failed);
2924     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2925 
2926     __ align(CodeEntryAlignment);
2927     address start = __ pc();
2928 
2929     __ enter(); // required for proper stackwalking of RuntimeStub frame
2930 
2931     // bump this on entry, not on exit:
2932     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2933 
2934     //-----------------------------------------------------------------------
2935     // Assembler stub will be used for this call to arraycopy
2936     // if the following conditions are met:
2937     //
2938     // (1) src and dst must not be null.
2939     // (2) src_pos must not be negative.
2940     // (3) dst_pos must not be negative.
2941     // (4) length  must not be negative.
2942     // (5) src klass and dst klass should be the same and not NULL.
2943     // (6) src and dst should be arrays.
2944     // (7) src_pos + length must not exceed length of src.
2945     // (8) dst_pos + length must not exceed length of dst.
2946     //
2947 
2948     //  if (src == NULL) return -1;
2949     __ testptr(src, src);         // src oop
2950     size_t j1off = __ offset();
2951     __ jccb(Assembler::zero, L_failed_0);
2952 
2953     //  if (src_pos < 0) return -1;
2954     __ testl(src_pos, src_pos); // src_pos (32-bits)
2955     __ jccb(Assembler::negative, L_failed_0);
2956 
2957     //  if (dst == NULL) return -1;
2958     __ testptr(dst, dst);         // dst oop
2959     __ jccb(Assembler::zero, L_failed_0);
2960 
2961     //  if (dst_pos < 0) return -1;
2962     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2963     size_t j4off = __ offset();
2964     __ jccb(Assembler::negative, L_failed_0);
2965 
2966     // The first four tests are very dense code,
2967     // but not quite dense enough to put four
2968     // jumps in a 16-byte instruction fetch buffer.
2969     // That's good, because some branch predicters
2970     // do not like jumps so close together.
2971     // Make sure of this.
2972     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2973 
2974     // registers used as temp
2975     const Register r11_length    = r11; // elements count to copy
2976     const Register r10_src_klass = r10; // array klass
2977 
2978     //  if (length < 0) return -1;
2979     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2980     __ testl(r11_length, r11_length);
2981     __ jccb(Assembler::negative, L_failed_0);
2982 
2983     __ load_klass(r10_src_klass, src);
2984 #ifdef ASSERT
2985     //  assert(src->klass() != NULL);
2986     {
2987       BLOCK_COMMENT("assert klasses not null {");
2988       Label L1, L2;
2989       __ testptr(r10_src_klass, r10_src_klass);
2990       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2991       __ bind(L1);
2992       __ stop("broken null klass");
2993       __ bind(L2);
2994       __ load_klass(rax, dst);
2995       __ cmpq(rax, 0);
2996       __ jcc(Assembler::equal, L1);     // this would be broken also
2997       BLOCK_COMMENT("} assert klasses not null done");
2998     }
2999 #endif
3000 
3001     // Load layout helper (32-bits)
3002     //
3003     //  |array_tag|     | header_size | element_type |     |log2_element_size|
3004     // 32        30    24            16              8     2                 0
3005     //
3006     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3007     //
3008 
3009     const int lh_offset = in_bytes(Klass::layout_helper_offset());
3010 
3011     // Handle objArrays completely differently...
3012     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3013     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
3014     __ jcc(Assembler::equal, L_objArray);
3015 
3016     //  if (src->klass() != dst->klass()) return -1;
3017     __ load_klass(rax, dst);
3018     __ cmpq(r10_src_klass, rax);
3019     __ jcc(Assembler::notEqual, L_failed);
3020 
3021     const Register rax_lh = rax;  // layout helper
3022     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
3023 
3024     //  if (!src->is_Array()) return -1;
3025     __ cmpl(rax_lh, Klass::_lh_neutral_value);
3026     __ jcc(Assembler::greaterEqual, L_failed);
3027 
3028     // At this point, it is known to be a typeArray (array_tag 0x3).
3029 #ifdef ASSERT
3030     {
3031       BLOCK_COMMENT("assert primitive array {");
3032       Label L;
3033       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
3034       __ jcc(Assembler::greaterEqual, L);
3035       __ stop("must be a primitive array");
3036       __ bind(L);
3037       BLOCK_COMMENT("} assert primitive array done");
3038     }
3039 #endif
3040 
3041     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3042                            r10, L_failed);
3043 
3044     // TypeArrayKlass
3045     //
3046     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3047     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3048     //
3049 
3050     const Register r10_offset = r10;    // array offset
3051     const Register rax_elsize = rax_lh; // element size
3052 
3053     __ movl(r10_offset, rax_lh);
3054     __ shrl(r10_offset, Klass::_lh_header_size_shift);
3055     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
3056     __ addptr(src, r10_offset);           // src array offset
3057     __ addptr(dst, r10_offset);           // dst array offset
3058     BLOCK_COMMENT("choose copy loop based on element size");
3059     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
3060 
3061     // next registers should be set before the jump to corresponding stub
3062     const Register from     = c_rarg0;  // source array address
3063     const Register to       = c_rarg1;  // destination array address
3064     const Register count    = c_rarg2;  // elements count
3065 
3066     // 'from', 'to', 'count' registers should be set in such order
3067     // since they are the same as 'src', 'src_pos', 'dst'.
3068 
3069   __ BIND(L_copy_bytes);
3070     __ cmpl(rax_elsize, 0);
3071     __ jccb(Assembler::notEqual, L_copy_shorts);
3072     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
3073     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
3074     __ movl2ptr(count, r11_length); // length
3075     __ jump(RuntimeAddress(byte_copy_entry));
3076 
3077   __ BIND(L_copy_shorts);
3078     __ cmpl(rax_elsize, LogBytesPerShort);
3079     __ jccb(Assembler::notEqual, L_copy_ints);
3080     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
3081     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
3082     __ movl2ptr(count, r11_length); // length
3083     __ jump(RuntimeAddress(short_copy_entry));
3084 
3085   __ BIND(L_copy_ints);
3086     __ cmpl(rax_elsize, LogBytesPerInt);
3087     __ jccb(Assembler::notEqual, L_copy_longs);
3088     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
3089     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
3090     __ movl2ptr(count, r11_length); // length
3091     __ jump(RuntimeAddress(int_copy_entry));
3092 
3093   __ BIND(L_copy_longs);
3094 #ifdef ASSERT
3095     {
3096       BLOCK_COMMENT("assert long copy {");
3097       Label L;
3098       __ cmpl(rax_elsize, LogBytesPerLong);
3099       __ jcc(Assembler::equal, L);
3100       __ stop("must be long copy, but elsize is wrong");
3101       __ bind(L);
3102       BLOCK_COMMENT("} assert long copy done");
3103     }
3104 #endif
3105     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
3106     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
3107     __ movl2ptr(count, r11_length); // length
3108     __ jump(RuntimeAddress(long_copy_entry));
3109 
3110     // ObjArrayKlass
3111   __ BIND(L_objArray);
3112     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
3113 
3114     Label L_plain_copy, L_checkcast_copy;
3115     //  test array classes for subtyping
3116     __ load_klass(rax, dst);
3117     __ cmpq(r10_src_klass, rax); // usual case is exact equality
3118     __ jcc(Assembler::notEqual, L_checkcast_copy);
3119 
3120     // Identically typed arrays can be copied without element-wise checks.
3121     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3122                            r10, L_failed);
3123 
3124     __ lea(from, Address(src, src_pos, TIMES_OOP,
3125                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
3126     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3127                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
3128     __ movl2ptr(count, r11_length); // length
3129   __ BIND(L_plain_copy);
3130     __ jump(RuntimeAddress(oop_copy_entry));
3131 
3132   __ BIND(L_checkcast_copy);
3133     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
3134     {
3135       // Before looking at dst.length, make sure dst is also an objArray.
3136       __ cmpl(Address(rax, lh_offset), objArray_lh);
3137       __ jcc(Assembler::notEqual, L_failed);
3138 
3139       // It is safe to examine both src.length and dst.length.
3140       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
3141                              rax, L_failed);
3142 
3143       const Register r11_dst_klass = r11;
3144       __ load_klass(r11_dst_klass, dst); // reload
3145 
3146       // Marshal the base address arguments now, freeing registers.
3147       __ lea(from, Address(src, src_pos, TIMES_OOP,
3148                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3149       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
3150                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
3151       __ movl(count, length);           // length (reloaded)
3152       Register sco_temp = c_rarg3;      // this register is free now
3153       assert_different_registers(from, to, count, sco_temp,
3154                                  r11_dst_klass, r10_src_klass);
3155       assert_clean_int(count, sco_temp);
3156 
3157       // Generate the type check.
3158       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
3159       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
3160       assert_clean_int(sco_temp, rax);
3161       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
3162 
3163       // Fetch destination element klass from the ObjArrayKlass header.
3164       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3165       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
3166       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
3167       assert_clean_int(sco_temp, rax);
3168 
3169       // the checkcast_copy loop needs two extra arguments:
3170       assert(c_rarg3 == sco_temp, "#3 already in place");
3171       // Set up arguments for checkcast_copy_entry.
3172       setup_arg_regs(4);
3173       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
3174       __ jump(RuntimeAddress(checkcast_copy_entry));
3175     }
3176 
3177   __ BIND(L_failed);
3178     __ xorptr(rax, rax);
3179     __ notptr(rax); // return -1
3180     __ leave();   // required for proper stackwalking of RuntimeStub frame
3181     __ ret(0);
3182 
3183     return start;
3184   }
3185 
3186   void generate_arraycopy_stubs() {
3187     address entry;
3188     address entry_jbyte_arraycopy;
3189     address entry_jshort_arraycopy;
3190     address entry_jint_arraycopy;
3191     address entry_oop_arraycopy;
3192     address entry_jlong_arraycopy;
3193     address entry_checkcast_arraycopy;
3194 
3195     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
3196                                                                            "jbyte_disjoint_arraycopy");
3197     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
3198                                                                            "jbyte_arraycopy");
3199 
3200     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
3201                                                                             "jshort_disjoint_arraycopy");
3202     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
3203                                                                             "jshort_arraycopy");
3204 
3205     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
3206                                                                               "jint_disjoint_arraycopy");
3207     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
3208                                                                               &entry_jint_arraycopy, "jint_arraycopy");
3209 
3210     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
3211                                                                                "jlong_disjoint_arraycopy");
3212     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
3213                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
3214 
3215 
3216     if (UseCompressedOops) {
3217       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
3218                                                                               "oop_disjoint_arraycopy");
3219       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
3220                                                                               &entry_oop_arraycopy, "oop_arraycopy");
3221       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
3222                                                                                      "oop_disjoint_arraycopy_uninit",
3223                                                                                      /*dest_uninitialized*/true);
3224       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
3225                                                                                      NULL, "oop_arraycopy_uninit",
3226                                                                                      /*dest_uninitialized*/true);
3227     } else {
3228       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
3229                                                                                "oop_disjoint_arraycopy");
3230       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
3231                                                                                &entry_oop_arraycopy, "oop_arraycopy");
3232       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
3233                                                                                       "oop_disjoint_arraycopy_uninit",
3234                                                                                       /*dest_uninitialized*/true);
3235       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
3236                                                                                       NULL, "oop_arraycopy_uninit",
3237                                                                                       /*dest_uninitialized*/true);
3238     }
3239 
3240     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3241     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3242                                                                         /*dest_uninitialized*/true);
3243 
3244     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3245                                                               entry_jbyte_arraycopy,
3246                                                               entry_jshort_arraycopy,
3247                                                               entry_jint_arraycopy,
3248                                                               entry_jlong_arraycopy);
3249     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3250                                                                entry_jbyte_arraycopy,
3251                                                                entry_jshort_arraycopy,
3252                                                                entry_jint_arraycopy,
3253                                                                entry_oop_arraycopy,
3254                                                                entry_jlong_arraycopy,
3255                                                                entry_checkcast_arraycopy);
3256 
3257     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3258     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3259     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3260     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3261     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3262     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3263 
3264     // We don't generate specialized code for HeapWord-aligned source
3265     // arrays, so just use the code we've already generated
3266     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
3267     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
3268 
3269     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
3270     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
3271 
3272     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
3273     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
3274 
3275     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
3276     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
3277 
3278     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
3279     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
3280 
3281     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
3282     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
3283   }
3284 
3285   // AES intrinsic stubs
3286   enum {AESBlockSize = 16};
3287 
3288   address generate_key_shuffle_mask() {
3289     __ align(16);
3290     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
3291     address start = __ pc();
3292     __ emit_data64( 0x0405060700010203, relocInfo::none );
3293     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
3294     return start;
3295   }
3296 
3297   address generate_counter_shuffle_mask() {
3298     __ align(16);
3299     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
3300     address start = __ pc();
3301     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3302     __ emit_data64(0x0001020304050607, relocInfo::none);
3303     return start;
3304   }
3305 
3306   // Utility routine for loading a 128-bit key word in little endian format
3307   // can optionally specify that the shuffle mask is already in an xmmregister
3308   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
3309     __ movdqu(xmmdst, Address(key, offset));
3310     if (xmm_shuf_mask != NULL) {
3311       __ pshufb(xmmdst, xmm_shuf_mask);
3312     } else {
3313       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3314     }
3315   }
3316 
3317   // Utility routine for increase 128bit counter (iv in CTR mode)
3318   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
3319     __ pextrq(reg, xmmdst, 0x0);
3320     __ addq(reg, inc_delta);
3321     __ pinsrq(xmmdst, reg, 0x0);
3322     __ jcc(Assembler::carryClear, next_block); // jump if no carry
3323     __ pextrq(reg, xmmdst, 0x01); // Carry
3324     __ addq(reg, 0x01);
3325     __ pinsrq(xmmdst, reg, 0x01); //Carry end
3326     __ BIND(next_block);          // next instruction
3327   }
3328 
3329   // Arguments:
3330   //
3331   // Inputs:
3332   //   c_rarg0   - source byte array address
3333   //   c_rarg1   - destination byte array address
3334   //   c_rarg2   - K (key) in little endian int array
3335   //
3336   address generate_aescrypt_encryptBlock() {
3337     assert(UseAES, "need AES instructions and misaligned SSE support");
3338     __ align(CodeEntryAlignment);
3339     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3340     Label L_doLast;
3341     address start = __ pc();
3342 
3343     const Register from        = c_rarg0;  // source array address
3344     const Register to          = c_rarg1;  // destination array address
3345     const Register key         = c_rarg2;  // key array address
3346     const Register keylen      = rax;
3347 
3348     const XMMRegister xmm_result = xmm0;
3349     const XMMRegister xmm_key_shuf_mask = xmm1;
3350     // On win64 xmm6-xmm15 must be preserved so don't use them.
3351     const XMMRegister xmm_temp1  = xmm2;
3352     const XMMRegister xmm_temp2  = xmm3;
3353     const XMMRegister xmm_temp3  = xmm4;
3354     const XMMRegister xmm_temp4  = xmm5;
3355 
3356     __ enter(); // required for proper stackwalking of RuntimeStub frame
3357 
3358     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3359     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3360 
3361     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3362     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3363 
3364     // For encryption, the java expanded key ordering is just what we need
3365     // we don't know if the key is aligned, hence not using load-execute form
3366 
3367     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3368     __ pxor(xmm_result, xmm_temp1);
3369 
3370     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3371     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3372     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3373     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3374 
3375     __ aesenc(xmm_result, xmm_temp1);
3376     __ aesenc(xmm_result, xmm_temp2);
3377     __ aesenc(xmm_result, xmm_temp3);
3378     __ aesenc(xmm_result, xmm_temp4);
3379 
3380     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3381     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3382     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3383     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3384 
3385     __ aesenc(xmm_result, xmm_temp1);
3386     __ aesenc(xmm_result, xmm_temp2);
3387     __ aesenc(xmm_result, xmm_temp3);
3388     __ aesenc(xmm_result, xmm_temp4);
3389 
3390     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3391     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3392 
3393     __ cmpl(keylen, 44);
3394     __ jccb(Assembler::equal, L_doLast);
3395 
3396     __ aesenc(xmm_result, xmm_temp1);
3397     __ aesenc(xmm_result, xmm_temp2);
3398 
3399     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3400     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3401 
3402     __ cmpl(keylen, 52);
3403     __ jccb(Assembler::equal, L_doLast);
3404 
3405     __ aesenc(xmm_result, xmm_temp1);
3406     __ aesenc(xmm_result, xmm_temp2);
3407 
3408     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3409     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3410 
3411     __ BIND(L_doLast);
3412     __ aesenc(xmm_result, xmm_temp1);
3413     __ aesenclast(xmm_result, xmm_temp2);
3414     __ movdqu(Address(to, 0), xmm_result);        // store the result
3415     __ xorptr(rax, rax); // return 0
3416     __ leave(); // required for proper stackwalking of RuntimeStub frame
3417     __ ret(0);
3418 
3419     return start;
3420   }
3421 
3422 
3423   // Arguments:
3424   //
3425   // Inputs:
3426   //   c_rarg0   - source byte array address
3427   //   c_rarg1   - destination byte array address
3428   //   c_rarg2   - K (key) in little endian int array
3429   //
3430   address generate_aescrypt_decryptBlock() {
3431     assert(UseAES, "need AES instructions and misaligned SSE support");
3432     __ align(CodeEntryAlignment);
3433     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3434     Label L_doLast;
3435     address start = __ pc();
3436 
3437     const Register from        = c_rarg0;  // source array address
3438     const Register to          = c_rarg1;  // destination array address
3439     const Register key         = c_rarg2;  // key array address
3440     const Register keylen      = rax;
3441 
3442     const XMMRegister xmm_result = xmm0;
3443     const XMMRegister xmm_key_shuf_mask = xmm1;
3444     // On win64 xmm6-xmm15 must be preserved so don't use them.
3445     const XMMRegister xmm_temp1  = xmm2;
3446     const XMMRegister xmm_temp2  = xmm3;
3447     const XMMRegister xmm_temp3  = xmm4;
3448     const XMMRegister xmm_temp4  = xmm5;
3449 
3450     __ enter(); // required for proper stackwalking of RuntimeStub frame
3451 
3452     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3453     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3454 
3455     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3456     __ movdqu(xmm_result, Address(from, 0));
3457 
3458     // for decryption java expanded key ordering is rotated one position from what we want
3459     // so we start from 0x10 here and hit 0x00 last
3460     // we don't know if the key is aligned, hence not using load-execute form
3461     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3462     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3463     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3464     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3465 
3466     __ pxor  (xmm_result, xmm_temp1);
3467     __ aesdec(xmm_result, xmm_temp2);
3468     __ aesdec(xmm_result, xmm_temp3);
3469     __ aesdec(xmm_result, xmm_temp4);
3470 
3471     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3472     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3473     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3474     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3475 
3476     __ aesdec(xmm_result, xmm_temp1);
3477     __ aesdec(xmm_result, xmm_temp2);
3478     __ aesdec(xmm_result, xmm_temp3);
3479     __ aesdec(xmm_result, xmm_temp4);
3480 
3481     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3482     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3483     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3484 
3485     __ cmpl(keylen, 44);
3486     __ jccb(Assembler::equal, L_doLast);
3487 
3488     __ aesdec(xmm_result, xmm_temp1);
3489     __ aesdec(xmm_result, xmm_temp2);
3490 
3491     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3492     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3493 
3494     __ cmpl(keylen, 52);
3495     __ jccb(Assembler::equal, L_doLast);
3496 
3497     __ aesdec(xmm_result, xmm_temp1);
3498     __ aesdec(xmm_result, xmm_temp2);
3499 
3500     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3501     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3502 
3503     __ BIND(L_doLast);
3504     __ aesdec(xmm_result, xmm_temp1);
3505     __ aesdec(xmm_result, xmm_temp2);
3506 
3507     // for decryption the aesdeclast operation is always on key+0x00
3508     __ aesdeclast(xmm_result, xmm_temp3);
3509     __ movdqu(Address(to, 0), xmm_result);  // store the result
3510     __ xorptr(rax, rax); // return 0
3511     __ leave(); // required for proper stackwalking of RuntimeStub frame
3512     __ ret(0);
3513 
3514     return start;
3515   }
3516 
3517 
3518   // Arguments:
3519   //
3520   // Inputs:
3521   //   c_rarg0   - source byte array address
3522   //   c_rarg1   - destination byte array address
3523   //   c_rarg2   - K (key) in little endian int array
3524   //   c_rarg3   - r vector byte array address
3525   //   c_rarg4   - input length
3526   //
3527   // Output:
3528   //   rax       - input length
3529   //
3530   address generate_cipherBlockChaining_encryptAESCrypt() {
3531     assert(UseAES, "need AES instructions and misaligned SSE support");
3532     __ align(CodeEntryAlignment);
3533     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3534     address start = __ pc();
3535 
3536     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3537     const Register from        = c_rarg0;  // source array address
3538     const Register to          = c_rarg1;  // destination array address
3539     const Register key         = c_rarg2;  // key array address
3540     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3541                                            // and left with the results of the last encryption block
3542 #ifndef _WIN64
3543     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3544 #else
3545     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3546     const Register len_reg     = r11;      // pick the volatile windows register
3547 #endif
3548     const Register pos         = rax;
3549 
3550     // xmm register assignments for the loops below
3551     const XMMRegister xmm_result = xmm0;
3552     const XMMRegister xmm_temp   = xmm1;
3553     // keys 0-10 preloaded into xmm2-xmm12
3554     const int XMM_REG_NUM_KEY_FIRST = 2;
3555     const int XMM_REG_NUM_KEY_LAST  = 15;
3556     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3557     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3558     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3559     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3560     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3561 
3562     __ enter(); // required for proper stackwalking of RuntimeStub frame
3563 
3564 #ifdef _WIN64
3565     // on win64, fill len_reg from stack position
3566     __ movl(len_reg, len_mem);
3567 #else
3568     __ push(len_reg); // Save
3569 #endif
3570 
3571     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3572     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3573     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3574     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3575       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3576       offset += 0x10;
3577     }
3578     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3579 
3580     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3581     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3582     __ cmpl(rax, 44);
3583     __ jcc(Assembler::notEqual, L_key_192_256);
3584 
3585     // 128 bit code follows here
3586     __ movptr(pos, 0);
3587     __ align(OptoLoopAlignment);
3588 
3589     __ BIND(L_loopTop_128);
3590     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3591     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3592     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3593     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3594       __ aesenc(xmm_result, as_XMMRegister(rnum));
3595     }
3596     __ aesenclast(xmm_result, xmm_key10);
3597     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3598     // no need to store r to memory until we exit
3599     __ addptr(pos, AESBlockSize);
3600     __ subptr(len_reg, AESBlockSize);
3601     __ jcc(Assembler::notEqual, L_loopTop_128);
3602 
3603     __ BIND(L_exit);
3604     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3605 
3606 #ifdef _WIN64
3607     __ movl(rax, len_mem);
3608 #else
3609     __ pop(rax); // return length
3610 #endif
3611     __ leave(); // required for proper stackwalking of RuntimeStub frame
3612     __ ret(0);
3613 
3614     __ BIND(L_key_192_256);
3615     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3616     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3617     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3618     __ cmpl(rax, 52);
3619     __ jcc(Assembler::notEqual, L_key_256);
3620 
3621     // 192-bit code follows here (could be changed to use more xmm registers)
3622     __ movptr(pos, 0);
3623     __ align(OptoLoopAlignment);
3624 
3625     __ BIND(L_loopTop_192);
3626     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3627     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3628     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3629     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3630       __ aesenc(xmm_result, as_XMMRegister(rnum));
3631     }
3632     __ aesenclast(xmm_result, xmm_key12);
3633     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3634     // no need to store r to memory until we exit
3635     __ addptr(pos, AESBlockSize);
3636     __ subptr(len_reg, AESBlockSize);
3637     __ jcc(Assembler::notEqual, L_loopTop_192);
3638     __ jmp(L_exit);
3639 
3640     __ BIND(L_key_256);
3641     // 256-bit code follows here (could be changed to use more xmm registers)
3642     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3643     __ movptr(pos, 0);
3644     __ align(OptoLoopAlignment);
3645 
3646     __ BIND(L_loopTop_256);
3647     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3648     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3649     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3650     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3651       __ aesenc(xmm_result, as_XMMRegister(rnum));
3652     }
3653     load_key(xmm_temp, key, 0xe0);
3654     __ aesenclast(xmm_result, xmm_temp);
3655     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3656     // no need to store r to memory until we exit
3657     __ addptr(pos, AESBlockSize);
3658     __ subptr(len_reg, AESBlockSize);
3659     __ jcc(Assembler::notEqual, L_loopTop_256);
3660     __ jmp(L_exit);
3661 
3662     return start;
3663   }
3664 
3665   // Safefetch stubs.
3666   void generate_safefetch(const char* name, int size, address* entry,
3667                           address* fault_pc, address* continuation_pc) {
3668     // safefetch signatures:
3669     //   int      SafeFetch32(int*      adr, int      errValue);
3670     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3671     //
3672     // arguments:
3673     //   c_rarg0 = adr
3674     //   c_rarg1 = errValue
3675     //
3676     // result:
3677     //   PPC_RET  = *adr or errValue
3678 
3679     StubCodeMark mark(this, "StubRoutines", name);
3680 
3681     // Entry point, pc or function descriptor.
3682     *entry = __ pc();
3683 
3684     // Load *adr into c_rarg1, may fault.
3685     *fault_pc = __ pc();
3686     switch (size) {
3687       case 4:
3688         // int32_t
3689         __ movl(c_rarg1, Address(c_rarg0, 0));
3690         break;
3691       case 8:
3692         // int64_t
3693         __ movq(c_rarg1, Address(c_rarg0, 0));
3694         break;
3695       default:
3696         ShouldNotReachHere();
3697     }
3698 
3699     // return errValue or *adr
3700     *continuation_pc = __ pc();
3701     __ movq(rax, c_rarg1);
3702     __ ret(0);
3703   }
3704 
3705   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3706   // to hide instruction latency
3707   //
3708   // Arguments:
3709   //
3710   // Inputs:
3711   //   c_rarg0   - source byte array address
3712   //   c_rarg1   - destination byte array address
3713   //   c_rarg2   - K (key) in little endian int array
3714   //   c_rarg3   - r vector byte array address
3715   //   c_rarg4   - input length
3716   //
3717   // Output:
3718   //   rax       - input length
3719   //
3720   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3721     assert(UseAES, "need AES instructions and misaligned SSE support");
3722     __ align(CodeEntryAlignment);
3723     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3724     address start = __ pc();
3725 
3726     const Register from        = c_rarg0;  // source array address
3727     const Register to          = c_rarg1;  // destination array address
3728     const Register key         = c_rarg2;  // key array address
3729     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3730                                            // and left with the results of the last encryption block
3731 #ifndef _WIN64
3732     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3733 #else
3734     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
3735     const Register len_reg     = r11;      // pick the volatile windows register
3736 #endif
3737     const Register pos         = rax;
3738 
3739     const int PARALLEL_FACTOR = 4;
3740     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3741 
3742     Label L_exit;
3743     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3744     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3745     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3746     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3747     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3748 
3749     // keys 0-10 preloaded into xmm5-xmm15
3750     const int XMM_REG_NUM_KEY_FIRST = 5;
3751     const int XMM_REG_NUM_KEY_LAST  = 15;
3752     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3753     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3754 
3755     __ enter(); // required for proper stackwalking of RuntimeStub frame
3756 
3757 #ifdef _WIN64
3758     // on win64, fill len_reg from stack position
3759     __ movl(len_reg, len_mem);
3760 #else
3761     __ push(len_reg); // Save
3762 #endif
3763     __ push(rbx);
3764     // the java expanded key ordering is rotated one position from what we want
3765     // so we start from 0x10 here and hit 0x00 last
3766     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3767     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3768     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3769     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3770       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3771       offset += 0x10;
3772     }
3773     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3774 
3775     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3776 
3777     // registers holding the four results in the parallelized loop
3778     const XMMRegister xmm_result0 = xmm0;
3779     const XMMRegister xmm_result1 = xmm2;
3780     const XMMRegister xmm_result2 = xmm3;
3781     const XMMRegister xmm_result3 = xmm4;
3782 
3783     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3784 
3785     __ xorptr(pos, pos);
3786 
3787     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3788     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3789     __ cmpl(rbx, 52);
3790     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3791     __ cmpl(rbx, 60);
3792     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3793 
3794 #define DoFour(opc, src_reg)           \
3795   __ opc(xmm_result0, src_reg);         \
3796   __ opc(xmm_result1, src_reg);         \
3797   __ opc(xmm_result2, src_reg);         \
3798   __ opc(xmm_result3, src_reg);         \
3799 
3800     for (int k = 0; k < 3; ++k) {
3801       __ BIND(L_multiBlock_loopTopHead[k]);
3802       if (k != 0) {
3803         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3804         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3805       }
3806       if (k == 1) {
3807         __ subptr(rsp, 6 * wordSize);
3808         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3809         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3810         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3811         load_key(xmm1, key, 0xc0);  // 0xc0;
3812         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3813       } else if (k == 2) {
3814         __ subptr(rsp, 10 * wordSize);
3815         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3816         load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes upto 0xe0
3817         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3818         load_key(xmm1, key, 0xe0);  // 0xe0;
3819         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3820         load_key(xmm15, key, 0xb0); // 0xb0;
3821         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3822         load_key(xmm1, key, 0xc0);  // 0xc0;
3823         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3824       }
3825       __ align(OptoLoopAlignment);
3826       __ BIND(L_multiBlock_loopTop[k]);
3827       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3828       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3829 
3830       if  (k != 0) {
3831         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3832         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3833       }
3834 
3835       __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers
3836       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3837       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3838       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3839 
3840       DoFour(pxor, xmm_key_first);
3841       if (k == 0) {
3842         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3843           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3844         }
3845         DoFour(aesdeclast, xmm_key_last);
3846       } else if (k == 1) {
3847         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3848           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3849         }
3850         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3851         DoFour(aesdec, xmm1);  // key : 0xc0
3852         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3853         DoFour(aesdeclast, xmm_key_last);
3854       } else if (k == 2) {
3855         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3856           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3857         }
3858         DoFour(aesdec, xmm1);  // key : 0xc0
3859         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3860         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3861         DoFour(aesdec, xmm15);  // key : 0xd0
3862         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3863         DoFour(aesdec, xmm1);  // key : 0xe0
3864         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3865         DoFour(aesdeclast, xmm_key_last);
3866       }
3867 
3868       // for each result, xor with the r vector of previous cipher block
3869       __ pxor(xmm_result0, xmm_prev_block_cipher);
3870       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3871       __ pxor(xmm_result1, xmm_prev_block_cipher);
3872       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3873       __ pxor(xmm_result2, xmm_prev_block_cipher);
3874       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3875       __ pxor(xmm_result3, xmm_prev_block_cipher);
3876       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
3877       if (k != 0) {
3878         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3879       }
3880 
3881       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3882       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3883       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3884       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3885 
3886       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3887       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3888       __ jmp(L_multiBlock_loopTop[k]);
3889 
3890       // registers used in the non-parallelized loops
3891       // xmm register assignments for the loops below
3892       const XMMRegister xmm_result = xmm0;
3893       const XMMRegister xmm_prev_block_cipher_save = xmm2;
3894       const XMMRegister xmm_key11 = xmm3;
3895       const XMMRegister xmm_key12 = xmm4;
3896       const XMMRegister key_tmp = xmm4;
3897 
3898       __ BIND(L_singleBlock_loopTopHead[k]);
3899       if (k == 1) {
3900         __ addptr(rsp, 6 * wordSize);
3901       } else if (k == 2) {
3902         __ addptr(rsp, 10 * wordSize);
3903       }
3904       __ cmpptr(len_reg, 0); // any blocks left??
3905       __ jcc(Assembler::equal, L_exit);
3906       __ BIND(L_singleBlock_loopTopHead2[k]);
3907       if (k == 1) {
3908         load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes upto 0xc0
3909         load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes upto 0xc0
3910       }
3911       if (k == 2) {
3912         load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes upto 0xe0
3913       }
3914       __ align(OptoLoopAlignment);
3915       __ BIND(L_singleBlock_loopTop[k]);
3916       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3917       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3918       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3919       for (int rnum = 1; rnum <= 9 ; rnum++) {
3920           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3921       }
3922       if (k == 1) {
3923         __ aesdec(xmm_result, xmm_key11);
3924         __ aesdec(xmm_result, xmm_key12);
3925       }
3926       if (k == 2) {
3927         __ aesdec(xmm_result, xmm_key11);
3928         load_key(key_tmp, key, 0xc0);
3929         __ aesdec(xmm_result, key_tmp);
3930         load_key(key_tmp, key, 0xd0);
3931         __ aesdec(xmm_result, key_tmp);
3932         load_key(key_tmp, key, 0xe0);
3933         __ aesdec(xmm_result, key_tmp);
3934       }
3935 
3936       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3937       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3938       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3939       // no need to store r to memory until we exit
3940       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3941       __ addptr(pos, AESBlockSize);
3942       __ subptr(len_reg, AESBlockSize);
3943       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3944       if (k != 2) {
3945         __ jmp(L_exit);
3946       }
3947     } //for 128/192/256
3948 
3949     __ BIND(L_exit);
3950     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3951     __ pop(rbx);
3952 #ifdef _WIN64
3953     __ movl(rax, len_mem);
3954 #else
3955     __ pop(rax); // return length
3956 #endif
3957     __ leave(); // required for proper stackwalking of RuntimeStub frame
3958     __ ret(0);
3959     return start;
3960 }
3961 
3962   address generate_upper_word_mask() {
3963     __ align(64);
3964     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3965     address start = __ pc();
3966     __ emit_data64(0x0000000000000000, relocInfo::none);
3967     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3968     return start;
3969   }
3970 
3971   address generate_shuffle_byte_flip_mask() {
3972     __ align(64);
3973     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3974     address start = __ pc();
3975     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3976     __ emit_data64(0x0001020304050607, relocInfo::none);
3977     return start;
3978   }
3979 
3980   // ofs and limit are use for multi-block byte array.
3981   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3982   address generate_sha1_implCompress(bool multi_block, const char *name) {
3983     __ align(CodeEntryAlignment);
3984     StubCodeMark mark(this, "StubRoutines", name);
3985     address start = __ pc();
3986 
3987     Register buf = c_rarg0;
3988     Register state = c_rarg1;
3989     Register ofs = c_rarg2;
3990     Register limit = c_rarg3;
3991 
3992     const XMMRegister abcd = xmm0;
3993     const XMMRegister e0 = xmm1;
3994     const XMMRegister e1 = xmm2;
3995     const XMMRegister msg0 = xmm3;
3996 
3997     const XMMRegister msg1 = xmm4;
3998     const XMMRegister msg2 = xmm5;
3999     const XMMRegister msg3 = xmm6;
4000     const XMMRegister shuf_mask = xmm7;
4001 
4002     __ enter();
4003 
4004     __ subptr(rsp, 4 * wordSize);
4005 
4006     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
4007       buf, state, ofs, limit, rsp, multi_block);
4008 
4009     __ addptr(rsp, 4 * wordSize);
4010 
4011     __ leave();
4012     __ ret(0);
4013     return start;
4014   }
4015 
4016   address generate_pshuffle_byte_flip_mask() {
4017     __ align(64);
4018     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
4019     address start = __ pc();
4020     __ emit_data64(0x0405060700010203, relocInfo::none);
4021     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4022 
4023     if (VM_Version::supports_avx2()) {
4024       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
4025       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
4026       // _SHUF_00BA
4027       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4028       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4029       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4030       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4031       // _SHUF_DC00
4032       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4033       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4034       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4035       __ emit_data64(0x0b0a090803020100, relocInfo::none);
4036     }
4037 
4038     return start;
4039   }
4040 
4041   //Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
4042   address generate_pshuffle_byte_flip_mask_sha512() {
4043     __ align(32);
4044     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
4045     address start = __ pc();
4046     if (VM_Version::supports_avx2()) {
4047       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
4048       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
4049       __ emit_data64(0x1011121314151617, relocInfo::none);
4050       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
4051       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
4052       __ emit_data64(0x0000000000000000, relocInfo::none);
4053       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4054       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
4055     }
4056 
4057     return start;
4058   }
4059 
4060 // ofs and limit are use for multi-block byte array.
4061 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
4062   address generate_sha256_implCompress(bool multi_block, const char *name) {
4063     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
4064     __ align(CodeEntryAlignment);
4065     StubCodeMark mark(this, "StubRoutines", name);
4066     address start = __ pc();
4067 
4068     Register buf = c_rarg0;
4069     Register state = c_rarg1;
4070     Register ofs = c_rarg2;
4071     Register limit = c_rarg3;
4072 
4073     const XMMRegister msg = xmm0;
4074     const XMMRegister state0 = xmm1;
4075     const XMMRegister state1 = xmm2;
4076     const XMMRegister msgtmp0 = xmm3;
4077 
4078     const XMMRegister msgtmp1 = xmm4;
4079     const XMMRegister msgtmp2 = xmm5;
4080     const XMMRegister msgtmp3 = xmm6;
4081     const XMMRegister msgtmp4 = xmm7;
4082 
4083     const XMMRegister shuf_mask = xmm8;
4084 
4085     __ enter();
4086 
4087     __ subptr(rsp, 4 * wordSize);
4088 
4089     if (VM_Version::supports_sha()) {
4090       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4091         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4092     } else if (VM_Version::supports_avx2()) {
4093       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4094         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4095     }
4096     __ addptr(rsp, 4 * wordSize);
4097     __ vzeroupper();
4098     __ leave();
4099     __ ret(0);
4100     return start;
4101   }
4102 
4103   address generate_sha512_implCompress(bool multi_block, const char *name) {
4104     assert(VM_Version::supports_avx2(), "");
4105     assert(VM_Version::supports_bmi2(), "");
4106     __ align(CodeEntryAlignment);
4107     StubCodeMark mark(this, "StubRoutines", name);
4108     address start = __ pc();
4109 
4110     Register buf = c_rarg0;
4111     Register state = c_rarg1;
4112     Register ofs = c_rarg2;
4113     Register limit = c_rarg3;
4114 
4115     const XMMRegister msg = xmm0;
4116     const XMMRegister state0 = xmm1;
4117     const XMMRegister state1 = xmm2;
4118     const XMMRegister msgtmp0 = xmm3;
4119     const XMMRegister msgtmp1 = xmm4;
4120     const XMMRegister msgtmp2 = xmm5;
4121     const XMMRegister msgtmp3 = xmm6;
4122     const XMMRegister msgtmp4 = xmm7;
4123 
4124     const XMMRegister shuf_mask = xmm8;
4125 
4126     __ enter();
4127 
4128     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
4129     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
4130 
4131     __ vzeroupper();
4132     __ leave();
4133     __ ret(0);
4134     return start;
4135   }
4136 
4137   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
4138   // to hide instruction latency
4139   //
4140   // Arguments:
4141   //
4142   // Inputs:
4143   //   c_rarg0   - source byte array address
4144   //   c_rarg1   - destination byte array address
4145   //   c_rarg2   - K (key) in little endian int array
4146   //   c_rarg3   - counter vector byte array address
4147   //   Linux
4148   //     c_rarg4   -          input length
4149   //     c_rarg5   -          saved encryptedCounter start
4150   //     rbp + 6 * wordSize - saved used length
4151   //   Windows
4152   //     rbp + 6 * wordSize - input length
4153   //     rbp + 7 * wordSize - saved encryptedCounter start
4154   //     rbp + 8 * wordSize - saved used length
4155   //
4156   // Output:
4157   //   rax       - input length
4158   //
4159   address generate_counterMode_AESCrypt_Parallel() {
4160     assert(UseAES, "need AES instructions and misaligned SSE support");
4161     __ align(CodeEntryAlignment);
4162     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
4163     address start = __ pc();
4164     const Register from = c_rarg0; // source array address
4165     const Register to = c_rarg1; // destination array address
4166     const Register key = c_rarg2; // key array address
4167     const Register counter = c_rarg3; // counter byte array initialized from counter array address
4168                                       // and updated with the incremented counter in the end
4169 #ifndef _WIN64
4170     const Register len_reg = c_rarg4;
4171     const Register saved_encCounter_start = c_rarg5;
4172     const Register used_addr = r10;
4173     const Address  used_mem(rbp, 2 * wordSize);
4174     const Register used = r11;
4175 #else
4176     const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
4177     const Address saved_encCounter_mem(rbp, 7 * wordSize); // length is on stack on Win64
4178     const Address used_mem(rbp, 8 * wordSize); // length is on stack on Win64
4179     const Register len_reg = r10; // pick the first volatile windows register
4180     const Register saved_encCounter_start = r11;
4181     const Register used_addr = r13;
4182     const Register used = r14;
4183 #endif
4184     const Register pos = rax;
4185 
4186     const int PARALLEL_FACTOR = 6;
4187     const XMMRegister xmm_counter_shuf_mask = xmm0;
4188     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
4189     const XMMRegister xmm_curr_counter = xmm2;
4190 
4191     const XMMRegister xmm_key_tmp0 = xmm3;
4192     const XMMRegister xmm_key_tmp1 = xmm4;
4193 
4194     // registers holding the four results in the parallelized loop
4195     const XMMRegister xmm_result0 = xmm5;
4196     const XMMRegister xmm_result1 = xmm6;
4197     const XMMRegister xmm_result2 = xmm7;
4198     const XMMRegister xmm_result3 = xmm8;
4199     const XMMRegister xmm_result4 = xmm9;
4200     const XMMRegister xmm_result5 = xmm10;
4201 
4202     const XMMRegister xmm_from0 = xmm11;
4203     const XMMRegister xmm_from1 = xmm12;
4204     const XMMRegister xmm_from2 = xmm13;
4205     const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64.
4206     const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. Because xmm_key_tmp0~1 are useless when loading input text
4207     const XMMRegister xmm_from5 = xmm4;
4208 
4209     //for key_128, key_192, key_256
4210     const int rounds[3] = {10, 12, 14};
4211     Label L_exit_preLoop, L_preLoop_start;
4212     Label L_multiBlock_loopTop[3];
4213     Label L_singleBlockLoopTop[3];
4214     Label L__incCounter[3][6]; //for 6 blocks
4215     Label L__incCounter_single[3]; //for single block, key128, key192, key256
4216     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
4217     Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
4218 
4219     Label L_exit;
4220 
4221     __ enter(); // required for proper stackwalking of RuntimeStub frame
4222 
4223 #ifdef _WIN64
4224     // allocate spill slots for r13, r14
4225     enum {
4226         saved_r13_offset,
4227         saved_r14_offset
4228     };
4229     __ subptr(rsp, 2 * wordSize);
4230     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
4231     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
4232 
4233     // on win64, fill len_reg from stack position
4234     __ movl(len_reg, len_mem);
4235     __ movptr(saved_encCounter_start, saved_encCounter_mem);
4236     __ movptr(used_addr, used_mem);
4237     __ movl(used, Address(used_addr, 0));
4238 #else
4239     __ push(len_reg); // Save
4240     __ movptr(used_addr, used_mem);
4241     __ movl(used, Address(used_addr, 0));
4242 #endif
4243 
4244     __ push(rbx); // Save RBX
4245     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
4246     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch
4247     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
4248     __ movptr(pos, 0);
4249 
4250     // Use the partially used encrpyted counter from last invocation
4251     __ BIND(L_preLoop_start);
4252     __ cmpptr(used, 16);
4253     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
4254       __ cmpptr(len_reg, 0);
4255       __ jcc(Assembler::lessEqual, L_exit_preLoop);
4256       __ movb(rbx, Address(saved_encCounter_start, used));
4257       __ xorb(rbx, Address(from, pos));
4258       __ movb(Address(to, pos), rbx);
4259       __ addptr(pos, 1);
4260       __ addptr(used, 1);
4261       __ subptr(len_reg, 1);
4262 
4263     __ jmp(L_preLoop_start);
4264 
4265     __ BIND(L_exit_preLoop);
4266     __ movl(Address(used_addr, 0), used);
4267 
4268     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
4269     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch
4270     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4271     __ cmpl(rbx, 52);
4272     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
4273     __ cmpl(rbx, 60);
4274     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
4275 
4276 #define CTR_DoSix(opc, src_reg)                \
4277     __ opc(xmm_result0, src_reg);              \
4278     __ opc(xmm_result1, src_reg);              \
4279     __ opc(xmm_result2, src_reg);              \
4280     __ opc(xmm_result3, src_reg);              \
4281     __ opc(xmm_result4, src_reg);              \
4282     __ opc(xmm_result5, src_reg);
4283 
4284     // k == 0 :  generate code for key_128
4285     // k == 1 :  generate code for key_192
4286     // k == 2 :  generate code for key_256
4287     for (int k = 0; k < 3; ++k) {
4288       //multi blocks starts here
4289       __ align(OptoLoopAlignment);
4290       __ BIND(L_multiBlock_loopTop[k]);
4291       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
4292       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
4293       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4294 
4295       //load, then increase counters
4296       CTR_DoSix(movdqa, xmm_curr_counter);
4297       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
4298       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
4299       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
4300       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
4301       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
4302       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
4303       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
4304       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
4305 
4306       //load two ROUND_KEYs at a time
4307       for (int i = 1; i < rounds[k]; ) {
4308         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4309         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4310         CTR_DoSix(aesenc, xmm_key_tmp1);
4311         i++;
4312         if (i != rounds[k]) {
4313           CTR_DoSix(aesenc, xmm_key_tmp0);
4314         } else {
4315           CTR_DoSix(aesenclast, xmm_key_tmp0);
4316         }
4317         i++;
4318       }
4319 
4320       // get next PARALLEL_FACTOR blocks into xmm_result registers
4321       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4322       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4323       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4324       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4325       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4326       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4327 
4328       __ pxor(xmm_result0, xmm_from0);
4329       __ pxor(xmm_result1, xmm_from1);
4330       __ pxor(xmm_result2, xmm_from2);
4331       __ pxor(xmm_result3, xmm_from3);
4332       __ pxor(xmm_result4, xmm_from4);
4333       __ pxor(xmm_result5, xmm_from5);
4334 
4335       // store 6 results into the next 64 bytes of output
4336       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4337       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4338       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4339       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4340       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4341       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4342 
4343       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text
4344       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4345       __ jmp(L_multiBlock_loopTop[k]);
4346 
4347       // singleBlock starts here
4348       __ align(OptoLoopAlignment);
4349       __ BIND(L_singleBlockLoopTop[k]);
4350       __ cmpptr(len_reg, 0);
4351       __ jcc(Assembler::lessEqual, L_exit);
4352       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4353       __ movdqa(xmm_result0, xmm_curr_counter);
4354       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4355       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4356       __ pxor(xmm_result0, xmm_key_tmp0);
4357       for (int i = 1; i < rounds[k]; i++) {
4358         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4359         __ aesenc(xmm_result0, xmm_key_tmp0);
4360       }
4361       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4362       __ aesenclast(xmm_result0, xmm_key_tmp0);
4363       __ cmpptr(len_reg, AESBlockSize);
4364       __ jcc(Assembler::less, L_processTail_insr[k]);
4365         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4366         __ pxor(xmm_result0, xmm_from0);
4367         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4368         __ addptr(pos, AESBlockSize);
4369         __ subptr(len_reg, AESBlockSize);
4370         __ jmp(L_singleBlockLoopTop[k]);
4371       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4372         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4373         __ testptr(len_reg, 8);
4374         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4375           __ subptr(pos,8);
4376           __ pinsrq(xmm_from0, Address(from, pos), 0);
4377         __ BIND(L_processTail_4_insr[k]);
4378         __ testptr(len_reg, 4);
4379         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4380           __ subptr(pos,4);
4381           __ pslldq(xmm_from0, 4);
4382           __ pinsrd(xmm_from0, Address(from, pos), 0);
4383         __ BIND(L_processTail_2_insr[k]);
4384         __ testptr(len_reg, 2);
4385         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4386           __ subptr(pos, 2);
4387           __ pslldq(xmm_from0, 2);
4388           __ pinsrw(xmm_from0, Address(from, pos), 0);
4389         __ BIND(L_processTail_1_insr[k]);
4390         __ testptr(len_reg, 1);
4391         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4392           __ subptr(pos, 1);
4393           __ pslldq(xmm_from0, 1);
4394           __ pinsrb(xmm_from0, Address(from, pos), 0);
4395         __ BIND(L_processTail_exit_insr[k]);
4396 
4397         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Perform pxor of the encrypted counter and plaintext Bytes.
4398         __ pxor(xmm_result0, xmm_from0);                             //    Also the encrypted counter is saved for next invocation.
4399 
4400         __ testptr(len_reg, 8);
4401         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4402           __ pextrq(Address(to, pos), xmm_result0, 0);
4403           __ psrldq(xmm_result0, 8);
4404           __ addptr(pos, 8);
4405         __ BIND(L_processTail_4_extr[k]);
4406         __ testptr(len_reg, 4);
4407         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4408           __ pextrd(Address(to, pos), xmm_result0, 0);
4409           __ psrldq(xmm_result0, 4);
4410           __ addptr(pos, 4);
4411         __ BIND(L_processTail_2_extr[k]);
4412         __ testptr(len_reg, 2);
4413         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4414           __ pextrw(Address(to, pos), xmm_result0, 0);
4415           __ psrldq(xmm_result0, 2);
4416           __ addptr(pos, 2);
4417         __ BIND(L_processTail_1_extr[k]);
4418         __ testptr(len_reg, 1);
4419         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4420           __ pextrb(Address(to, pos), xmm_result0, 0);
4421 
4422         __ BIND(L_processTail_exit_extr[k]);
4423         __ movl(Address(used_addr, 0), len_reg);
4424         __ jmp(L_exit);
4425 
4426     }
4427 
4428     __ BIND(L_exit);
4429     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4430     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4431     __ pop(rbx); // pop the saved RBX.
4432 #ifdef _WIN64
4433     __ movl(rax, len_mem);
4434     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4435     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4436     __ addptr(rsp, 2 * wordSize);
4437 #else
4438     __ pop(rax); // return 'len'
4439 #endif
4440     __ leave(); // required for proper stackwalking of RuntimeStub frame
4441     __ ret(0);
4442     return start;
4443   }
4444 
4445 void roundDec(XMMRegister xmm_reg) {
4446   __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4447   __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4448   __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4449   __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4450   __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4451   __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4452   __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4453   __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4454 }
4455 
4456 void roundDeclast(XMMRegister xmm_reg) {
4457   __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit);
4458   __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit);
4459   __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit);
4460   __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit);
4461   __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit);
4462   __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit);
4463   __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit);
4464   __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit);
4465 }
4466 
4467   void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) {
4468     __ movdqu(xmmdst, Address(key, offset));
4469     if (xmm_shuf_mask != NULL) {
4470       __ pshufb(xmmdst, xmm_shuf_mask);
4471     } else {
4472       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4473     }
4474     __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
4475 
4476   }
4477 
4478 address generate_cipherBlockChaining_decryptVectorAESCrypt() {
4479     assert(VM_Version::supports_vaes(), "need AES instructions and misaligned SSE support");
4480     __ align(CodeEntryAlignment);
4481     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4482     address start = __ pc();
4483 
4484     const Register from = c_rarg0;  // source array address
4485     const Register to = c_rarg1;  // destination array address
4486     const Register key = c_rarg2;  // key array address
4487     const Register rvec = c_rarg3;  // r byte array initialized from initvector array address
4488     // and left with the results of the last encryption block
4489 #ifndef _WIN64
4490     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
4491 #else
4492     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
4493     const Register len_reg = r11;      // pick the volatile windows register
4494 #endif
4495 
4496     Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop,
4497           Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit;
4498 
4499     __ enter();
4500 
4501 #ifdef _WIN64
4502   // on win64, fill len_reg from stack position
4503     __ movl(len_reg, len_mem);
4504 #else
4505     __ push(len_reg); // Save
4506 #endif
4507     __ push(rbx);
4508     __ vzeroupper();
4509 
4510     // Temporary variable declaration for swapping key bytes
4511     const XMMRegister xmm_key_shuf_mask = xmm1;
4512     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
4513 
4514     // Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 14-rounds
4515     const Register rounds = rbx;
4516     __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4517 
4518     const XMMRegister IV = xmm0;
4519     // Load IV and broadcast value to 512-bits
4520     __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit);
4521 
4522     // Temporary variables for storing round keys
4523     const XMMRegister RK0 = xmm30;
4524     const XMMRegister RK1 = xmm9;
4525     const XMMRegister RK2 = xmm18;
4526     const XMMRegister RK3 = xmm19;
4527     const XMMRegister RK4 = xmm20;
4528     const XMMRegister RK5 = xmm21;
4529     const XMMRegister RK6 = xmm22;
4530     const XMMRegister RK7 = xmm23;
4531     const XMMRegister RK8 = xmm24;
4532     const XMMRegister RK9 = xmm25;
4533     const XMMRegister RK10 = xmm26;
4534 
4535      // Load and shuffle key
4536     // the java expanded key ordering is rotated one position from what we want
4537     // so we start from 1*16 here and hit 0*16 last
4538     ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask);
4539     ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask);
4540     ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask);
4541     ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask);
4542     ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask);
4543     ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask);
4544     ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask);
4545     ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask);
4546     ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask);
4547     ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask);
4548     ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask);
4549 
4550     // Variables for storing source cipher text
4551     const XMMRegister S0 = xmm10;
4552     const XMMRegister S1 = xmm11;
4553     const XMMRegister S2 = xmm12;
4554     const XMMRegister S3 = xmm13;
4555     const XMMRegister S4 = xmm14;
4556     const XMMRegister S5 = xmm15;
4557     const XMMRegister S6 = xmm16;
4558     const XMMRegister S7 = xmm17;
4559 
4560     // Variables for storing decrypted text
4561     const XMMRegister B0 = xmm1;
4562     const XMMRegister B1 = xmm2;
4563     const XMMRegister B2 = xmm3;
4564     const XMMRegister B3 = xmm4;
4565     const XMMRegister B4 = xmm5;
4566     const XMMRegister B5 = xmm6;
4567     const XMMRegister B6 = xmm7;
4568     const XMMRegister B7 = xmm8;
4569 
4570     __ cmpl(rounds, 44);
4571     __ jcc(Assembler::greater, KEY_192);
4572     __ jmp(Loop);
4573 
4574     __ BIND(KEY_192);
4575     const XMMRegister RK11 = xmm27;
4576     const XMMRegister RK12 = xmm28;
4577     ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask);
4578     ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask);
4579 
4580     __ cmpl(rounds, 52);
4581     __ jcc(Assembler::greater, KEY_256);
4582     __ jmp(Loop);
4583 
4584     __ BIND(KEY_256);
4585     const XMMRegister RK13 = xmm29;
4586     const XMMRegister RK14 = xmm31;
4587     ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask);
4588     ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask);
4589 
4590     __ BIND(Loop);
4591     __ cmpl(len_reg, 512);
4592     __ jcc(Assembler::below, Lcbc_dec_rem);
4593     __ BIND(Loop1);
4594     __ subl(len_reg, 512);
4595     __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit);
4596     __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit);
4597     __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit);
4598     __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit);
4599     __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit);
4600     __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit);
4601     __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit);
4602     __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit);
4603     __ leaq(from, Address(from, 8 * 64));
4604 
4605     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4606     __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit);
4607     __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit);
4608     __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit);
4609     __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit);
4610     __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit);
4611     __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit);
4612     __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit);
4613 
4614     __ evalignq(IV, S0, IV, 0x06);
4615     __ evalignq(S0, S1, S0, 0x06);
4616     __ evalignq(S1, S2, S1, 0x06);
4617     __ evalignq(S2, S3, S2, 0x06);
4618     __ evalignq(S3, S4, S3, 0x06);
4619     __ evalignq(S4, S5, S4, 0x06);
4620     __ evalignq(S5, S6, S5, 0x06);
4621     __ evalignq(S6, S7, S6, 0x06);
4622 
4623     roundDec(RK2);
4624     roundDec(RK3);
4625     roundDec(RK4);
4626     roundDec(RK5);
4627     roundDec(RK6);
4628     roundDec(RK7);
4629     roundDec(RK8);
4630     roundDec(RK9);
4631     roundDec(RK10);
4632 
4633     __ cmpl(rounds, 44);
4634     __ jcc(Assembler::belowEqual, L_128);
4635     roundDec(RK11);
4636     roundDec(RK12);
4637 
4638     __ cmpl(rounds, 52);
4639     __ jcc(Assembler::belowEqual, L_192);
4640     roundDec(RK13);
4641     roundDec(RK14);
4642 
4643     __ BIND(L_256);
4644     roundDeclast(RK0);
4645     __ jmp(Loop2);
4646 
4647     __ BIND(L_128);
4648     roundDeclast(RK0);
4649     __ jmp(Loop2);
4650 
4651     __ BIND(L_192);
4652     roundDeclast(RK0);
4653 
4654     __ BIND(Loop2);
4655     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4656     __ evpxorq(B1, B1, S0, Assembler::AVX_512bit);
4657     __ evpxorq(B2, B2, S1, Assembler::AVX_512bit);
4658     __ evpxorq(B3, B3, S2, Assembler::AVX_512bit);
4659     __ evpxorq(B4, B4, S3, Assembler::AVX_512bit);
4660     __ evpxorq(B5, B5, S4, Assembler::AVX_512bit);
4661     __ evpxorq(B6, B6, S5, Assembler::AVX_512bit);
4662     __ evpxorq(B7, B7, S6, Assembler::AVX_512bit);
4663     __ evmovdquq(IV, S7, Assembler::AVX_512bit);
4664 
4665     __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit);
4666     __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit);
4667     __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit);
4668     __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit);
4669     __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit);
4670     __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit);
4671     __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit);
4672     __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit);
4673     __ leaq(to, Address(to, 8 * 64));
4674     __ jmp(Loop);
4675 
4676     __ BIND(Lcbc_dec_rem);
4677     __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit);
4678 
4679     __ BIND(Lcbc_dec_rem_loop);
4680     __ subl(len_reg, 16);
4681     __ jcc(Assembler::carrySet, Lcbc_dec_ret);
4682 
4683     __ movdqu(S0, Address(from, 0));
4684     __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit);
4685     __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit);
4686     __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit);
4687     __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit);
4688     __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit);
4689     __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit);
4690     __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit);
4691     __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit);
4692     __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit);
4693     __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit);
4694     __ cmpl(rounds, 44);
4695     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4696 
4697     __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit);
4698     __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit);
4699     __ cmpl(rounds, 52);
4700     __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last);
4701 
4702     __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit);
4703     __ vaesdec(B0, B0, RK14, Assembler::AVX_512bit);
4704 
4705     __ BIND(Lcbc_dec_rem_last);
4706     __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit);
4707 
4708     __ evpxorq(B0, B0, IV, Assembler::AVX_512bit);
4709     __ evmovdquq(IV, S0, Assembler::AVX_512bit);
4710     __ movdqu(Address(to, 0), B0);
4711     __ leaq(from, Address(from, 16));
4712     __ leaq(to, Address(to, 16));
4713     __ jmp(Lcbc_dec_rem_loop);
4714 
4715     __ BIND(Lcbc_dec_ret);
4716     __ movdqu(Address(rvec, 0), IV);
4717 
4718     // Zero out the round keys
4719     __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit);
4720     __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit);
4721     __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit);
4722     __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit);
4723     __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit);
4724     __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit);
4725     __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit);
4726     __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit);
4727     __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit);
4728     __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit);
4729     __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit);
4730     __ cmpl(rounds, 44);
4731     __ jcc(Assembler::belowEqual, Lcbc_exit);
4732     __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit);
4733     __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit);
4734     __ cmpl(rounds, 52);
4735     __ jcc(Assembler::belowEqual, Lcbc_exit);
4736     __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit);
4737     __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit);
4738 
4739     __ BIND(Lcbc_exit);
4740     __ pop(rbx);
4741 #ifdef _WIN64
4742     __ movl(rax, len_mem);
4743 #else
4744     __ pop(rax); // return length
4745 #endif
4746     __ leave(); // required for proper stackwalking of RuntimeStub frame
4747     __ ret(0);
4748     return start;
4749 }
4750 
4751 // Polynomial x^128+x^127+x^126+x^121+1
4752 address ghash_polynomial_addr() {
4753     __ align(CodeEntryAlignment);
4754     StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr");
4755     address start = __ pc();
4756     __ emit_data64(0x0000000000000001, relocInfo::none);
4757     __ emit_data64(0xc200000000000000, relocInfo::none);
4758     return start;
4759 }
4760 
4761 address ghash_shufflemask_addr() {
4762     __ align(CodeEntryAlignment);
4763     StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr");
4764     address start = __ pc();
4765     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4766     __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none);
4767     return start;
4768 }
4769 
4770 // Ghash single and multi block operations using AVX instructions
4771 address generate_avx_ghash_processBlocks() {
4772     __ align(CodeEntryAlignment);
4773 
4774     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4775     address start = __ pc();
4776 
4777     // arguments
4778     const Register state = c_rarg0;
4779     const Register htbl = c_rarg1;
4780     const Register data = c_rarg2;
4781     const Register blocks = c_rarg3;
4782     __ enter();
4783    // Save state before entering routine
4784     __ avx_ghash(state, htbl, data, blocks);
4785     __ leave(); // required for proper stackwalking of RuntimeStub frame
4786     __ ret(0);
4787     return start;
4788 }
4789 
4790   // byte swap x86 long
4791   address generate_ghash_long_swap_mask() {
4792     __ align(CodeEntryAlignment);
4793     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4794     address start = __ pc();
4795     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4796     __ emit_data64(0x0706050403020100, relocInfo::none );
4797   return start;
4798   }
4799 
4800   // byte swap x86 byte array
4801   address generate_ghash_byte_swap_mask() {
4802     __ align(CodeEntryAlignment);
4803     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4804     address start = __ pc();
4805     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4806     __ emit_data64(0x0001020304050607, relocInfo::none );
4807   return start;
4808   }
4809 
4810   /* Single and multi-block ghash operations */
4811   address generate_ghash_processBlocks() {
4812     __ align(CodeEntryAlignment);
4813     Label L_ghash_loop, L_exit;
4814     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4815     address start = __ pc();
4816 
4817     const Register state        = c_rarg0;
4818     const Register subkeyH      = c_rarg1;
4819     const Register data         = c_rarg2;
4820     const Register blocks       = c_rarg3;
4821 
4822     const XMMRegister xmm_temp0 = xmm0;
4823     const XMMRegister xmm_temp1 = xmm1;
4824     const XMMRegister xmm_temp2 = xmm2;
4825     const XMMRegister xmm_temp3 = xmm3;
4826     const XMMRegister xmm_temp4 = xmm4;
4827     const XMMRegister xmm_temp5 = xmm5;
4828     const XMMRegister xmm_temp6 = xmm6;
4829     const XMMRegister xmm_temp7 = xmm7;
4830     const XMMRegister xmm_temp8 = xmm8;
4831     const XMMRegister xmm_temp9 = xmm9;
4832     const XMMRegister xmm_temp10 = xmm10;
4833 
4834     __ enter();
4835 
4836     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4837 
4838     __ movdqu(xmm_temp0, Address(state, 0));
4839     __ pshufb(xmm_temp0, xmm_temp10);
4840 
4841 
4842     __ BIND(L_ghash_loop);
4843     __ movdqu(xmm_temp2, Address(data, 0));
4844     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4845 
4846     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4847     __ pshufb(xmm_temp1, xmm_temp10);
4848 
4849     __ pxor(xmm_temp0, xmm_temp2);
4850 
4851     //
4852     // Multiply with the hash key
4853     //
4854     __ movdqu(xmm_temp3, xmm_temp0);
4855     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
4856     __ movdqu(xmm_temp4, xmm_temp0);
4857     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
4858 
4859     __ movdqu(xmm_temp5, xmm_temp0);
4860     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
4861     __ movdqu(xmm_temp6, xmm_temp0);
4862     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
4863 
4864     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
4865 
4866     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
4867     __ psrldq(xmm_temp4, 8);    // shift by xmm4 64 bits to the right
4868     __ pslldq(xmm_temp5, 8);    // shift by xmm5 64 bits to the left
4869     __ pxor(xmm_temp3, xmm_temp5);
4870     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
4871                                         // of the carry-less multiplication of
4872                                         // xmm0 by xmm1.
4873 
4874     // We shift the result of the multiplication by one bit position
4875     // to the left to cope for the fact that the bits are reversed.
4876     __ movdqu(xmm_temp7, xmm_temp3);
4877     __ movdqu(xmm_temp8, xmm_temp6);
4878     __ pslld(xmm_temp3, 1);
4879     __ pslld(xmm_temp6, 1);
4880     __ psrld(xmm_temp7, 31);
4881     __ psrld(xmm_temp8, 31);
4882     __ movdqu(xmm_temp9, xmm_temp7);
4883     __ pslldq(xmm_temp8, 4);
4884     __ pslldq(xmm_temp7, 4);
4885     __ psrldq(xmm_temp9, 12);
4886     __ por(xmm_temp3, xmm_temp7);
4887     __ por(xmm_temp6, xmm_temp8);
4888     __ por(xmm_temp6, xmm_temp9);
4889 
4890     //
4891     // First phase of the reduction
4892     //
4893     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4894     // independently.
4895     __ movdqu(xmm_temp7, xmm_temp3);
4896     __ movdqu(xmm_temp8, xmm_temp3);
4897     __ movdqu(xmm_temp9, xmm_temp3);
4898     __ pslld(xmm_temp7, 31);    // packed right shift shifting << 31
4899     __ pslld(xmm_temp8, 30);    // packed right shift shifting << 30
4900     __ pslld(xmm_temp9, 25);    // packed right shift shifting << 25
4901     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
4902     __ pxor(xmm_temp7, xmm_temp9);
4903     __ movdqu(xmm_temp8, xmm_temp7);
4904     __ pslldq(xmm_temp7, 12);
4905     __ psrldq(xmm_temp8, 4);
4906     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
4907 
4908     //
4909     // Second phase of the reduction
4910     //
4911     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4912     // shift operations.
4913     __ movdqu(xmm_temp2, xmm_temp3);
4914     __ movdqu(xmm_temp4, xmm_temp3);
4915     __ movdqu(xmm_temp5, xmm_temp3);
4916     __ psrld(xmm_temp2, 1);     // packed left shifting >> 1
4917     __ psrld(xmm_temp4, 2);     // packed left shifting >> 2
4918     __ psrld(xmm_temp5, 7);     // packed left shifting >> 7
4919     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4920     __ pxor(xmm_temp2, xmm_temp5);
4921     __ pxor(xmm_temp2, xmm_temp8);
4922     __ pxor(xmm_temp3, xmm_temp2);
4923     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
4924 
4925     __ decrement(blocks);
4926     __ jcc(Assembler::zero, L_exit);
4927     __ movdqu(xmm_temp0, xmm_temp6);
4928     __ addptr(data, 16);
4929     __ jmp(L_ghash_loop);
4930 
4931     __ BIND(L_exit);
4932     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4933     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4934     __ leave();
4935     __ ret(0);
4936     return start;
4937   }
4938 
4939   //base64 character set
4940   address base64_charset_addr() {
4941     __ align(CodeEntryAlignment);
4942     StubCodeMark mark(this, "StubRoutines", "base64_charset");
4943     address start = __ pc();
4944     __ emit_data64(0x0000004200000041, relocInfo::none);
4945     __ emit_data64(0x0000004400000043, relocInfo::none);
4946     __ emit_data64(0x0000004600000045, relocInfo::none);
4947     __ emit_data64(0x0000004800000047, relocInfo::none);
4948     __ emit_data64(0x0000004a00000049, relocInfo::none);
4949     __ emit_data64(0x0000004c0000004b, relocInfo::none);
4950     __ emit_data64(0x0000004e0000004d, relocInfo::none);
4951     __ emit_data64(0x000000500000004f, relocInfo::none);
4952     __ emit_data64(0x0000005200000051, relocInfo::none);
4953     __ emit_data64(0x0000005400000053, relocInfo::none);
4954     __ emit_data64(0x0000005600000055, relocInfo::none);
4955     __ emit_data64(0x0000005800000057, relocInfo::none);
4956     __ emit_data64(0x0000005a00000059, relocInfo::none);
4957     __ emit_data64(0x0000006200000061, relocInfo::none);
4958     __ emit_data64(0x0000006400000063, relocInfo::none);
4959     __ emit_data64(0x0000006600000065, relocInfo::none);
4960     __ emit_data64(0x0000006800000067, relocInfo::none);
4961     __ emit_data64(0x0000006a00000069, relocInfo::none);
4962     __ emit_data64(0x0000006c0000006b, relocInfo::none);
4963     __ emit_data64(0x0000006e0000006d, relocInfo::none);
4964     __ emit_data64(0x000000700000006f, relocInfo::none);
4965     __ emit_data64(0x0000007200000071, relocInfo::none);
4966     __ emit_data64(0x0000007400000073, relocInfo::none);
4967     __ emit_data64(0x0000007600000075, relocInfo::none);
4968     __ emit_data64(0x0000007800000077, relocInfo::none);
4969     __ emit_data64(0x0000007a00000079, relocInfo::none);
4970     __ emit_data64(0x0000003100000030, relocInfo::none);
4971     __ emit_data64(0x0000003300000032, relocInfo::none);
4972     __ emit_data64(0x0000003500000034, relocInfo::none);
4973     __ emit_data64(0x0000003700000036, relocInfo::none);
4974     __ emit_data64(0x0000003900000038, relocInfo::none);
4975     __ emit_data64(0x0000002f0000002b, relocInfo::none);
4976     return start;
4977   }
4978 
4979   //base64 url character set
4980   address base64url_charset_addr() {
4981     __ align(CodeEntryAlignment);
4982     StubCodeMark mark(this, "StubRoutines", "base64url_charset");
4983     address start = __ pc();
4984     __ emit_data64(0x0000004200000041, relocInfo::none);
4985     __ emit_data64(0x0000004400000043, relocInfo::none);
4986     __ emit_data64(0x0000004600000045, relocInfo::none);
4987     __ emit_data64(0x0000004800000047, relocInfo::none);
4988     __ emit_data64(0x0000004a00000049, relocInfo::none);
4989     __ emit_data64(0x0000004c0000004b, relocInfo::none);
4990     __ emit_data64(0x0000004e0000004d, relocInfo::none);
4991     __ emit_data64(0x000000500000004f, relocInfo::none);
4992     __ emit_data64(0x0000005200000051, relocInfo::none);
4993     __ emit_data64(0x0000005400000053, relocInfo::none);
4994     __ emit_data64(0x0000005600000055, relocInfo::none);
4995     __ emit_data64(0x0000005800000057, relocInfo::none);
4996     __ emit_data64(0x0000005a00000059, relocInfo::none);
4997     __ emit_data64(0x0000006200000061, relocInfo::none);
4998     __ emit_data64(0x0000006400000063, relocInfo::none);
4999     __ emit_data64(0x0000006600000065, relocInfo::none);
5000     __ emit_data64(0x0000006800000067, relocInfo::none);
5001     __ emit_data64(0x0000006a00000069, relocInfo::none);
5002     __ emit_data64(0x0000006c0000006b, relocInfo::none);
5003     __ emit_data64(0x0000006e0000006d, relocInfo::none);
5004     __ emit_data64(0x000000700000006f, relocInfo::none);
5005     __ emit_data64(0x0000007200000071, relocInfo::none);
5006     __ emit_data64(0x0000007400000073, relocInfo::none);
5007     __ emit_data64(0x0000007600000075, relocInfo::none);
5008     __ emit_data64(0x0000007800000077, relocInfo::none);
5009     __ emit_data64(0x0000007a00000079, relocInfo::none);
5010     __ emit_data64(0x0000003100000030, relocInfo::none);
5011     __ emit_data64(0x0000003300000032, relocInfo::none);
5012     __ emit_data64(0x0000003500000034, relocInfo::none);
5013     __ emit_data64(0x0000003700000036, relocInfo::none);
5014     __ emit_data64(0x0000003900000038, relocInfo::none);
5015     __ emit_data64(0x0000005f0000002d, relocInfo::none);
5016 
5017     return start;
5018   }
5019 
5020   address base64_bswap_mask_addr() {
5021     __ align(CodeEntryAlignment);
5022     StubCodeMark mark(this, "StubRoutines", "bswap_mask_base64");
5023     address start = __ pc();
5024     __ emit_data64(0x0504038002010080, relocInfo::none);
5025     __ emit_data64(0x0b0a098008070680, relocInfo::none);
5026     __ emit_data64(0x0908078006050480, relocInfo::none);
5027     __ emit_data64(0x0f0e0d800c0b0a80, relocInfo::none);
5028     __ emit_data64(0x0605048003020180, relocInfo::none);
5029     __ emit_data64(0x0c0b0a8009080780, relocInfo::none);
5030     __ emit_data64(0x0504038002010080, relocInfo::none);
5031     __ emit_data64(0x0b0a098008070680, relocInfo::none);
5032 
5033     return start;
5034   }
5035 
5036   address base64_right_shift_mask_addr() {
5037     __ align(CodeEntryAlignment);
5038     StubCodeMark mark(this, "StubRoutines", "right_shift_mask");
5039     address start = __ pc();
5040     __ emit_data64(0x0006000400020000, relocInfo::none);
5041     __ emit_data64(0x0006000400020000, relocInfo::none);
5042     __ emit_data64(0x0006000400020000, relocInfo::none);
5043     __ emit_data64(0x0006000400020000, relocInfo::none);
5044     __ emit_data64(0x0006000400020000, relocInfo::none);
5045     __ emit_data64(0x0006000400020000, relocInfo::none);
5046     __ emit_data64(0x0006000400020000, relocInfo::none);
5047     __ emit_data64(0x0006000400020000, relocInfo::none);
5048 
5049     return start;
5050   }
5051 
5052   address base64_left_shift_mask_addr() {
5053     __ align(CodeEntryAlignment);
5054     StubCodeMark mark(this, "StubRoutines", "left_shift_mask");
5055     address start = __ pc();
5056     __ emit_data64(0x0000000200040000, relocInfo::none);
5057     __ emit_data64(0x0000000200040000, relocInfo::none);
5058     __ emit_data64(0x0000000200040000, relocInfo::none);
5059     __ emit_data64(0x0000000200040000, relocInfo::none);
5060     __ emit_data64(0x0000000200040000, relocInfo::none);
5061     __ emit_data64(0x0000000200040000, relocInfo::none);
5062     __ emit_data64(0x0000000200040000, relocInfo::none);
5063     __ emit_data64(0x0000000200040000, relocInfo::none);
5064 
5065     return start;
5066   }
5067 
5068   address base64_and_mask_addr() {
5069     __ align(CodeEntryAlignment);
5070     StubCodeMark mark(this, "StubRoutines", "and_mask");
5071     address start = __ pc();
5072     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5073     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5074     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5075     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5076     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5077     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5078     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5079     __ emit_data64(0x3f003f003f000000, relocInfo::none);
5080     return start;
5081   }
5082 
5083   address base64_gather_mask_addr() {
5084     __ align(CodeEntryAlignment);
5085     StubCodeMark mark(this, "StubRoutines", "gather_mask");
5086     address start = __ pc();
5087     __ emit_data64(0xffffffffffffffff, relocInfo::none);
5088     return start;
5089   }
5090 
5091 // Code for generating Base64 encoding.
5092 // Intrinsic function prototype in Base64.java:
5093 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) {
5094   address generate_base64_encodeBlock() {
5095     __ align(CodeEntryAlignment);
5096     StubCodeMark mark(this, "StubRoutines", "implEncode");
5097     address start = __ pc();
5098     __ enter();
5099 
5100     // Save callee-saved registers before using them
5101     __ push(r12);
5102     __ push(r13);
5103     __ push(r14);
5104     __ push(r15);
5105 
5106     // arguments
5107     const Register source = c_rarg0; // Source Array
5108     const Register start_offset = c_rarg1; // start offset
5109     const Register end_offset = c_rarg2; // end offset
5110     const Register dest = c_rarg3; // destination array
5111 
5112 #ifndef _WIN64
5113     const Register dp = c_rarg4;  // Position for writing to dest array
5114     const Register isURL = c_rarg5;// Base64 or URL character set
5115 #else
5116     const Address  dp_mem(rbp, 6 * wordSize);  // length is on stack on Win64
5117     const Address isURL_mem(rbp, 7 * wordSize);
5118     const Register isURL = r10;      // pick the volatile windows register
5119     const Register dp = r12;
5120     __ movl(dp, dp_mem);
5121     __ movl(isURL, isURL_mem);
5122 #endif
5123 
5124     const Register length = r14;
5125     Label L_process80, L_process32, L_process3, L_exit, L_processdata;
5126 
5127     // calculate length from offsets
5128     __ movl(length, end_offset);
5129     __ subl(length, start_offset);
5130     __ cmpl(length, 0);
5131     __ jcc(Assembler::lessEqual, L_exit);
5132 
5133     __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr()));
5134     // check if base64 charset(isURL=0) or base64 url charset(isURL=1) needs to be loaded
5135     __ cmpl(isURL, 0);
5136     __ jcc(Assembler::equal, L_processdata);
5137     __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr()));
5138 
5139     // load masks required for encoding data
5140     __ BIND(L_processdata);
5141     __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr()));
5142     // Set 64 bits of K register.
5143     __ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit);
5144     __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13);
5145     __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13);
5146     __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13);
5147     __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13);
5148 
5149     // Vector Base64 implementation, producing 96 bytes of encoded data
5150     __ BIND(L_process80);
5151     __ cmpl(length, 80);
5152     __ jcc(Assembler::below, L_process32);
5153     __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit);
5154     __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit);
5155     __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit);
5156 
5157     //permute the input data in such a manner that we have continuity of the source
5158     __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit);
5159     __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit);
5160     __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit);
5161 
5162     //shuffle input and group 3 bytes of data and to it add 0 as the 4th byte.
5163     //we can deal with 12 bytes at a time in a 128 bit register
5164     __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit);
5165     __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit);
5166     __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit);
5167 
5168     //convert byte to word. Each 128 bit register will have 6 bytes for processing
5169     __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit);
5170     __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit);
5171     __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit);
5172 
5173     // Extract bits in the following pattern 6, 4+2, 2+4, 6 to convert 3, 8 bit numbers to 4, 6 bit numbers
5174     __ evpsrlvw(xmm0, xmm3, xmm13,  Assembler::AVX_512bit);
5175     __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit);
5176     __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit);
5177 
5178     __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit);
5179     __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit);
5180     __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit);
5181 
5182     __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit);
5183     __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
5184     __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
5185 
5186     __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5187     __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit);
5188     __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit);
5189 
5190     __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
5191     __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit);
5192     __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit);
5193 
5194     // Get the final 4*6 bits base64 encoding
5195     __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit);
5196     __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit);
5197     __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit);
5198 
5199     // Shift
5200     __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5201     __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit);
5202     __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit);
5203 
5204     // look up 6 bits in the base64 character set to fetch the encoding
5205     // we are converting word to dword as gather instructions need dword indices for looking up encoding
5206     __ vextracti64x4(xmm6, xmm3, 0);
5207     __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit);
5208     __ vextracti64x4(xmm6, xmm3, 1);
5209     __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit);
5210 
5211     __ vextracti64x4(xmm6, xmm4, 0);
5212     __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit);
5213     __ vextracti64x4(xmm6, xmm4, 1);
5214     __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit);
5215 
5216     __ vextracti64x4(xmm4, xmm5, 0);
5217     __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit);
5218 
5219     __ vextracti64x4(xmm4, xmm5, 1);
5220     __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit);
5221 
5222     __ kmovql(k2, k3);
5223     __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit);
5224     __ kmovql(k2, k3);
5225     __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit);
5226     __ kmovql(k2, k3);
5227     __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit);
5228     __ kmovql(k2, k3);
5229     __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit);
5230     __ kmovql(k2, k3);
5231     __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
5232     __ kmovql(k2, k3);
5233     __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit);
5234 
5235     //Down convert dword to byte. Final output is 16*6 = 96 bytes long
5236     __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit);
5237     __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit);
5238     __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit);
5239     __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit);
5240     __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit);
5241     __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit);
5242 
5243     __ addq(dest, 96);
5244     __ addq(source, 72);
5245     __ subq(length, 72);
5246     __ jmp(L_process80);
5247 
5248     // Vector Base64 implementation generating 32 bytes of encoded data
5249     __ BIND(L_process32);
5250     __ cmpl(length, 32);
5251     __ jcc(Assembler::below, L_process3);
5252     __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit);
5253     __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit);
5254     __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit);
5255     __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit);
5256     __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit);
5257     __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit);
5258 
5259     __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit);
5260     __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit);
5261     __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit);
5262     __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit);
5263     __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit);
5264     __ vextracti64x4(xmm9, xmm1, 0);
5265     __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit);
5266     __ vextracti64x4(xmm9, xmm1, 1);
5267     __ vpmovzxwd(xmm5, xmm9,  Assembler::AVX_512bit);
5268     __ kmovql(k2, k3);
5269     __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit);
5270     __ kmovql(k2, k3);
5271     __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit);
5272     __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit);
5273     __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit);
5274     __ subq(length, 24);
5275     __ addq(dest, 32);
5276     __ addq(source, 24);
5277     __ jmp(L_process32);
5278 
5279     // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data
5280     /* This code corresponds to the scalar version of the following snippet in Base64.java
5281     ** int bits = (src[sp0++] & 0xff) << 16 |(src[sp0++] & 0xff) << 8 |(src[sp0++] & 0xff);
5282     ** dst[dp0++] = (byte)base64[(bits >> > 18) & 0x3f];
5283     ** dst[dp0++] = (byte)base64[(bits >> > 12) & 0x3f];
5284     ** dst[dp0++] = (byte)base64[(bits >> > 6) & 0x3f];
5285     ** dst[dp0++] = (byte)base64[bits & 0x3f];*/
5286     __ BIND(L_process3);
5287     __ cmpl(length, 3);
5288     __ jcc(Assembler::below, L_exit);
5289     // Read 1 byte at a time
5290     __ movzbl(rax, Address(source, start_offset));
5291     __ shll(rax, 0x10);
5292     __ movl(r15, rax);
5293     __ movzbl(rax, Address(source, start_offset, Address::times_1, 1));
5294     __ shll(rax, 0x8);
5295     __ movzwl(rax, rax);
5296     __ orl(r15, rax);
5297     __ movzbl(rax, Address(source, start_offset, Address::times_1, 2));
5298     __ orl(rax, r15);
5299     // Save 3 bytes read in r15
5300     __ movl(r15, rax);
5301     __ shrl(rax, 0x12);
5302     __ andl(rax, 0x3f);
5303     // rax contains the index, r11 contains base64 lookup table
5304     __ movb(rax, Address(r11, rax, Address::times_4));
5305     // Write the encoded byte to destination
5306     __ movb(Address(dest, dp, Address::times_1, 0), rax);
5307     __ movl(rax, r15);
5308     __ shrl(rax, 0xc);
5309     __ andl(rax, 0x3f);
5310     __ movb(rax, Address(r11, rax, Address::times_4));
5311     __ movb(Address(dest, dp, Address::times_1, 1), rax);
5312     __ movl(rax, r15);
5313     __ shrl(rax, 0x6);
5314     __ andl(rax, 0x3f);
5315     __ movb(rax, Address(r11, rax, Address::times_4));
5316     __ movb(Address(dest, dp, Address::times_1, 2), rax);
5317     __ movl(rax, r15);
5318     __ andl(rax, 0x3f);
5319     __ movb(rax, Address(r11, rax, Address::times_4));
5320     __ movb(Address(dest, dp, Address::times_1, 3), rax);
5321     __ subl(length, 3);
5322     __ addq(dest, 4);
5323     __ addq(source, 3);
5324     __ jmp(L_process3);
5325     __ BIND(L_exit);
5326     __ pop(r15);
5327     __ pop(r14);
5328     __ pop(r13);
5329     __ pop(r12);
5330     __ leave();
5331     __ ret(0);
5332     return start;
5333   }
5334 
5335   /**
5336    *  Arguments:
5337    *
5338    * Inputs:
5339    *   c_rarg0   - int crc
5340    *   c_rarg1   - byte* buf
5341    *   c_rarg2   - int length
5342    *
5343    * Ouput:
5344    *       rax   - int crc result
5345    */
5346   address generate_updateBytesCRC32() {
5347     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
5348 
5349     __ align(CodeEntryAlignment);
5350     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
5351 
5352     address start = __ pc();
5353     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5354     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5355     // rscratch1: r10
5356     const Register crc   = c_rarg0;  // crc
5357     const Register buf   = c_rarg1;  // source java byte array address
5358     const Register len   = c_rarg2;  // length
5359     const Register table = c_rarg3;  // crc_table address (reuse register)
5360     const Register tmp   = r11;
5361     assert_different_registers(crc, buf, len, table, tmp, rax);
5362 
5363     BLOCK_COMMENT("Entry:");
5364     __ enter(); // required for proper stackwalking of RuntimeStub frame
5365 
5366     __ kernel_crc32(crc, buf, len, table, tmp);
5367 
5368     __ movl(rax, crc);
5369     __ vzeroupper();
5370     __ leave(); // required for proper stackwalking of RuntimeStub frame
5371     __ ret(0);
5372 
5373     return start;
5374   }
5375 
5376   /**
5377   *  Arguments:
5378   *
5379   * Inputs:
5380   *   c_rarg0   - int crc
5381   *   c_rarg1   - byte* buf
5382   *   c_rarg2   - long length
5383   *   c_rarg3   - table_start - optional (present only when doing a library_call,
5384   *              not used by x86 algorithm)
5385   *
5386   * Ouput:
5387   *       rax   - int crc result
5388   */
5389   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
5390       assert(UseCRC32CIntrinsics, "need SSE4_2");
5391       __ align(CodeEntryAlignment);
5392       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
5393       address start = __ pc();
5394       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
5395       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
5396       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
5397       const Register crc = c_rarg0;  // crc
5398       const Register buf = c_rarg1;  // source java byte array address
5399       const Register len = c_rarg2;  // length
5400       const Register a = rax;
5401       const Register j = r9;
5402       const Register k = r10;
5403       const Register l = r11;
5404 #ifdef _WIN64
5405       const Register y = rdi;
5406       const Register z = rsi;
5407 #else
5408       const Register y = rcx;
5409       const Register z = r8;
5410 #endif
5411       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
5412 
5413       BLOCK_COMMENT("Entry:");
5414       __ enter(); // required for proper stackwalking of RuntimeStub frame
5415 #ifdef _WIN64
5416       __ push(y);
5417       __ push(z);
5418 #endif
5419       __ crc32c_ipl_alg2_alt2(crc, buf, len,
5420                               a, j, k,
5421                               l, y, z,
5422                               c_farg0, c_farg1, c_farg2,
5423                               is_pclmulqdq_supported);
5424       __ movl(rax, crc);
5425 #ifdef _WIN64
5426       __ pop(z);
5427       __ pop(y);
5428 #endif
5429       __ vzeroupper();
5430       __ leave(); // required for proper stackwalking of RuntimeStub frame
5431       __ ret(0);
5432 
5433       return start;
5434   }
5435 
5436   /**
5437    *  Arguments:
5438    *
5439    *  Input:
5440    *    c_rarg0   - x address
5441    *    c_rarg1   - x length
5442    *    c_rarg2   - y address
5443    *    c_rarg3   - y length
5444    * not Win64
5445    *    c_rarg4   - z address
5446    *    c_rarg5   - z length
5447    * Win64
5448    *    rsp+40    - z address
5449    *    rsp+48    - z length
5450    */
5451   address generate_multiplyToLen() {
5452     __ align(CodeEntryAlignment);
5453     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
5454 
5455     address start = __ pc();
5456     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5457     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5458     const Register x     = rdi;
5459     const Register xlen  = rax;
5460     const Register y     = rsi;
5461     const Register ylen  = rcx;
5462     const Register z     = r8;
5463     const Register zlen  = r11;
5464 
5465     // Next registers will be saved on stack in multiply_to_len().
5466     const Register tmp1  = r12;
5467     const Register tmp2  = r13;
5468     const Register tmp3  = r14;
5469     const Register tmp4  = r15;
5470     const Register tmp5  = rbx;
5471 
5472     BLOCK_COMMENT("Entry:");
5473     __ enter(); // required for proper stackwalking of RuntimeStub frame
5474 
5475 #ifndef _WIN64
5476     __ movptr(zlen, r9); // Save r9 in r11 - zlen
5477 #endif
5478     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
5479                        // ylen => rcx, z => r8, zlen => r11
5480                        // r9 and r10 may be used to save non-volatile registers
5481 #ifdef _WIN64
5482     // last 2 arguments (#4, #5) are on stack on Win64
5483     __ movptr(z, Address(rsp, 6 * wordSize));
5484     __ movptr(zlen, Address(rsp, 7 * wordSize));
5485 #endif
5486 
5487     __ movptr(xlen, rsi);
5488     __ movptr(y,    rdx);
5489     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
5490 
5491     restore_arg_regs();
5492 
5493     __ leave(); // required for proper stackwalking of RuntimeStub frame
5494     __ ret(0);
5495 
5496     return start;
5497   }
5498 
5499   /**
5500   *  Arguments:
5501   *
5502   *  Input:
5503   *    c_rarg0   - obja     address
5504   *    c_rarg1   - objb     address
5505   *    c_rarg3   - length   length
5506   *    c_rarg4   - scale    log2_array_indxscale
5507   *
5508   *  Output:
5509   *        rax   - int >= mismatched index, < 0 bitwise complement of tail
5510   */
5511   address generate_vectorizedMismatch() {
5512     __ align(CodeEntryAlignment);
5513     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
5514     address start = __ pc();
5515 
5516     BLOCK_COMMENT("Entry:");
5517     __ enter();
5518 
5519 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5520     const Register scale = c_rarg0;  //rcx, will exchange with r9
5521     const Register objb = c_rarg1;   //rdx
5522     const Register length = c_rarg2; //r8
5523     const Register obja = c_rarg3;   //r9
5524     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
5525 
5526     const Register tmp1 = r10;
5527     const Register tmp2 = r11;
5528 #endif
5529 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5530     const Register obja = c_rarg0;   //U:rdi
5531     const Register objb = c_rarg1;   //U:rsi
5532     const Register length = c_rarg2; //U:rdx
5533     const Register scale = c_rarg3;  //U:rcx
5534     const Register tmp1 = r8;
5535     const Register tmp2 = r9;
5536 #endif
5537     const Register result = rax; //return value
5538     const XMMRegister vec0 = xmm0;
5539     const XMMRegister vec1 = xmm1;
5540     const XMMRegister vec2 = xmm2;
5541 
5542     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
5543 
5544     __ vzeroupper();
5545     __ leave();
5546     __ ret(0);
5547 
5548     return start;
5549   }
5550 
5551 /**
5552    *  Arguments:
5553    *
5554   //  Input:
5555   //    c_rarg0   - x address
5556   //    c_rarg1   - x length
5557   //    c_rarg2   - z address
5558   //    c_rarg3   - z lenth
5559    *
5560    */
5561   address generate_squareToLen() {
5562 
5563     __ align(CodeEntryAlignment);
5564     StubCodeMark mark(this, "StubRoutines", "squareToLen");
5565 
5566     address start = __ pc();
5567     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5568     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
5569     const Register x      = rdi;
5570     const Register len    = rsi;
5571     const Register z      = r8;
5572     const Register zlen   = rcx;
5573 
5574    const Register tmp1      = r12;
5575    const Register tmp2      = r13;
5576    const Register tmp3      = r14;
5577    const Register tmp4      = r15;
5578    const Register tmp5      = rbx;
5579 
5580     BLOCK_COMMENT("Entry:");
5581     __ enter(); // required for proper stackwalking of RuntimeStub frame
5582 
5583     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
5584                        // zlen => rcx
5585                        // r9 and r10 may be used to save non-volatile registers
5586     __ movptr(r8, rdx);
5587     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5588 
5589     restore_arg_regs();
5590 
5591     __ leave(); // required for proper stackwalking of RuntimeStub frame
5592     __ ret(0);
5593 
5594     return start;
5595   }
5596 
5597   address generate_method_entry_barrier() {
5598     __ align(CodeEntryAlignment);
5599     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
5600 
5601     Label deoptimize_label;
5602 
5603     address start = __ pc();
5604 
5605     __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
5606 
5607     BLOCK_COMMENT("Entry:");
5608     __ enter(); // save rbp
5609 
5610     // save c_rarg0, because we want to use that value.
5611     // We could do without it but then we depend on the number of slots used by pusha
5612     __ push(c_rarg0);
5613 
5614     __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
5615 
5616     __ pusha();
5617 
5618     // The method may have floats as arguments, and we must spill them before calling
5619     // the VM runtime.
5620     assert(Argument::n_float_register_parameters_j == 8, "Assumption");
5621     const int xmm_size = wordSize * 2;
5622     const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
5623     __ subptr(rsp, xmm_spill_size);
5624     __ movdqu(Address(rsp, xmm_size * 7), xmm7);
5625     __ movdqu(Address(rsp, xmm_size * 6), xmm6);
5626     __ movdqu(Address(rsp, xmm_size * 5), xmm5);
5627     __ movdqu(Address(rsp, xmm_size * 4), xmm4);
5628     __ movdqu(Address(rsp, xmm_size * 3), xmm3);
5629     __ movdqu(Address(rsp, xmm_size * 2), xmm2);
5630     __ movdqu(Address(rsp, xmm_size * 1), xmm1);
5631     __ movdqu(Address(rsp, xmm_size * 0), xmm0);
5632 
5633     __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1);
5634 
5635     __ movdqu(xmm0, Address(rsp, xmm_size * 0));
5636     __ movdqu(xmm1, Address(rsp, xmm_size * 1));
5637     __ movdqu(xmm2, Address(rsp, xmm_size * 2));
5638     __ movdqu(xmm3, Address(rsp, xmm_size * 3));
5639     __ movdqu(xmm4, Address(rsp, xmm_size * 4));
5640     __ movdqu(xmm5, Address(rsp, xmm_size * 5));
5641     __ movdqu(xmm6, Address(rsp, xmm_size * 6));
5642     __ movdqu(xmm7, Address(rsp, xmm_size * 7));
5643     __ addptr(rsp, xmm_spill_size);
5644 
5645     __ cmpl(rax, 1); // 1 means deoptimize
5646     __ jcc(Assembler::equal, deoptimize_label);
5647 
5648     __ popa();
5649     __ pop(c_rarg0);
5650 
5651     __ leave();
5652 
5653     __ addptr(rsp, 1 * wordSize); // cookie
5654     __ ret(0);
5655 
5656 
5657     __ BIND(deoptimize_label);
5658 
5659     __ popa();
5660     __ pop(c_rarg0);
5661 
5662     __ leave();
5663 
5664     // this can be taken out, but is good for verification purposes. getting a SIGSEGV
5665     // here while still having a correct stack is valuable
5666     __ testptr(rsp, Address(rsp, 0));
5667 
5668     __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
5669     __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
5670 
5671     return start;
5672   }
5673 
5674    /**
5675    *  Arguments:
5676    *
5677    *  Input:
5678    *    c_rarg0   - out address
5679    *    c_rarg1   - in address
5680    *    c_rarg2   - offset
5681    *    c_rarg3   - len
5682    * not Win64
5683    *    c_rarg4   - k
5684    * Win64
5685    *    rsp+40    - k
5686    */
5687   address generate_mulAdd() {
5688     __ align(CodeEntryAlignment);
5689     StubCodeMark mark(this, "StubRoutines", "mulAdd");
5690 
5691     address start = __ pc();
5692     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
5693     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
5694     const Register out     = rdi;
5695     const Register in      = rsi;
5696     const Register offset  = r11;
5697     const Register len     = rcx;
5698     const Register k       = r8;
5699 
5700     // Next registers will be saved on stack in mul_add().
5701     const Register tmp1  = r12;
5702     const Register tmp2  = r13;
5703     const Register tmp3  = r14;
5704     const Register tmp4  = r15;
5705     const Register tmp5  = rbx;
5706 
5707     BLOCK_COMMENT("Entry:");
5708     __ enter(); // required for proper stackwalking of RuntimeStub frame
5709 
5710     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
5711                        // len => rcx, k => r8
5712                        // r9 and r10 may be used to save non-volatile registers
5713 #ifdef _WIN64
5714     // last argument is on stack on Win64
5715     __ movl(k, Address(rsp, 6 * wordSize));
5716 #endif
5717     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
5718     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
5719 
5720     restore_arg_regs();
5721 
5722     __ leave(); // required for proper stackwalking of RuntimeStub frame
5723     __ ret(0);
5724 
5725     return start;
5726   }
5727 
5728   address generate_libmExp() {
5729     StubCodeMark mark(this, "StubRoutines", "libmExp");
5730 
5731     address start = __ pc();
5732 
5733     const XMMRegister x0  = xmm0;
5734     const XMMRegister x1  = xmm1;
5735     const XMMRegister x2  = xmm2;
5736     const XMMRegister x3  = xmm3;
5737 
5738     const XMMRegister x4  = xmm4;
5739     const XMMRegister x5  = xmm5;
5740     const XMMRegister x6  = xmm6;
5741     const XMMRegister x7  = xmm7;
5742 
5743     const Register tmp   = r11;
5744 
5745     BLOCK_COMMENT("Entry:");
5746     __ enter(); // required for proper stackwalking of RuntimeStub frame
5747 
5748     __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5749 
5750     __ leave(); // required for proper stackwalking of RuntimeStub frame
5751     __ ret(0);
5752 
5753     return start;
5754 
5755   }
5756 
5757   address generate_libmLog() {
5758     StubCodeMark mark(this, "StubRoutines", "libmLog");
5759 
5760     address start = __ pc();
5761 
5762     const XMMRegister x0 = xmm0;
5763     const XMMRegister x1 = xmm1;
5764     const XMMRegister x2 = xmm2;
5765     const XMMRegister x3 = xmm3;
5766 
5767     const XMMRegister x4 = xmm4;
5768     const XMMRegister x5 = xmm5;
5769     const XMMRegister x6 = xmm6;
5770     const XMMRegister x7 = xmm7;
5771 
5772     const Register tmp1 = r11;
5773     const Register tmp2 = r8;
5774 
5775     BLOCK_COMMENT("Entry:");
5776     __ enter(); // required for proper stackwalking of RuntimeStub frame
5777 
5778     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
5779 
5780     __ leave(); // required for proper stackwalking of RuntimeStub frame
5781     __ ret(0);
5782 
5783     return start;
5784 
5785   }
5786 
5787   address generate_libmLog10() {
5788     StubCodeMark mark(this, "StubRoutines", "libmLog10");
5789 
5790     address start = __ pc();
5791 
5792     const XMMRegister x0 = xmm0;
5793     const XMMRegister x1 = xmm1;
5794     const XMMRegister x2 = xmm2;
5795     const XMMRegister x3 = xmm3;
5796 
5797     const XMMRegister x4 = xmm4;
5798     const XMMRegister x5 = xmm5;
5799     const XMMRegister x6 = xmm6;
5800     const XMMRegister x7 = xmm7;
5801 
5802     const Register tmp = r11;
5803 
5804     BLOCK_COMMENT("Entry:");
5805     __ enter(); // required for proper stackwalking of RuntimeStub frame
5806 
5807     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
5808 
5809     __ leave(); // required for proper stackwalking of RuntimeStub frame
5810     __ ret(0);
5811 
5812     return start;
5813 
5814   }
5815 
5816   address generate_libmPow() {
5817     StubCodeMark mark(this, "StubRoutines", "libmPow");
5818 
5819     address start = __ pc();
5820 
5821     const XMMRegister x0 = xmm0;
5822     const XMMRegister x1 = xmm1;
5823     const XMMRegister x2 = xmm2;
5824     const XMMRegister x3 = xmm3;
5825 
5826     const XMMRegister x4 = xmm4;
5827     const XMMRegister x5 = xmm5;
5828     const XMMRegister x6 = xmm6;
5829     const XMMRegister x7 = xmm7;
5830 
5831     const Register tmp1 = r8;
5832     const Register tmp2 = r9;
5833     const Register tmp3 = r10;
5834     const Register tmp4 = r11;
5835 
5836     BLOCK_COMMENT("Entry:");
5837     __ enter(); // required for proper stackwalking of RuntimeStub frame
5838 
5839     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5840 
5841     __ leave(); // required for proper stackwalking of RuntimeStub frame
5842     __ ret(0);
5843 
5844     return start;
5845 
5846   }
5847 
5848   address generate_libmSin() {
5849     StubCodeMark mark(this, "StubRoutines", "libmSin");
5850 
5851     address start = __ pc();
5852 
5853     const XMMRegister x0 = xmm0;
5854     const XMMRegister x1 = xmm1;
5855     const XMMRegister x2 = xmm2;
5856     const XMMRegister x3 = xmm3;
5857 
5858     const XMMRegister x4 = xmm4;
5859     const XMMRegister x5 = xmm5;
5860     const XMMRegister x6 = xmm6;
5861     const XMMRegister x7 = xmm7;
5862 
5863     const Register tmp1 = r8;
5864     const Register tmp2 = r9;
5865     const Register tmp3 = r10;
5866     const Register tmp4 = r11;
5867 
5868     BLOCK_COMMENT("Entry:");
5869     __ enter(); // required for proper stackwalking of RuntimeStub frame
5870 
5871 #ifdef _WIN64
5872     __ push(rsi);
5873     __ push(rdi);
5874 #endif
5875     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5876 
5877 #ifdef _WIN64
5878     __ pop(rdi);
5879     __ pop(rsi);
5880 #endif
5881 
5882     __ leave(); // required for proper stackwalking of RuntimeStub frame
5883     __ ret(0);
5884 
5885     return start;
5886 
5887   }
5888 
5889   address generate_libmCos() {
5890     StubCodeMark mark(this, "StubRoutines", "libmCos");
5891 
5892     address start = __ pc();
5893 
5894     const XMMRegister x0 = xmm0;
5895     const XMMRegister x1 = xmm1;
5896     const XMMRegister x2 = xmm2;
5897     const XMMRegister x3 = xmm3;
5898 
5899     const XMMRegister x4 = xmm4;
5900     const XMMRegister x5 = xmm5;
5901     const XMMRegister x6 = xmm6;
5902     const XMMRegister x7 = xmm7;
5903 
5904     const Register tmp1 = r8;
5905     const Register tmp2 = r9;
5906     const Register tmp3 = r10;
5907     const Register tmp4 = r11;
5908 
5909     BLOCK_COMMENT("Entry:");
5910     __ enter(); // required for proper stackwalking of RuntimeStub frame
5911 
5912 #ifdef _WIN64
5913     __ push(rsi);
5914     __ push(rdi);
5915 #endif
5916     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5917 
5918 #ifdef _WIN64
5919     __ pop(rdi);
5920     __ pop(rsi);
5921 #endif
5922 
5923     __ leave(); // required for proper stackwalking of RuntimeStub frame
5924     __ ret(0);
5925 
5926     return start;
5927 
5928   }
5929 
5930   address generate_libmTan() {
5931     StubCodeMark mark(this, "StubRoutines", "libmTan");
5932 
5933     address start = __ pc();
5934 
5935     const XMMRegister x0 = xmm0;
5936     const XMMRegister x1 = xmm1;
5937     const XMMRegister x2 = xmm2;
5938     const XMMRegister x3 = xmm3;
5939 
5940     const XMMRegister x4 = xmm4;
5941     const XMMRegister x5 = xmm5;
5942     const XMMRegister x6 = xmm6;
5943     const XMMRegister x7 = xmm7;
5944 
5945     const Register tmp1 = r8;
5946     const Register tmp2 = r9;
5947     const Register tmp3 = r10;
5948     const Register tmp4 = r11;
5949 
5950     BLOCK_COMMENT("Entry:");
5951     __ enter(); // required for proper stackwalking of RuntimeStub frame
5952 
5953 #ifdef _WIN64
5954     __ push(rsi);
5955     __ push(rdi);
5956 #endif
5957     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
5958 
5959 #ifdef _WIN64
5960     __ pop(rdi);
5961     __ pop(rsi);
5962 #endif
5963 
5964     __ leave(); // required for proper stackwalking of RuntimeStub frame
5965     __ ret(0);
5966 
5967     return start;
5968 
5969   }
5970 
5971 #undef __
5972 #define __ masm->
5973 
5974   // Continuation point for throwing of implicit exceptions that are
5975   // not handled in the current activation. Fabricates an exception
5976   // oop and initiates normal exception dispatching in this
5977   // frame. Since we need to preserve callee-saved values (currently
5978   // only for C2, but done for C1 as well) we need a callee-saved oop
5979   // map and therefore have to make these stubs into RuntimeStubs
5980   // rather than BufferBlobs.  If the compiler needs all registers to
5981   // be preserved between the fault point and the exception handler
5982   // then it must assume responsibility for that in
5983   // AbstractCompiler::continuation_for_implicit_null_exception or
5984   // continuation_for_implicit_division_by_zero_exception. All other
5985   // implicit exceptions (e.g., NullPointerException or
5986   // AbstractMethodError on entry) are either at call sites or
5987   // otherwise assume that stack unwinding will be initiated, so
5988   // caller saved registers were assumed volatile in the compiler.
5989   address generate_throw_exception(const char* name,
5990                                    address runtime_entry,
5991                                    Register arg1 = noreg,
5992                                    Register arg2 = noreg) {
5993     // Information about frame layout at time of blocking runtime call.
5994     // Note that we only have to preserve callee-saved registers since
5995     // the compilers are responsible for supplying a continuation point
5996     // if they expect all registers to be preserved.
5997     enum layout {
5998       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
5999       rbp_off2,
6000       return_off,
6001       return_off2,
6002       framesize // inclusive of return address
6003     };
6004 
6005     int insts_size = 512;
6006     int locs_size  = 64;
6007 
6008     CodeBuffer code(name, insts_size, locs_size);
6009     OopMapSet* oop_maps  = new OopMapSet();
6010     MacroAssembler* masm = new MacroAssembler(&code);
6011 
6012     address start = __ pc();
6013 
6014     // This is an inlined and slightly modified version of call_VM
6015     // which has the ability to fetch the return PC out of
6016     // thread-local storage and also sets up last_Java_sp slightly
6017     // differently than the real call_VM
6018 
6019     __ enter(); // required for proper stackwalking of RuntimeStub frame
6020 
6021     assert(is_even(framesize/2), "sp not 16-byte aligned");
6022 
6023     // return address and rbp are already in place
6024     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
6025 
6026     int frame_complete = __ pc() - start;
6027 
6028     // Set up last_Java_sp and last_Java_fp
6029     address the_pc = __ pc();
6030     __ set_last_Java_frame(rsp, rbp, the_pc);
6031     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
6032 
6033     // Call runtime
6034     if (arg1 != noreg) {
6035       assert(arg2 != c_rarg1, "clobbered");
6036       __ movptr(c_rarg1, arg1);
6037     }
6038     if (arg2 != noreg) {
6039       __ movptr(c_rarg2, arg2);
6040     }
6041     __ movptr(c_rarg0, r15_thread);
6042     BLOCK_COMMENT("call runtime_entry");
6043     __ call(RuntimeAddress(runtime_entry));
6044 
6045     // Generate oop map
6046     OopMap* map = new OopMap(framesize, 0);
6047 
6048     oop_maps->add_gc_map(the_pc - start, map);
6049 
6050     __ reset_last_Java_frame(true);
6051 
6052     __ leave(); // required for proper stackwalking of RuntimeStub frame
6053 
6054     // check for pending exceptions
6055 #ifdef ASSERT
6056     Label L;
6057     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
6058             (int32_t) NULL_WORD);
6059     __ jcc(Assembler::notEqual, L);
6060     __ should_not_reach_here();
6061     __ bind(L);
6062 #endif // ASSERT
6063     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
6064 
6065 
6066     // codeBlob framesize is in words (not VMRegImpl::slot_size)
6067     RuntimeStub* stub =
6068       RuntimeStub::new_runtime_stub(name,
6069                                     &code,
6070                                     frame_complete,
6071                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
6072                                     oop_maps, false);
6073     return stub->entry_point();
6074   }
6075 
6076   void create_control_words() {
6077     // Round to nearest, 53-bit mode, exceptions masked
6078     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
6079     // Round to zero, 53-bit mode, exception mased
6080     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
6081     // Round to nearest, 24-bit mode, exceptions masked
6082     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
6083     // Round to nearest, 64-bit mode, exceptions masked
6084     StubRoutines::_mxcsr_std           = 0x1F80;
6085     // Note: the following two constants are 80-bit values
6086     //       layout is critical for correct loading by FPU.
6087     // Bias for strict fp multiply/divide
6088     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
6089     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
6090     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
6091     // Un-Bias for strict fp multiply/divide
6092     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
6093     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
6094     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
6095   }
6096 
6097   // Initialization
6098   void generate_initial() {
6099     // Generates all stubs and initializes the entry points
6100 
6101     // This platform-specific settings are needed by generate_call_stub()
6102     create_control_words();
6103 
6104     // entry points that exist in all platforms Note: This is code
6105     // that could be shared among different platforms - however the
6106     // benefit seems to be smaller than the disadvantage of having a
6107     // much more complicated generator structure. See also comment in
6108     // stubRoutines.hpp.
6109 
6110     StubRoutines::_forward_exception_entry = generate_forward_exception();
6111 
6112     StubRoutines::_call_stub_entry =
6113       generate_call_stub(StubRoutines::_call_stub_return_address);
6114 
6115     // is referenced by megamorphic call
6116     StubRoutines::_catch_exception_entry = generate_catch_exception();
6117 
6118     // atomic calls
6119     StubRoutines::_atomic_xchg_entry          = generate_atomic_xchg();
6120     StubRoutines::_atomic_xchg_long_entry     = generate_atomic_xchg_long();
6121     StubRoutines::_atomic_cmpxchg_entry       = generate_atomic_cmpxchg();
6122     StubRoutines::_atomic_cmpxchg_byte_entry  = generate_atomic_cmpxchg_byte();
6123     StubRoutines::_atomic_cmpxchg_long_entry  = generate_atomic_cmpxchg_long();
6124     StubRoutines::_atomic_add_entry           = generate_atomic_add();
6125     StubRoutines::_atomic_add_long_entry      = generate_atomic_add_long();
6126     StubRoutines::_fence_entry                = generate_orderaccess_fence();
6127 
6128     // platform dependent
6129     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
6130     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
6131 
6132     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
6133 
6134     // Build this early so it's available for the interpreter.
6135     StubRoutines::_throw_StackOverflowError_entry =
6136       generate_throw_exception("StackOverflowError throw_exception",
6137                                CAST_FROM_FN_PTR(address,
6138                                                 SharedRuntime::
6139                                                 throw_StackOverflowError));
6140     StubRoutines::_throw_delayed_StackOverflowError_entry =
6141       generate_throw_exception("delayed StackOverflowError throw_exception",
6142                                CAST_FROM_FN_PTR(address,
6143                                                 SharedRuntime::
6144                                                 throw_delayed_StackOverflowError));
6145     if (UseCRC32Intrinsics) {
6146       // set table address before stub generation which use it
6147       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
6148       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
6149     }
6150 
6151     if (UseCRC32CIntrinsics) {
6152       bool supports_clmul = VM_Version::supports_clmul();
6153       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
6154       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
6155       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
6156     }
6157     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
6158       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
6159           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
6160           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
6161         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
6162         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
6163         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
6164         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
6165         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
6166         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
6167         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
6168         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
6169         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
6170         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
6171         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
6172         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
6173         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
6174         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
6175       }
6176       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
6177         StubRoutines::_dexp = generate_libmExp();
6178       }
6179       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
6180         StubRoutines::_dlog = generate_libmLog();
6181       }
6182       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
6183         StubRoutines::_dlog10 = generate_libmLog10();
6184       }
6185       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
6186         StubRoutines::_dpow = generate_libmPow();
6187       }
6188       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
6189         StubRoutines::_dsin = generate_libmSin();
6190       }
6191       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
6192         StubRoutines::_dcos = generate_libmCos();
6193       }
6194       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
6195         StubRoutines::_dtan = generate_libmTan();
6196       }
6197     }
6198   }
6199 
6200   void generate_all() {
6201     // Generates all stubs and initializes the entry points
6202 
6203     // These entry points require SharedInfo::stack0 to be set up in
6204     // non-core builds and need to be relocatable, so they each
6205     // fabricate a RuntimeStub internally.
6206     StubRoutines::_throw_AbstractMethodError_entry =
6207       generate_throw_exception("AbstractMethodError throw_exception",
6208                                CAST_FROM_FN_PTR(address,
6209                                                 SharedRuntime::
6210                                                 throw_AbstractMethodError));
6211 
6212     StubRoutines::_throw_IncompatibleClassChangeError_entry =
6213       generate_throw_exception("IncompatibleClassChangeError throw_exception",
6214                                CAST_FROM_FN_PTR(address,
6215                                                 SharedRuntime::
6216                                                 throw_IncompatibleClassChangeError));
6217 
6218     StubRoutines::_throw_NullPointerException_at_call_entry =
6219       generate_throw_exception("NullPointerException at call throw_exception",
6220                                CAST_FROM_FN_PTR(address,
6221                                                 SharedRuntime::
6222                                                 throw_NullPointerException_at_call));
6223 
6224     // entry points that are platform specific
6225     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
6226     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
6227     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
6228     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
6229 
6230     StubRoutines::x86::_vector_iota_indices     = generate_iota_indices("iota_indices");
6231     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
6232     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
6233     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
6234     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
6235     StubRoutines::x86::_vector_float_sign_mask = generate_vector_fp_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF);
6236     StubRoutines::x86::_vector_float_sign_flip = generate_vector_fp_mask("vector_float_sign_flip", 0x8000000080000000);
6237     StubRoutines::x86::_vector_double_sign_mask = generate_vector_fp_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF);
6238     StubRoutines::x86::_vector_double_sign_flip = generate_vector_fp_mask("vector_double_sign_flip", 0x8000000000000000);
6239     StubRoutines::x86::_vector_all_bits_set = generate_vector_fp_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF);
6240     StubRoutines::x86::_vector_byte_bitset = generate_vector_fp_mask("vector_byte_bitset", 0x0101010101010101);
6241     StubRoutines::x86::_vector_long_perm_mask = generate_vector_custom_i32("vector_long_perm_mask", Assembler::AVX_512bit,
6242                                                                            0, 2, 4, 6, 8, 10, 12, 14);
6243     StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_fp_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff);
6244     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
6245     StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_fp_mask("vector_int_to_byte_mask", 0x000000ff000000ff);
6246     StubRoutines::x86::_vector_int_to_short_mask = generate_vector_fp_mask("vector_int_to_short_mask", 0x0000ffff0000ffff);
6247     StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit,
6248                                                                         0xFFFFFFFF, 0, 0, 0);
6249     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
6250                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
6251     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_fp_mask("vector_int_shuffle_mask", 0x0302010003020100);
6252     StubRoutines::x86::_vector_int_size_mask = generate_vector_fp_mask("vector_int_size_mask", 0x0000000400000004);
6253     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_fp_mask("vector_short_shuffle_mask", 0x0100010001000100);
6254     StubRoutines::x86::_vector_short_size_mask = generate_vector_fp_mask("vector_short_size_mask", 0x0002000200020002);
6255     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_fp_mask("vector_long_shuffle_mask", 0x0000000100000000);
6256     StubRoutines::x86::_vector_long_size_mask = generate_vector_fp_mask("vector_long_size_mask", 0x0000000200000002);
6257 
6258     // support for verify_oop (must happen after universe_init)
6259     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
6260 
6261     // arraycopy stubs used by compilers
6262     generate_arraycopy_stubs();
6263 
6264     // don't bother generating these AES intrinsic stubs unless global flag is set
6265     if (UseAESIntrinsics) {
6266       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
6267       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
6268       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
6269       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
6270       if (VM_Version::supports_vaes() &&  VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq() ) {
6271         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
6272       } else {
6273         StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
6274       }
6275     }
6276     if (UseAESCTRIntrinsics){
6277       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
6278       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
6279     }
6280 
6281     if (UseSHA1Intrinsics) {
6282       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
6283       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
6284       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
6285       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
6286     }
6287     if (UseSHA256Intrinsics) {
6288       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
6289       char* dst = (char*)StubRoutines::x86::_k256_W;
6290       char* src = (char*)StubRoutines::x86::_k256;
6291       for (int ii = 0; ii < 16; ++ii) {
6292         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
6293         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
6294       }
6295       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
6296       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
6297       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
6298       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
6299     }
6300     if (UseSHA512Intrinsics) {
6301       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
6302       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
6303       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
6304       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
6305     }
6306 
6307     // Generate GHASH intrinsics code
6308     if (UseGHASHIntrinsics) {
6309     StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
6310     StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
6311       if (VM_Version::supports_avx()) {
6312         StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
6313         StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
6314         StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
6315       } else {
6316         StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
6317       }
6318     }
6319 
6320     if (UseBASE64Intrinsics) {
6321       StubRoutines::x86::_and_mask = base64_and_mask_addr();
6322       StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr();
6323       StubRoutines::x86::_base64_charset = base64_charset_addr();
6324       StubRoutines::x86::_url_charset = base64url_charset_addr();
6325       StubRoutines::x86::_gather_mask = base64_gather_mask_addr();
6326       StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr();
6327       StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr();
6328       StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
6329     }
6330 
6331     // Safefetch stubs.
6332     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
6333                                                        &StubRoutines::_safefetch32_fault_pc,
6334                                                        &StubRoutines::_safefetch32_continuation_pc);
6335     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
6336                                                        &StubRoutines::_safefetchN_fault_pc,
6337                                                        &StubRoutines::_safefetchN_continuation_pc);
6338 
6339     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
6340     if (bs_nm != NULL) {
6341       StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
6342     }
6343 #ifdef COMPILER2
6344     if (UseMultiplyToLenIntrinsic) {
6345       StubRoutines::_multiplyToLen = generate_multiplyToLen();
6346     }
6347     if (UseSquareToLenIntrinsic) {
6348       StubRoutines::_squareToLen = generate_squareToLen();
6349     }
6350     if (UseMulAddIntrinsic) {
6351       StubRoutines::_mulAdd = generate_mulAdd();
6352     }
6353 #ifndef _WINDOWS
6354     if (UseMontgomeryMultiplyIntrinsic) {
6355       StubRoutines::_montgomeryMultiply
6356         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
6357     }
6358     if (UseMontgomerySquareIntrinsic) {
6359       StubRoutines::_montgomerySquare
6360         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
6361     }
6362 #endif // WINDOWS
6363 #endif // COMPILER2
6364 
6365     if (UseVectorizedMismatchIntrinsic) {
6366       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
6367     }
6368 
6369 #ifdef __VECTOR_API_MATH_INTRINSICS_COMMON
6370     if (UseVectorApiIntrinsics) {
6371       if (UseAVX >= 1) {
6372           #if defined(__VECTOR_API_MATH_INTRINSICS_LINUX)
6373           if (UseAVX > 2) {
6374               StubRoutines::_vector_float512_exp = CAST_FROM_FN_PTR(address, __svml_expf16_ha_z0);
6375               StubRoutines::_vector_double512_exp = CAST_FROM_FN_PTR(address, __svml_exp8_ha_z0); 
6376               StubRoutines::_vector_float512_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f16_ha_z0);
6377               StubRoutines::_vector_double512_expm1 = CAST_FROM_FN_PTR(address, __svml_expm18_ha_z0);
6378               StubRoutines::_vector_float512_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf16_ha_z0);
6379               StubRoutines::_vector_double512_log1p = CAST_FROM_FN_PTR(address, __svml_log1p8_ha_z0);
6380               StubRoutines::_vector_float512_log = CAST_FROM_FN_PTR(address, __svml_logf16_ha_z0);
6381               StubRoutines::_vector_double512_log = CAST_FROM_FN_PTR(address, __svml_log8_ha_z0);
6382               StubRoutines::_vector_float512_log10 = CAST_FROM_FN_PTR(address, __svml_log10f16_ha_z0);
6383               StubRoutines::_vector_double512_log10 = CAST_FROM_FN_PTR(address, __svml_log108_ha_z0);
6384               StubRoutines::_vector_float512_sin = CAST_FROM_FN_PTR(address, __svml_sinf16_ha_z0);      
6385               StubRoutines::_vector_double512_sin = CAST_FROM_FN_PTR(address, __svml_sin8_ha_z0);
6386               StubRoutines::_vector_float512_cos = CAST_FROM_FN_PTR(address, __svml_cosf16_ha_z0);      
6387               StubRoutines::_vector_double512_cos = CAST_FROM_FN_PTR(address, __svml_cos8_ha_z0);
6388               StubRoutines::_vector_float512_tan = CAST_FROM_FN_PTR(address, __svml_tanf16_ha_z0);
6389               StubRoutines::_vector_double512_tan = CAST_FROM_FN_PTR(address, __svml_tan8_ha_z0);      
6390               StubRoutines::_vector_float512_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf16_ha_z0);
6391               StubRoutines::_vector_double512_sinh = CAST_FROM_FN_PTR(address, __svml_sinh8_ha_z0);
6392               StubRoutines::_vector_float512_cosh = CAST_FROM_FN_PTR(address, __svml_coshf16_ha_z0);
6393               StubRoutines::_vector_double512_cosh = CAST_FROM_FN_PTR(address, __svml_cosh8_ha_z0);
6394               StubRoutines::_vector_float512_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf16_ha_z0);
6395               StubRoutines::_vector_double512_tanh = CAST_FROM_FN_PTR(address, __svml_tanh8_ha_z0);
6396               StubRoutines::_vector_float512_acos = CAST_FROM_FN_PTR(address, __svml_acosf16_ha_z0);
6397               StubRoutines::_vector_double512_acos = CAST_FROM_FN_PTR(address, __svml_acos8_ha_z0);
6398               StubRoutines::_vector_float512_asin = CAST_FROM_FN_PTR(address, __svml_asinf16_ha_z0);
6399               StubRoutines::_vector_double512_asin = CAST_FROM_FN_PTR(address, __svml_asin8_ha_z0);
6400               StubRoutines::_vector_float512_atan = CAST_FROM_FN_PTR(address, __svml_atanf16_ha_z0);
6401               StubRoutines::_vector_double512_atan = CAST_FROM_FN_PTR(address, __svml_atan8_ha_z0);
6402               StubRoutines::_vector_float512_pow = CAST_FROM_FN_PTR(address, __svml_powf16_ha_z0);
6403               StubRoutines::_vector_double512_pow = CAST_FROM_FN_PTR(address, __svml_pow8_ha_z0);
6404               StubRoutines::_vector_float512_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf16_ha_z0);
6405               StubRoutines::_vector_double512_hypot = CAST_FROM_FN_PTR(address, __svml_hypot8_ha_z0);
6406               StubRoutines::_vector_float512_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf16_ha_z0);
6407               StubRoutines::_vector_double512_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt8_ha_z0);
6408               StubRoutines::_vector_float512_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f16_ha_z0);
6409               StubRoutines::_vector_double512_atan2 = CAST_FROM_FN_PTR(address, __svml_atan28_ha_z0);
6410           }
6411           #endif
6412         if (UseAVX==1) {
6413           StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_e9);  
6414           StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_e9);
6415           StubRoutines::_vector_float256_exp = CAST_FROM_FN_PTR(address, __svml_expf8_ha_e9); 
6416           StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_e9);  
6417           StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_e9); 
6418           StubRoutines::_vector_double256_exp = CAST_FROM_FN_PTR(address, __svml_exp4_ha_e9);
6419           StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_e9);
6420           StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_e9);
6421           StubRoutines::_vector_float256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f8_ha_e9);
6422           StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_e9);
6423           StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_e9);
6424           StubRoutines::_vector_double256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm14_ha_e9);
6425           StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_e9);
6426           StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_e9);
6427           StubRoutines::_vector_float256_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf8_ha_e9);
6428           StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_e9);
6429           StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_e9);
6430           StubRoutines::_vector_double256_log1p = CAST_FROM_FN_PTR(address, __svml_log1p4_ha_e9);
6431           StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_e9);
6432           StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_e9);
6433           StubRoutines::_vector_float256_log = CAST_FROM_FN_PTR(address, __svml_logf8_ha_e9);
6434           StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_e9);
6435           StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_e9);
6436           StubRoutines::_vector_double256_log = CAST_FROM_FN_PTR(address, __svml_log4_ha_e9);
6437           StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_e9);
6438           StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_e9);
6439           StubRoutines::_vector_float256_log10 = CAST_FROM_FN_PTR(address, __svml_log10f8_ha_e9);
6440           StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_e9);
6441           StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_e9);
6442           StubRoutines::_vector_double256_log10 = CAST_FROM_FN_PTR(address, __svml_log104_ha_e9);
6443           StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_e9);
6444           StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_e9);
6445           StubRoutines::_vector_float256_sin = CAST_FROM_FN_PTR(address, __svml_sinf8_ha_e9);
6446           StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_e9);
6447           StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_e9);
6448           StubRoutines::_vector_double256_sin = CAST_FROM_FN_PTR(address, __svml_sin4_ha_e9);
6449           StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_e9);
6450           StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_e9);
6451           StubRoutines::_vector_float256_cos = CAST_FROM_FN_PTR(address, __svml_cosf8_ha_e9);
6452           StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_e9);
6453           StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_e9);
6454           StubRoutines::_vector_double256_cos = CAST_FROM_FN_PTR(address, __svml_cos4_ha_e9);
6455           StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_e9);
6456           StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_e9);
6457           StubRoutines::_vector_float256_tan = CAST_FROM_FN_PTR(address, __svml_tanf8_ha_e9);
6458           StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_e9);
6459           StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_e9);
6460           StubRoutines::_vector_double256_tan = CAST_FROM_FN_PTR(address, __svml_tan4_ha_e9);
6461           StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_e9);
6462           StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_e9);
6463           StubRoutines::_vector_float256_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf8_ha_e9);
6464           StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_e9);
6465           StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_e9);
6466           StubRoutines::_vector_double256_sinh = CAST_FROM_FN_PTR(address, __svml_sinh4_ha_e9);
6467           StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_e9);
6468           StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_e9);
6469           StubRoutines::_vector_float256_cosh = CAST_FROM_FN_PTR(address, __svml_coshf8_ha_e9);
6470           StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_e9);
6471           StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_e9);
6472           StubRoutines::_vector_double256_cosh = CAST_FROM_FN_PTR(address, __svml_cosh4_ha_e9);
6473           StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_e9);
6474           StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_e9);
6475           StubRoutines::_vector_float256_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf8_ha_e9);
6476           StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_e9);
6477           StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_e9);
6478           StubRoutines::_vector_double256_tanh = CAST_FROM_FN_PTR(address, __svml_tanh4_ha_e9);
6479           StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_e9);
6480           StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_e9);
6481           StubRoutines::_vector_float256_acos = CAST_FROM_FN_PTR(address, __svml_acosf8_ha_e9);
6482           StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_e9);
6483           StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_e9);
6484           StubRoutines::_vector_double256_acos = CAST_FROM_FN_PTR(address, __svml_acos4_ha_e9);
6485           StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_e9);
6486           StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_e9);
6487           StubRoutines::_vector_float256_asin = CAST_FROM_FN_PTR(address, __svml_asinf8_ha_e9);
6488           StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_e9);
6489           StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_e9);
6490           StubRoutines::_vector_double256_asin = CAST_FROM_FN_PTR(address, __svml_asin4_ha_e9);
6491           StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_e9);
6492           StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_e9);
6493           StubRoutines::_vector_float256_atan = CAST_FROM_FN_PTR(address, __svml_atanf8_ha_e9);
6494           StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_e9);
6495           StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_e9);
6496           StubRoutines::_vector_double256_atan = CAST_FROM_FN_PTR(address, __svml_atan4_ha_e9);
6497           StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
6498           StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
6499           StubRoutines::_vector_float256_pow = CAST_FROM_FN_PTR(address, __svml_powf8_ha_e9);
6500           StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_e9);
6501           StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_e9);
6502           StubRoutines::_vector_double256_pow = CAST_FROM_FN_PTR(address, __svml_pow4_ha_e9);
6503           StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
6504           StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
6505           StubRoutines::_vector_float256_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_e9);
6506           StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_e9);
6507           StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_e9);
6508           StubRoutines::_vector_double256_hypot = CAST_FROM_FN_PTR(address, __svml_hypot4_ha_e9);
6509           StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_e9);
6510           StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_e9);
6511           StubRoutines::_vector_float256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf8_ha_e9);
6512           StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_e9);
6513           StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_e9);
6514           StubRoutines::_vector_double256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt4_ha_e9);
6515           StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_e9);
6516           StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_e9);
6517           StubRoutines::_vector_float256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f8_ha_e9);
6518           StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_e9);
6519           StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_e9);
6520           StubRoutines::_vector_double256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan24_ha_e9);  
6521         }  
6522         else {
6523           StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_l9);  
6524           StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_l9);
6525           StubRoutines::_vector_float256_exp = CAST_FROM_FN_PTR(address, __svml_expf8_ha_l9); 
6526           StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_l9);  
6527           StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_l9); 
6528           StubRoutines::_vector_double256_exp = CAST_FROM_FN_PTR(address, __svml_exp4_ha_l9);
6529           StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_l9);
6530           StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_l9);
6531           StubRoutines::_vector_float256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f8_ha_l9);
6532           StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_l9);
6533           StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_l9);
6534           StubRoutines::_vector_double256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm14_ha_l9);
6535           StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_l9);
6536           StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_l9);
6537           StubRoutines::_vector_float256_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf8_ha_l9);
6538           StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_l9);
6539           StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_l9);
6540           StubRoutines::_vector_double256_log1p = CAST_FROM_FN_PTR(address, __svml_log1p4_ha_l9);
6541           StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_l9);
6542           StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_l9);
6543           StubRoutines::_vector_float256_log = CAST_FROM_FN_PTR(address, __svml_logf8_ha_l9);
6544           StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_l9);
6545           StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_l9);
6546           StubRoutines::_vector_double256_log = CAST_FROM_FN_PTR(address, __svml_log4_ha_l9);
6547           StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_l9);
6548           StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_l9);
6549           StubRoutines::_vector_float256_log10 = CAST_FROM_FN_PTR(address, __svml_log10f8_ha_l9);
6550           StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_l9);
6551           StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_l9);
6552           StubRoutines::_vector_double256_log10 = CAST_FROM_FN_PTR(address, __svml_log104_ha_l9);
6553           StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_l9);
6554           StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_l9);
6555           StubRoutines::_vector_float256_sin = CAST_FROM_FN_PTR(address, __svml_sinf8_ha_l9);
6556           StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_l9);
6557           StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_l9);
6558           StubRoutines::_vector_double256_sin = CAST_FROM_FN_PTR(address, __svml_sin4_ha_l9);
6559           StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_l9);
6560           StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_l9);
6561           StubRoutines::_vector_float256_cos = CAST_FROM_FN_PTR(address, __svml_cosf8_ha_l9);
6562           StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_l9);
6563           StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_l9);
6564           StubRoutines::_vector_double256_cos = CAST_FROM_FN_PTR(address, __svml_cos4_ha_l9);
6565           StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_l9);
6566           StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_l9);
6567           StubRoutines::_vector_float256_tan = CAST_FROM_FN_PTR(address, __svml_tanf8_ha_l9);
6568           StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_l9);
6569           StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_l9);
6570           StubRoutines::_vector_double256_tan = CAST_FROM_FN_PTR(address, __svml_tan4_ha_l9);
6571           StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_l9);
6572           StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_l9);
6573           StubRoutines::_vector_float256_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf8_ha_l9);
6574           StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_l9);
6575           StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_l9);
6576           StubRoutines::_vector_double256_sinh = CAST_FROM_FN_PTR(address, __svml_sinh4_ha_l9);
6577           StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_l9);
6578           StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_l9);
6579           StubRoutines::_vector_float256_cosh = CAST_FROM_FN_PTR(address, __svml_coshf8_ha_l9);
6580           StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_l9);
6581           StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_l9);
6582           StubRoutines::_vector_double256_cosh = CAST_FROM_FN_PTR(address, __svml_cosh4_ha_l9);
6583           StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_l9);
6584           StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_l9);
6585           StubRoutines::_vector_float256_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf8_ha_l9);
6586           StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_l9);
6587           StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_l9);
6588           StubRoutines::_vector_double256_tanh = CAST_FROM_FN_PTR(address, __svml_tanh4_ha_l9);
6589           StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_l9);
6590           StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_l9);
6591           StubRoutines::_vector_float256_acos = CAST_FROM_FN_PTR(address, __svml_acosf8_ha_l9);
6592           StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_l9);
6593           StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_l9);
6594           StubRoutines::_vector_double256_acos = CAST_FROM_FN_PTR(address, __svml_acos4_ha_l9);
6595           StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_l9);
6596           StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_l9);
6597           StubRoutines::_vector_float256_asin = CAST_FROM_FN_PTR(address, __svml_asinf8_ha_l9);
6598           StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_l9);
6599           StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_l9);
6600           StubRoutines::_vector_double256_asin = CAST_FROM_FN_PTR(address, __svml_asin4_ha_l9);
6601           StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_l9);
6602           StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_l9);
6603           StubRoutines::_vector_float256_atan = CAST_FROM_FN_PTR(address, __svml_atanf8_ha_l9);
6604           StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_l9);
6605           StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_l9);
6606           StubRoutines::_vector_double256_atan = CAST_FROM_FN_PTR(address, __svml_atan4_ha_l9);
6607           StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
6608           StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
6609           StubRoutines::_vector_float256_pow = CAST_FROM_FN_PTR(address, __svml_powf8_ha_l9);
6610           StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_l9);
6611           StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_l9);
6612           StubRoutines::_vector_double256_pow = CAST_FROM_FN_PTR(address, __svml_pow4_ha_l9);
6613           StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
6614           StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
6615           StubRoutines::_vector_float256_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_l9);
6616           StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_l9);
6617           StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_l9);
6618           StubRoutines::_vector_double256_hypot = CAST_FROM_FN_PTR(address, __svml_hypot4_ha_l9);
6619           StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_l9);
6620           StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_l9);
6621           StubRoutines::_vector_float256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf8_ha_l9);
6622           StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_l9);
6623           StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_l9);
6624           StubRoutines::_vector_double256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt4_ha_l9);
6625           StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_l9);
6626           StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_l9);
6627           StubRoutines::_vector_float256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f8_ha_l9);
6628           StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_l9);
6629           StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_l9);
6630           StubRoutines::_vector_double256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan24_ha_l9);  
6631       }  
6632         
6633        
6634       } else if (UseSSE>=2) {
6635         StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_ex);  
6636         StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_ex);  
6637         StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_ex);  
6638         StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_ex);  
6639         StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_ex);
6640         StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_ex);  
6641         StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_ex);
6642         StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_ex);   
6643         StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_ex);   
6644         StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_ex);  
6645         StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_ex);
6646         StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_ex);      
6647         StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_ex);
6648         StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_ex);  
6649         StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_ex);
6650         StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_ex);      
6651         StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_ex);
6652         StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_ex);  
6653         StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_ex);
6654         StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_ex);      
6655         StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_ex);
6656         StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_ex);  
6657         StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_ex);
6658         StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_ex);      
6659         StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_ex);
6660         StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_ex);  
6661         StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_ex);
6662         StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_ex);      
6663         StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_ex);
6664         StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_ex);  
6665         StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_ex);
6666         StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_ex);      
6667         StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_ex);
6668         StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_ex);  
6669         StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_ex);
6670         StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_ex);      
6671         StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_ex);
6672         StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_ex);
6673         StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_ex);  
6674         StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_ex);      
6675         StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_ex);
6676         StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_ex);  
6677         StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_ex);
6678         StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_ex);      
6679         StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_ex);
6680         StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_ex);  
6681         StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_ex);
6682         StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_ex);      
6683         StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_ex);
6684         StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_ex);  
6685         StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_ex);
6686         StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_ex);      
6687         StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_ex);
6688         StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_ex);  
6689         StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_ex);
6690         StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_ex);      
6691         StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_ex);
6692         StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_ex); 
6693         StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_ex);
6694         StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_ex);      
6695         StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_ex);
6696         StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_ex);  
6697         StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_ex);
6698         StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_ex);      
6699         StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);
6700         StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);  
6701         StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_ex);
6702         StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_ex);      
6703         StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);
6704         StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);  
6705         StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_ex);
6706         StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_ex);      
6707       }
6708   }
6709 #endif
6710   }
6711 
6712  public:
6713   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
6714     if (all) {
6715       generate_all();
6716     } else {
6717       generate_initial();
6718     }
6719   }
6720 }; // end class declaration
6721 
6722 void StubGenerator_generate(CodeBuffer* code, bool all) {
6723   StubGenerator g(code, all);
6724 }