/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "ci/ciUtilities.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/barrierSetNMethod.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_x86.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

#ifdef __VECTOR_API_MATH_INTRINSICS_COMMON
// Vector API SVML routines written in assembly
extern "C"
{
  float __svml_expf4_ha_ex(float a);
  double __svml_exp1_ha_ex(double a);
  double __svml_exp2_ha_ex(double a);
  float __svml_expf4_ha_l9(float a);
  float __svml_expf8_ha_l9(float a);
  float __svml_expf4_ha_e9(float a);
  float __svml_expf8_ha_e9(float a);
  float __svml_expf16_ha_z0(float a);
  double __svml_exp1_ha_l9(double a);
  double __svml_exp2_ha_l9(double a);
  double __svml_exp4_ha_l9(double a);
  double __svml_exp1_ha_e9(double a);
  double __svml_exp2_ha_e9(double a);
  double __svml_exp4_ha_e9(double a);
  double __svml_exp8_ha_z0(double a);
  float __svml_expm1f4_ha_ex(float a);
  double __svml_expm11_ha_ex(double a);
  double __svml_expm12_ha_ex(double a);
  float __svml_expm1f4_ha_l9(float a);
  float __svml_expm1f8_ha_l9(float a);
  float __svml_expm1f4_ha_e9(float a);
  float __svml_expm1f8_ha_e9(float a);
  float __svml_expm1f16_ha_z0(float a);
  double __svml_expm11_ha_l9(double a);
  double __svml_expm12_ha_l9(double a);
  double __svml_expm14_ha_l9(double a);
  double __svml_expm11_ha_e9(double a);
  double __svml_expm12_ha_e9(double a);
  double __svml_expm14_ha_e9(double a);
  double __svml_expm18_ha_z0(double a);
  float __svml_log1pf4_ha_l9(float a);
  float __svml_log1pf8_ha_l9(float a);
  float __svml_log1pf4_ha_e9(float a);
  float __svml_log1pf8_ha_e9(float a);
  float __svml_log1pf16_ha_z0(float a);
  double __svml_log1p1_ha_l9(double a);
  double __svml_log1p2_ha_l9(double a);
  double __svml_log1p4_ha_l9(double a);
  double __svml_log1p1_ha_e9(double a);
  double __svml_log1p2_ha_e9(double a);
  double __svml_log1p4_ha_e9(double a);
  double __svml_log1p8_ha_z0(double a);
  float __svml_logf4_ha_l9(float a);
  float __svml_logf8_ha_l9(float a);
  float __svml_logf4_ha_e9(float a);
  float __svml_logf8_ha_e9(float a);
  float __svml_logf16_ha_z0(float a);
  double __svml_log1_ha_l9(double a);
  double __svml_log2_ha_l9(double a);
  double __svml_log4_ha_l9(double a);
  double __svml_log1_ha_e9(double a);
  double __svml_log2_ha_e9(double a);
  double __svml_log4_ha_e9(double a);
  double __svml_log8_ha_z0(double a);
  float __svml_log10f4_ha_l9(float a);
  float __svml_log10f8_ha_l9(float a);
  float __svml_log10f4_ha_e9(float a);
  float __svml_log10f8_ha_e9(float a);
  float __svml_log10f16_ha_z0(float a);
  double __svml_log101_ha_l9(double a);
  double __svml_log102_ha_l9(double a);
  double __svml_log104_ha_l9(double a);
  double __svml_log101_ha_e9(double a);
  double __svml_log102_ha_e9(double a);
  double __svml_log104_ha_e9(double a);
  double __svml_log108_ha_z0(double a);
  float __svml_sinf4_ha_l9(float a);
  float __svml_sinf8_ha_l9(float a);
  float __svml_sinf4_ha_e9(float a);
  float __svml_sinf8_ha_e9(float a);
  float __svml_sinf16_ha_z0(float a);
  double __svml_sin1_ha_l9(double a);
  double __svml_sin2_ha_l9(double a);
  double __svml_sin4_ha_l9(double a);
  double __svml_sin1_ha_e9(double a);
  double __svml_sin2_ha_e9(double a);
  double __svml_sin4_ha_e9(double a);
  double __svml_sin8_ha_z0(double a);
  float __svml_cosf4_ha_l9(float a);
  float __svml_cosf8_ha_l9(float a);
  float __svml_cosf4_ha_e9(float a);
  float __svml_cosf8_ha_e9(float a);
  float __svml_cosf16_ha_z0(float a);
  double __svml_cos1_ha_l9(double a);
  double __svml_cos2_ha_l9(double a);
  double __svml_cos4_ha_l9(double a);
  double __svml_cos1_ha_e9(double a);
  double __svml_cos2_ha_e9(double a);
  double __svml_cos4_ha_e9(double a);
  double __svml_cos8_ha_z0(double a);
  float __svml_tanf4_ha_l9(float a);
  float __svml_tanf8_ha_l9(float a);
  float __svml_tanf4_ha_e9(float a);
  float __svml_tanf8_ha_e9(float a);
  float __svml_tanf16_ha_z0(float a);
  double __svml_tan1_ha_l9(double a);
  double __svml_tan2_ha_l9(double a);
  double __svml_tan4_ha_l9(double a);
  double __svml_tan1_ha_e9(double a);
  double __svml_tan2_ha_e9(double a);
  double __svml_tan4_ha_e9(double a);
  double __svml_tan8_ha_z0(double a);
  double __svml_sinh1_ha_l9(double a);
  double __svml_sinh2_ha_l9(double a);
  double __svml_sinh4_ha_l9(double a);
  double __svml_sinh1_ha_e9(double a);
  double __svml_sinh2_ha_e9(double a);
  double __svml_sinh4_ha_e9(double a);
  double __svml_sinh8_ha_z0(double a);
  float __svml_sinhf4_ha_l9(float a);
  float __svml_sinhf8_ha_l9(float a);
  float __svml_sinhf4_ha_e9(float a);
  float __svml_sinhf8_ha_e9(float a);
  float __svml_sinhf16_ha_z0(float a);
  double __svml_cosh1_ha_l9(double a);
  double __svml_cosh2_ha_l9(double a);
  double __svml_cosh4_ha_l9(double a);
  double __svml_cosh1_ha_e9(double a);
  double __svml_cosh2_ha_e9(double a);
  double __svml_cosh4_ha_e9(double a);
  double __svml_cosh8_ha_z0(double a);
  float __svml_coshf4_ha_l9(float a);
  float __svml_coshf8_ha_l9(float a);
  float __svml_coshf4_ha_e9(float a);
  float __svml_coshf8_ha_e9(float a);
  float __svml_coshf16_ha_z0(float a);
  double __svml_tanh1_ha_l9(double a);
  double __svml_tanh2_ha_l9(double a);
  double __svml_tanh4_ha_l9(double a);
  double __svml_tanh1_ha_e9(double a);
  double __svml_tanh2_ha_e9(double a);
  double __svml_tanh4_ha_e9(double a);
  double __svml_tanh8_ha_z0(double a);
  float __svml_tanhf4_ha_l9(float a);
  float __svml_tanhf8_ha_l9(float a);
  float __svml_tanhf4_ha_e9(float a);
  float __svml_tanhf8_ha_e9(float a);
  float __svml_tanhf16_ha_z0(float a);
  float __svml_acosf4_ha_ex(float a);
  float __svml_acosf4_ha_l9(float a);
  float __svml_acosf8_ha_l9(float a);
  float __svml_acosf4_ha_e9(float a);
  float __svml_acosf8_ha_e9(float a);
  float __svml_acosf16_ha_z0(float a);
  double __svml_acos1_ha_ex(double a);
  double __svml_acos2_ha_ex(double a);
  double __svml_acos1_ha_l9(double a);
  double __svml_acos2_ha_l9(double a);
  double __svml_acos4_ha_l9(double a);
  double __svml_acos1_ha_e9(double a);
  double __svml_acos2_ha_e9(double a);
  double __svml_acos4_ha_e9(double a);
  double __svml_acos8_ha_z0(double a);
  float __svml_asinf4_ha_ex(float a);
  double __svml_asin1_ha_ex(double a);
  double __svml_asin2_ha_ex(double a);
  double __svml_asin1_ha_l9(double a);
  double __svml_asin2_ha_l9(double a);
  double __svml_asin4_ha_l9(double a);
  double __svml_asin1_ha_e9(double a);
  double __svml_asin2_ha_e9(double a);
  double __svml_asin4_ha_e9(double a);
  double __svml_asin8_ha_z0(double a);
  float __svml_asinf4_ha_l9(float a);
  float __svml_asinf8_ha_l9(float a);
  float __svml_asinf4_ha_e9(float a);
  float __svml_asinf8_ha_e9(float a);
  float __svml_asinf16_ha_z0(float a);
  float __svml_atanf4_ha_ex(float a);
  double __svml_atan1_ha_ex(double a);
  double __svml_atan2_ha_ex(double a);
  double __svml_atan1_ha_l9(double a);
  double __svml_atan2_ha_l9(double a);
  double __svml_atan4_ha_l9(double a);
  double __svml_atan1_ha_e9(double a);
  double __svml_atan2_ha_e9(double a);
  double __svml_atan4_ha_e9(double a);
  double __svml_atan8_ha_z0(double a);
  float __svml_atanf4_ha_l9(float a);
  float __svml_atanf8_ha_l9(float a);
  float __svml_atanf4_ha_e9(float a);
  float __svml_atanf8_ha_e9(float a);
  float __svml_atanf16_ha_z0(float a);
  float __svml_powf4_ha_l9(float a, float b);
  float __svml_powf8_ha_l9(float a, float b);
  float __svml_powf4_ha_e9(float a, float b);
  float __svml_powf8_ha_e9(float a, float b);
  float __svml_powf16_ha_z0(float a, float b);
  double __svml_pow1_ha_l9(double a, double b);
  double __svml_pow2_ha_l9(double a, double b);
  double __svml_pow4_ha_l9(double a, double b);
  double __svml_pow1_ha_e9(double a, double b);
  double __svml_pow2_ha_e9(double a, double b);
  double __svml_pow4_ha_e9(double a, double b);
  double __svml_pow8_ha_z0(double a, double b);
  float __svml_hypotf4_ha_l9(float a, float b);
  float __svml_hypotf8_ha_l9(float a, float b);
  float __svml_hypotf4_ha_e9(float a, float b);
  float __svml_hypotf8_ha_e9(float a, float b);
  float __svml_hypotf16_ha_z0(float a, float b);
  double __svml_hypot1_ha_l9(double a, double b);
  double __svml_hypot2_ha_l9(double a, double b);
  double __svml_hypot4_ha_l9(double a, double b);
  double __svml_hypot1_ha_e9(double a, double b);
  double __svml_hypot2_ha_e9(double a, double b);
  double __svml_hypot4_ha_e9(double a, double b);
  double __svml_hypot8_ha_z0(double a, double b);
  float __svml_cbrtf4_ha_l9(float a);
  float __svml_cbrtf8_ha_l9(float a);
  float __svml_cbrtf4_ha_e9(float a);
  float __svml_cbrtf8_ha_e9(float a);
  float __svml_cbrtf16_ha_z0(float a);
  double __svml_cbrt1_ha_l9(double a);
  double __svml_cbrt2_ha_l9(double a);
  double __svml_cbrt4_ha_l9(double a);
  double __svml_cbrt1_ha_e9(double a);
  double __svml_cbrt2_ha_e9(double a);
  double __svml_cbrt4_ha_e9(double a);
  double __svml_cbrt8_ha_z0(double a);
  float __svml_atan2f4_ha_l9(float a, float b);
  float __svml_atan2f8_ha_l9(float a, float b);
  float __svml_atan2f4_ha_e9(float a, float b);
  float __svml_atan2f8_ha_e9(float a, float b);
  float __svml_atan2f16_ha_z0(float a, float b);
  double __svml_atan21_ha_l9(double a, double b);
  double __svml_atan22_ha_l9(double a, double b);
  double __svml_atan24_ha_l9(double a, double b);
  double __svml_atan28_ha_z0(double a, double b);
  double __svml_atan21_ha_e9(double a, double b);
  double __svml_atan22_ha_e9(double a, double b);
  double __svml_atan24_ha_e9(double a, double b);
  float __svml_sinf4_ha_ex(float a);
  double __svml_sin1_ha_ex(double a);
  double __svml_sin2_ha_ex(double a);
  float __svml_cosf4_ha_ex(float a);
  double __svml_cos1_ha_ex(double a);
  double __svml_cos2_ha_ex(double a);
  float __svml_tanf4_ha_ex(float a);
  double __svml_tan1_ha_ex(double a);
  double __svml_tan2_ha_ex(double a);
  float __svml_sinhf4_ha_ex(float a);
  double __svml_sinh1_ha_ex(double a);
  double __svml_sinh2_ha_ex(double a);
  float __svml_coshf4_ha_ex(float a);
  double __svml_cosh1_ha_ex(double a);
  double __svml_cosh2_ha_ex(double a);
  float __svml_tanhf4_ha_ex(float a);
  double __svml_tanh1_ha_ex(double a);
  double __svml_tanh2_ha_ex(double a);
  double __svml_log1_ha_ex(double a);
  double __svml_log2_ha_ex(double a);
  double __svml_log1p1_ha_ex(double a);
  double __svml_log1p2_ha_ex(double a);
  double __svml_log101_ha_ex(double a);
  double __svml_log102_ha_ex(double a);
  float __svml_logf4_ha_ex(float a);
  float __svml_log1pf4_ha_ex(float a);
  float __svml_log10f4_ha_ex(float a);
  double __svml_atan21_ha_ex(double a);
  double __svml_atan22_ha_ex(double a);
  float __svml_atan2f4_ha_ex(float a);
  float __svml_hypotf4_ha_ex(float a);
  double __svml_hypot1_ha_ex(double a);
  double __svml_hypot2_ha_ex(double a);
  double __svml_pow1_ha_ex(double a);
  double __svml_pow2_ha_ex(double a);
  float __svml_powf4_ha_ex(float a);
  double __svml_cbrt1_ha_ex(double a);
  double __svml_cbrt2_ha_ex(double a);
  float __svml_cbrtf4_ha_ex(float a);
}
#endif
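
// Note on the naming convention above (informal): the digit after the math
// function name appears to be the vector lane count (e.g. __svml_expf8_* takes
// 8 floats), "ha" seems to denote the high-accuracy variants, and the trailing
// suffix appears to select the ISA level of the implementation
// (_ex ~ SSE, _e9 ~ AVX, _l9 ~ AVX2, _z0 ~ AVX-512).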

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->
#define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
#define a__ ((Assembler*)_masm)->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    // This can destroy rscratch1 if counter is far from the code cache
    __ incrementl(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Linux Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    16(rbp):   parameter size (in words)              int
  //    24(rbp):   thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -12 [ argument word 1      ]
  // -11 [ saved r15            ] <--- rsp_after_call
  // -10 [ saved r14            ]
  //  -9 [ saved r13            ]
  //  -8 [ saved r12            ]
  //  -7 [ saved rbx            ]
  //  -6 [ call wrapper         ]
  //  -5 [ result               ]
  //  -4 [ result type          ]
  //  -3 [ method               ]
  //  -2 [ entry point          ]
  //  -1 [ parameters           ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ parameter size       ]
  //   3 [ thread               ]
  //
  // Windows Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    48(rbp):   (interpreter) entry point              address
  //    56(rbp):   parameters                             intptr_t*
  //    64(rbp):   parameter size (in words)              int
  //    72(rbp):   thread                                 Thread*
  //
  //     [ return_from_Java     ] <--- rsp
  //     [ argument word n      ]
  //      ...
  // -60 [ argument word 1      ]
  // -59 [ saved xmm31          ] <--- rsp after_call
  //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
  // -27 [ saved xmm15          ]
  //     [ saved xmm7-xmm14     ]
  //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
  //  -7 [ saved r15            ]
  //  -6 [ saved r14            ]
  //  -5 [ saved r13            ]
  //  -4 [ saved r12            ]
  //  -3 [ saved rdi            ]
  //  -2 [ saved rsi            ]
  //  -1 [ saved rbx            ]
  //   0 [ saved rbp            ] <--- rbp
  //   1 [ return address       ]
  //   2 [ call wrapper         ]
  //   3 [ result               ]
  //   4 [ result type          ]
  //   5 [ method               ]
  //   6 [ entry point          ]
  //   7 [ parameters           ]
  //   8 [ parameter size       ]
  //   9 [ thread               ]
  //
  //    Windows reserves the callers stack space for arguments 1-4.
  //    We spill c_rarg0-c_rarg3 to this space.

  // Call stub stack layout word offsets from rbp
  enum call_stub_layout {
#ifdef _WIN64
    xmm_save_first     = 6,  // save from xmm6
    xmm_save_last      = 31, // to xmm31
    xmm_save_base      = -9,
    rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
    r15_off            = -7,
    r14_off            = -6,
    r13_off            = -5,
    r12_off            = -4,
    rdi_off            = -3,
    rsi_off            = -2,
    rbx_off            = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    call_wrapper_off   =  2,
    result_off         =  3,
    result_type_off    =  4,
    method_off         =  5,
    entry_point_off    =  6,
    parameters_off     =  7,
    parameter_size_off =  8,
    thread_off         =  9
#else
    rsp_after_call_off = -12,
    mxcsr_off          = rsp_after_call_off,
    r15_off            = -11,
    r14_off            = -10,
    r13_off            = -9,
    r12_off            = -8,
    rbx_off            = -7,
    call_wrapper_off   = -6,
    result_off         = -5,
    result_type_off    = -4,
    method_off         = -3,
    entry_point_off    = -2,
    parameters_off     = -1,
    rbp_off            =  0,
    retaddr_off        =  1,
    parameter_size_off =  2,
    thread_off         =  3
#endif
  };

#ifdef _WIN64
  Address xmm_save(int reg) {
    assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
    return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
  }
#endif
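
  // Worked example for xmm_save() (derived from the constants above): each
  // XMM register takes two 8-byte slots, so
  //   xmm_save(6)  == Address(rbp, (-9 -  0) * wordSize) ==  -72(rbp)
  //   xmm_save(7)  == Address(rbp, (-9 -  2) * wordSize) ==  -88(rbp)
  //   ...
  //   xmm_save(31) == Address(rbp, (-9 - 50) * wordSize) == -472(rbp)
  // matching the "-59 [ saved xmm31 ]" slot in the layout comment above.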

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // same as in generate_catch_exception()!
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);

    const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
    const Address result        (rbp, result_off         * wordSize);
    const Address result_type   (rbp, result_type_off    * wordSize);
    const Address method        (rbp, method_off         * wordSize);
    const Address entry_point   (rbp, entry_point_off    * wordSize);
    const Address parameters    (rbp, parameters_off     * wordSize);
    const Address parameter_size(rbp, parameter_size_off * wordSize);

    // same as in generate_catch_exception()!
    const Address thread        (rbp, thread_off         * wordSize);

    const Address r15_save(rbp, r15_off * wordSize);
    const Address r14_save(rbp, r14_off * wordSize);
    const Address r13_save(rbp, r13_off * wordSize);
    const Address r12_save(rbp, r12_off * wordSize);
    const Address rbx_save(rbp, rbx_off * wordSize);

    // stub code
    __ enter();
    __ subptr(rsp, -rsp_after_call_off * wordSize);

    // save register parameters
#ifndef _WIN64
    __ movptr(parameters,   c_rarg5); // parameters
    __ movptr(entry_point,  c_rarg4); // entry_point
#endif

    __ movptr(method,       c_rarg3); // method
    __ movl(result_type,    c_rarg2); // result type
    __ movptr(result,       c_rarg1); // result
    __ movptr(call_wrapper, c_rarg0); // call wrapper

    // save regs belonging to calling function
    __ movptr(rbx_save, rbx);
    __ movptr(r12_save, r12);
    __ movptr(r13_save, r13);
    __ movptr(r14_save, r14);
    __ movptr(r15_save, r15);

#ifdef _WIN64
    int last_reg = 15;
    if (UseAVX > 2) {
      last_reg = 31;
    }
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(xmm_save(i), as_XMMRegister(i));
      }
    }

    const Address rdi_save(rbp, rdi_off * wordSize);
    const Address rsi_save(rbp, rsi_off * wordSize);

    __ movptr(rsi_save, rsi);
    __ movptr(rdi_save, rdi);
#else
    const Address mxcsr_save(rbp, mxcsr_off * wordSize);
    {
      Label skip_ldmx;
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, skip_ldmx);
      __ ldmxcsr(mxcsr_std);
      __ bind(skip_ldmx);
    }
#endif

    // Load up thread register
    __ movptr(r15_thread, thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
      __ jcc(Assembler::equal, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    __ movl(c_rarg3, parameter_size);
    __ testl(c_rarg3, c_rarg3);
    __ jcc(Assembler::zero, parameters_done);

    Label loop;
    __ movptr(c_rarg2, parameters);      // parameter pointer
    __ movl(c_rarg1, c_rarg3);           // parameter counter is in c_rarg1
    __ BIND(loop);
    __ movptr(rax, Address(c_rarg2, 0)); // get parameter
    __ addptr(c_rarg2, wordSize);        // advance to next parameter
    __ decrementl(c_rarg1);              // decrement counter
    __ push(rax);                        // pass parameter
    __ jcc(Assembler::notZero, loop);
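
    // The loop above is, roughly:
    //   for (int i = 0; i < parameter_size; i++) push(parameters[i]);
    // so the last Java argument ends up on top of the stack.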

    // call Java function
    __ BIND(parameters_done);
    __ movptr(rbx, method);          // get Method*
    __ movptr(c_rarg1, entry_point); // get entry_point
    __ mov(r13, rsp);                // set sender sp
    BLOCK_COMMENT("call Java function");
    __ call(c_rarg1);

    BLOCK_COMMENT("call_stub_return_address:");
    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    __ movptr(c_rarg0, result);
    Label is_long, is_float, is_double, exit;
    __ movl(c_rarg1, result_type);
    __ cmpl(c_rarg1, T_OBJECT);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_LONG);
    __ jcc(Assembler::equal, is_long);
    __ cmpl(c_rarg1, T_FLOAT);
    __ jcc(Assembler::equal, is_float);
    __ cmpl(c_rarg1, T_DOUBLE);
    __ jcc(Assembler::equal, is_double);

    // handle T_INT case
    __ movl(Address(c_rarg0, 0), rax);

    __ BIND(exit);

    // pop parameters
    __ lea(rsp, rsp_after_call);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::call_stub: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::call_stub: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ bind(L3);
    }
#endif

    // restore regs belonging to calling function
#ifdef _WIN64
    // emit the restores for xmm regs
    if (VM_Version::supports_evex()) {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
      }
    } else {
      for (int i = xmm_save_first; i <= last_reg; i++) {
        __ movdqu(as_XMMRegister(i), xmm_save(i));
      }
    }
#endif
    __ movptr(r15, r15_save);
    __ movptr(r14, r14_save);
    __ movptr(r13, r13_save);
    __ movptr(r12, r12_save);
    __ movptr(rbx, rbx_save);

#ifdef _WIN64
    __ movptr(rdi, rdi_save);
    __ movptr(rsi, rsi_save);
#else
    __ ldmxcsr(mxcsr_save);
#endif

    // restore rsp
    __ addptr(rsp, -rsp_after_call_off * wordSize);

    // return
    __ vzeroupper();
    __ pop(rbp);
    __ ret(0);

    // handle return types different from T_INT
    __ BIND(is_long);
    __ movq(Address(c_rarg0, 0), rax);
    __ jmp(exit);

    __ BIND(is_float);
    __ movflt(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    __ BIND(is_double);
    __ movdbl(Address(c_rarg0, 0), xmm0);
    __ jmp(exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // rax: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
    const Address thread        (rbp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L1, L2, L3;
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L1);
      __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
      __ bind(L1);
      __ get_thread(rbx);
      __ cmpptr(r15_thread, thread);
      __ jcc(Assembler::equal, L2);
      __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
      __ bind(L2);
      __ cmpptr(r15_thread, rbx);
      __ jcc(Assembler::equal, L3);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L3);
    }
#endif

    // set pending exception
    __ verify_oop(rax);

    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
    __ lea(rscratch1, ExternalAddress((address)__FILE__));
    __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
    __ movl(Address(r15_thread, Thread::exception_line_offset()), (int) __LINE__);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // rax: exception
  // rdx: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be on stack !!

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, the sp points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t) NULL);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into rbx
    __ movptr(c_rarg0, Address(rsp, 0));
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    r15_thread, c_rarg0);
    __ mov(rbx, rax);

    // setup rax & rdx, remove return address & clear pending exception
    __ pop(rdx);
    __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
    __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ testptr(rax, rax);
      __ jcc(Assembler::notEqual, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler (return address removed)
    // rax: exception
    // rbx: exception handler
    // rdx: throwing pc
    __ verify_oop(rax);
    __ jmp(rbx);

    return start;
  }

  // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    __ movl(rax, c_rarg0);              // Copy to eax we need a return value anyhow
    __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for intptr_t atomic::xchg_long(jlong exchange_value, volatile jlong* dest)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //
  // Result:
  //    *dest <- ex, return (orig *dest)
  address generate_atomic_xchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg_long");
    address start = __ pc();

    __ movptr(rax, c_rarg0);              // Copy to eax we need a return value anyhow
    __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
    __ ret(0);

    return start;
  }

  // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
  //                                         jint compare_value)
  //
  // Arguments :
  //    c_rarg0: exchange_value
  //    c_rarg1: dest
  //    c_rarg2: compare_value
  //
  // Result:
  //    if ( compare_value == *dest ) {
  //       *dest = exchange_value
  //       return compare_value;
  //    else
  //       return *dest;
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    __ movl(rax, c_rarg2);
    __ lock();
    __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
    __ ret(0);

    return start;
  }
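
  // Note: the single LOCK CMPXCHG above already implements the documented
  // contract.  CMPXCHG compares rax (pre-loaded with compare_value) with
  // *dest; on success it stores exchange_value and leaves rax == compare_value,
  // on failure it loads the current *dest into rax.  Either way rax holds the
  // correct return value, so no extra compare or branch is needed.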
mark(this, "StubRoutines", "atomic_cmpxchg_byte"); 911 address start = __ pc(); 912 913 __ movsbq(rax, c_rarg2); 914 __ lock(); 915 __ cmpxchgb(c_rarg0, Address(c_rarg1, 0)); 916 __ ret(0); 917 918 return start; 919 } 920 921 // Support for int64_t atomic::atomic_cmpxchg(int64_t exchange_value, 922 // volatile int64_t* dest, 923 // int64_t compare_value) 924 // Arguments : 925 // c_rarg0: exchange_value 926 // c_rarg1: dest 927 // c_rarg2: compare_value 928 // 929 // Result: 930 // if ( compare_value == *dest ) { 931 // *dest = exchange_value 932 // return compare_value; 933 // else 934 // return *dest; 935 address generate_atomic_cmpxchg_long() { 936 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long"); 937 address start = __ pc(); 938 939 __ movq(rax, c_rarg2); 940 __ lock(); 941 __ cmpxchgq(c_rarg0, Address(c_rarg1, 0)); 942 __ ret(0); 943 944 return start; 945 } 946 947 // Support for jint atomic::add(jint add_value, volatile jint* dest) 948 // 949 // Arguments : 950 // c_rarg0: add_value 951 // c_rarg1: dest 952 // 953 // Result: 954 // *dest += add_value 955 // return *dest; 956 address generate_atomic_add() { 957 StubCodeMark mark(this, "StubRoutines", "atomic_add"); 958 address start = __ pc(); 959 960 __ movl(rax, c_rarg0); 961 __ lock(); 962 __ xaddl(Address(c_rarg1, 0), c_rarg0); 963 __ addl(rax, c_rarg0); 964 __ ret(0); 965 966 return start; 967 } 968 969 // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest) 970 // 971 // Arguments : 972 // c_rarg0: add_value 973 // c_rarg1: dest 974 // 975 // Result: 976 // *dest += add_value 977 // return *dest; 978 address generate_atomic_add_long() { 979 StubCodeMark mark(this, "StubRoutines", "atomic_add_long"); 980 address start = __ pc(); 981 982 __ movptr(rax, c_rarg0); // Copy to eax we need a return value anyhow 983 __ lock(); 984 __ xaddptr(Address(c_rarg1, 0), c_rarg0); 985 __ addptr(rax, c_rarg0); 986 __ ret(0); 987 988 return start; 989 } 990 991 // Support for intptr_t OrderAccess::fence() 992 // 993 // Arguments : 994 // 995 // Result: 996 address generate_orderaccess_fence() { 997 StubCodeMark mark(this, "StubRoutines", "orderaccess_fence"); 998 address start = __ pc(); 999 __ membar(Assembler::StoreLoad); 1000 __ ret(0); 1001 1002 return start; 1003 } 1004 1005 // Support for intptr_t get_previous_fp() 1006 // 1007 // This routine is used to find the previous frame pointer for the 1008 // caller (current_frame_guess). This is used as part of debugging 1009 // ps() is seemingly lost trying to find frames. 1010 // This code assumes that caller current_frame_guess) has a frame. 1011 address generate_get_previous_fp() { 1012 StubCodeMark mark(this, "StubRoutines", "get_previous_fp"); 1013 const Address old_fp(rbp, 0); 1014 const Address older_fp(rax, 0); 1015 address start = __ pc(); 1016 1017 __ enter(); 1018 __ movptr(rax, old_fp); // callers fp 1019 __ movptr(rax, older_fp); // the frame for ps() 1020 __ pop(rbp); 1021 __ ret(0); 1022 1023 return start; 1024 } 1025 1026 // Support for intptr_t get_previous_sp() 1027 // 1028 // This routine is used to find the previous stack pointer for the 1029 // caller. 1030 address generate_get_previous_sp() { 1031 StubCodeMark mark(this, "StubRoutines", "get_previous_sp"); 1032 address start = __ pc(); 1033 1034 __ movptr(rax, rsp); 1035 __ addptr(rax, 8); // return address is at the top of the stack. 

  // Support for intptr_t OrderAccess::fence()
  //
  // Arguments :
  //
  // Result:
  address generate_orderaccess_fence() {
    StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
    address start = __ pc();
    __ membar(Assembler::StoreLoad);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_fp()
  //
  // This routine is used to find the previous frame pointer for the
  // caller (current_frame_guess). This is used as part of debugging
  // when ps() is seemingly lost trying to find frames.
  // This code assumes that the caller (current_frame_guess) has a frame.
  address generate_get_previous_fp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
    const Address old_fp(rbp, 0);
    const Address older_fp(rax, 0);
    address start = __ pc();

    __ enter();
    __ movptr(rax, old_fp);   // callers fp
    __ movptr(rax, older_fp); // the frame for ps()
    __ pop(rbp);
    __ ret(0);

    return start;
  }

  // Support for intptr_t get_previous_sp()
  //
  // This routine is used to find the previous stack pointer for the
  // caller.
  address generate_get_previous_sp() {
    StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
    address start = __ pc();

    __ movptr(rax, rsp);
    __ addptr(rax, 8); // return address is at the top of the stack.
    __ ret(0);

    return start;
  }

  //----------------------------------------------------------------------------------------------------
  // Support for void verify_mxcsr()
  //
  // This routine is used with -Xcheck:jni to verify that native
  // JNI code does not return to Java code without restoring the
  // MXCSR register to our expected state.

  address generate_verify_mxcsr() {
    StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
    address start = __ pc();

    const Address mxcsr_save(rsp, 0);

    if (CheckJNICalls) {
      Label ok_ret;
      ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
      __ push(rax);
      __ subptr(rsp, wordSize); // allocate a temp location
      __ stmxcsr(mxcsr_save);
      __ movl(rax, mxcsr_save);
      __ andl(rax, MXCSR_MASK);  // Only check control and mask bits
      __ cmp32(rax, mxcsr_std);
      __ jcc(Assembler::equal, ok_ret);

      __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");

      __ ldmxcsr(mxcsr_std);

      __ bind(ok_ret);
      __ addptr(rsp, wordSize);
      __ pop(rax);
    }

    __ ret(0);

    return start;
  }

  address generate_f2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
    __ movl(c_rarg3, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmovl(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_f2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
    Address inout(rsp, 5 * wordSize); // return address + 4 saves
    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);

    __ movl(rax, 0x7f800000);
    __ xorl(c_rarg3, c_rarg3);
    __ movl(c_rarg2, inout);
    __ movl(c_rarg1, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ cmpl(rax, c_rarg1); // NaN? -> 0
    __ jcc(Assembler::negative, L);
    __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
    __ mov64(c_rarg3, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmov(Assembler::positive, c_rarg3, rax);

    __ bind(L);
    __ movptr(inout, c_rarg3);

    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2i_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testptr(c_rarg0, c_rarg0);   // signed ? min_jint : max_jint
    __ movl(c_rarg2, 0x80000000);
    __ movl(rax, 0x7fffffff);
    __ cmov(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movptr(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }

  address generate_d2l_fixup() {
    StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
    Address inout(rsp, 6 * wordSize); // return address + 5 saves

    address start = __ pc();

    Label L;

    __ push(rax);
    __ push(c_rarg3);
    __ push(c_rarg2);
    __ push(c_rarg1);
    __ push(c_rarg0);

    __ movl(rax, 0x7ff00000);
    __ movq(c_rarg2, inout);
    __ movl(c_rarg3, c_rarg2);
    __ mov(c_rarg1, c_rarg2);
    __ mov(c_rarg0, c_rarg2);
    __ negl(c_rarg3);
    __ shrptr(c_rarg1, 0x20);
    __ orl(c_rarg3, c_rarg2);
    __ andl(c_rarg1, 0x7fffffff);
    __ xorl(c_rarg2, c_rarg2);
    __ shrl(c_rarg3, 0x1f);
    __ orl(c_rarg1, c_rarg3);
    __ cmpl(rax, c_rarg1);
    __ jcc(Assembler::negative, L); // NaN -> 0
    __ testq(c_rarg0, c_rarg0);     // signed ? min_jlong : max_jlong
    __ mov64(c_rarg2, 0x8000000000000000);
    __ mov64(rax, 0x7fffffffffffffff);
    __ cmovq(Assembler::positive, c_rarg2, rax);

    __ bind(L);
    __ movq(inout, c_rarg2);

    __ pop(c_rarg0);
    __ pop(c_rarg1);
    __ pop(c_rarg2);
    __ pop(c_rarg3);
    __ pop(rax);

    __ ret(0);

    return start;
  }
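
  // The four *_fixup stubs above appear to be slow-path helpers for the Java
  // float/double -> int/long narrowing conversions: they are handed the raw
  // conversion input in 'inout' and overwrite it so that
  //   NaN            -> 0
  //   negative input -> min_jint / min_jlong
  //   positive input -> max_jint / max_jlong
  // which is exactly the selection performed by the test/cmov sequences above.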

  address generate_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64( mask, relocInfo::none );
    __ emit_data64( mask, relocInfo::none );

    return start;
  }

  address generate_vector_fp_mask(const char *stub_name, int64_t mask) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);
    __ emit_data64(mask, relocInfo::none);

    return start;
  }

  address generate_vector_byte_perm_mask(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    __ emit_data64(0x0000000000000001, relocInfo::none);
    __ emit_data64(0x0000000000000003, relocInfo::none);
    __ emit_data64(0x0000000000000005, relocInfo::none);
    __ emit_data64(0x0000000000000007, relocInfo::none);
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000002, relocInfo::none);
    __ emit_data64(0x0000000000000004, relocInfo::none);
    __ emit_data64(0x0000000000000006, relocInfo::none);

    return start;
  }

  address generate_vector_custom_i32(const char *stub_name, Assembler::AvxVectorLen len,
                                     int32_t val0, int32_t val1, int32_t val2, int32_t val3,
                                     int32_t val4 = 0, int32_t val5 = 0, int32_t val6 = 0, int32_t val7 = 0,
                                     int32_t val8 = 0, int32_t val9 = 0, int32_t val10 = 0, int32_t val11 = 0,
                                     int32_t val12 = 0, int32_t val13 = 0, int32_t val14 = 0, int32_t val15 = 0) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();

    assert(len != Assembler::AVX_NoVec, "vector len must be specified");
    __ emit_data(val0, relocInfo::none, 0);
    __ emit_data(val1, relocInfo::none, 0);
    __ emit_data(val2, relocInfo::none, 0);
    __ emit_data(val3, relocInfo::none, 0);
    if (len >= Assembler::AVX_256bit) {
      __ emit_data(val4, relocInfo::none, 0);
      __ emit_data(val5, relocInfo::none, 0);
      __ emit_data(val6, relocInfo::none, 0);
      __ emit_data(val7, relocInfo::none, 0);
      if (len >= Assembler::AVX_512bit) {
        __ emit_data(val8, relocInfo::none, 0);
        __ emit_data(val9, relocInfo::none, 0);
        __ emit_data(val10, relocInfo::none, 0);
        __ emit_data(val11, relocInfo::none, 0);
        __ emit_data(val12, relocInfo::none, 0);
        __ emit_data(val13, relocInfo::none, 0);
        __ emit_data(val14, relocInfo::none, 0);
        __ emit_data(val15, relocInfo::none, 0);
      }
    }

    return start;
  }
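
  // Illustrative (hypothetical) use of the helper above: emitting a 256-bit
  // constant holding the ints 0..7, e.g. as a shuffle/iota table:
  //   generate_vector_custom_i32("iota_indices", Assembler::AVX_256bit,
  //                              0, 1, 2, 3, 4, 5, 6, 7);
  // Only val0..val3 are emitted for AVX_128bit, val0..val7 for AVX_256bit,
  // and all sixteen values for AVX_512bit.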

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    all args on stack!
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved r12 (several TemplateTable methods use it)
  //    [tos + 3]: saved flags
  //    [tos + 4]: return address
  //  * [tos + 5]: error message (char*)
  //  * [tos + 6]: object to verify (oop)
  //  * [tos + 7]: saved rax - saved by caller and bashed
  //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
  //  * = popped on exit
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    __ pushf();
    __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));

    __ push(r12);

    // save c_rarg2 and c_rarg3
    __ push(c_rarg2);
    __ push(c_rarg3);

    enum {
      // After previous pushes.
      oop_to_verify = 6 * wordSize,
      saved_rax     = 7 * wordSize,
      saved_r10     = 8 * wordSize,

      // Before the call to MacroAssembler::debug(), see below.
      return_addr   = 16 * wordSize,
      error_msg     = 17 * wordSize
    };

    // get object
    __ movptr(rax, Address(rsp, oop_to_verify));

    // make sure object is 'reasonable'
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if metadata bits indicate a bad oop
      __ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
      __ jcc(Assembler::notZero, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ movptr(c_rarg2, rax);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andptr(c_rarg2, c_rarg3);
    __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
    __ cmpptr(c_rarg2, c_rarg3);
    __ jcc(Assembler::notZero, error);

    // set r12 to heapbase for load_klass()
    __ reinit_heapbase();

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(rax, rax); // get klass
    __ testptr(rax, rax);
    __ jcc(Assembler::zero, error); // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // restore c_rarg3
    __ pop(c_rarg2);                               // restore c_rarg2
    __ pop(r12);                                   // restore r12
    __ popf();                                     // restore flags
    __ ret(4 * wordSize);                          // pop caller saved stuff

    // handle errors
    __ bind(error);
    __ movptr(rax, Address(rsp, saved_rax));       // get saved rax back
    __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
    __ pop(c_rarg3);                               // get saved c_rarg3 back
    __ pop(c_rarg2);                               // get saved c_rarg2 back
    __ pop(r12);                                   // get saved r12 back
    __ popf();                                     // get saved flags off stack --
                                                   // will be ignored

    __ pusha();                                    // push registers
                                                   // (rip is already pushed)
    // debug(char* msg, int64_t pc, int64_t regs[])
    // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
    // pushed all the registers, so now the stack looks like:
    //     [tos +  0] 16 saved registers
    //     [tos + 16] return address
    //   * [tos + 17] error message (char*)
    //   * [tos + 18] object to verify (oop)
    //   * [tos + 19] saved rax - saved by caller and bashed
    //   * [tos + 20] saved r10 (rscratch1) - saved by caller
    //   * = popped on exit

    __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
    __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
    __ movq(c_rarg2, rsp);                          // pass address of regs on stack
    __ mov(r12, rsp);                               // remember rsp
    __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
    __ andptr(rsp, -16);                            // align stack as required by ABI
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
    __ mov(rsp, r12);                               // restore rsp
    __ popa();                                      // pop registers (includes r12)
    __ ret(4 * wordSize);                           // pop caller saved stuff

    return start;
  }
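
  // The "right area of memory" test in the stub above is essentially
  //   if ((obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits()) goto error;
  // and the subsequent klass check only requires load_klass(obj) to be non-NULL.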

  //
  // Verify that a register contains clean 32-bits positive value
  // (high 32-bits are 0) so it could be used in 64-bits shifts.
  //
  //  Input:
  //    Rint  -  32-bits value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#ifdef ASSERT
    Label L;
    assert_different_registers(Rtmp, Rint);
    __ movslq(Rtmp, Rint);
    __ cmpq(Rtmp, Rint);
    __ jcc(Assembler::equal, L);
    __ stop("high 32-bits of int value are not 0");
    __ bind(L);
#endif
  }

  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //     c_rarg0 - from
  //     c_rarg1 - to
  //     c_rarg2 - element count
  //
  //  Output:
  //     rax   - &from[element count - 1]
  //
  void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, sf);
  }
  void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
    array_overlap_test(NULL, &L_no_overlap, sf);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
    const Register from     = c_rarg0;
    const Register to       = c_rarg1;
    const Register count    = c_rarg2;
    const Register end_from = rax;

    __ cmpptr(to, from);
    __ lea(end_from, Address(from, count, sf, 0));
    if (NOLp == NULL) {
      ExternalAddress no_overlap(no_overlap_target);
      __ jump_cc(Assembler::belowEqual, no_overlap);
      __ cmpptr(to, end_from);
      __ jump_cc(Assembler::aboveEqual, no_overlap);
    } else {
      __ jcc(Assembler::belowEqual, (*NOLp));
      __ cmpptr(to, end_from);
      __ jcc(Assembler::aboveEqual, (*NOLp));
    }
  }

  // Shuffle first three arg regs on Windows into Linux/Solaris locations.
  //
  // Outputs:
  //    rdi - rcx
  //    rsi - rdx
  //    rdx - r8
  //    rcx - r9
  //
  // Registers r9 and r10 are used to save rdi and rsi on Windows, where the
  // latter are non-volatile.  r9 and r10 should not be used by the caller.
  //
  DEBUG_ONLY(bool regs_in_thread;)

  void setup_arg_regs(int nargs = 3) {
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
    assert(nargs == 3 || nargs == 4, "else fix");
#ifdef _WIN64
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    if (nargs >= 4)
      __ mov(rax, r9);  // r9 is also saved_rdi
    __ movptr(saved_rdi, rdi);
    __ movptr(saved_rsi, rsi);
    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
    if (nargs >= 4)
      __ mov(rcx, rax); // c_rarg3 (via rax)
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = false;)
  }

  void restore_arg_regs() {
    assert(!regs_in_thread, "wrong call to restore_arg_regs");
    const Register saved_rdi = r9;
    const Register saved_rsi = r10;
#ifdef _WIN64
    __ movptr(rdi, saved_rdi);
    __ movptr(rsi, saved_rsi);
#endif
  }
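
  // array_overlap_test() above branches to the no-overlap target when a
  // forward (low-to-high) copy is safe, i.e. roughly:
  //   if (to <= from || to >= from + count * element_size) goto no_overlap;
  // otherwise it falls through to the conjoint (backward) copy code.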

  // This is used in places where r10 is a scratch register, and can
  // be adapted if r9 is needed also.
  void setup_arg_regs_using_thread() {
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ mov(saved_r15, r15);  // r15 is callee saved and needs to be restored
    __ get_thread(r15_thread);
    assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
           "unexpected argument registers");
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
    __ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);

    __ mov(rdi, rcx); // c_rarg0
    __ mov(rsi, rdx); // c_rarg1
    __ mov(rdx, r8);  // c_rarg2
#else
    assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
           "unexpected argument registers");
#endif
    DEBUG_ONLY(regs_in_thread = true;)
  }

  void restore_arg_regs_using_thread() {
    assert(regs_in_thread, "wrong call to restore_arg_regs");
    const Register saved_r15 = r9;
#ifdef _WIN64
    __ get_thread(r15_thread);
    __ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
    __ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
    __ mov(r15, saved_r15);  // r15 is callee saved and needs to be restored
#endif
  }

  // Copy big chunks forward
  //
  // Inputs:
  //   end_from     - source arrays end address
  //   end_to       - destination array end address
  //   qword_count  - 64-bits element count, negative
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes  - exit label
  //
  void copy_bytes_forward(Register end_from, Register end_to,
                          Register qword_count, Register to,
                          Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      __ BIND(L_loop);
      if (UseAVX > 2) {
        __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
        __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
      } else if (UseAVX == 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
        __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
        __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
      }
      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 8);
      __ jcc(Assembler::lessEqual, L_loop);
      __ subptr(qword_count, 4);  // sub(8) and add(4)
      __ jccb(Assembler::greater, L_end);
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
      } else {
        __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
        __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
        __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
        __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
      }
      __ addptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32-bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
      __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
      __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
      __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
      __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
      __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);

      __ BIND(L_copy_bytes);
      __ addptr(qword_count, 4);
      __ jcc(Assembler::lessEqual, L_loop);
    }
    __ subptr(qword_count, 4);
    __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
  }

  // Copy big chunks backward
  //
  // Inputs:
  //   from         - source arrays address
  //   dest         - destination array address
  //   qword_count  - 64-bits element count
  //   to           - scratch
  //   L_copy_bytes - entry label
  //   L_copy_8_bytes  - exit label
  //
  void copy_bytes_backward(Register from, Register dest,
                           Register qword_count, Register to,
                           Label& L_copy_bytes, Label& L_copy_8_bytes) {
    DEBUG_ONLY(__ stop("enter at entry label, not here"));
    Label L_loop;
    __ align(OptoLoopAlignment);
    if (UseUnalignedLoadStores) {
      Label L_end;
      // Copy 64-bytes per iteration
      __ BIND(L_loop);
      if (UseAVX > 2) {
        __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
        __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
      } else if (UseAVX == 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
        __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
        __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
        __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
        __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
        __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
        __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
      }
      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 8);
      __ jcc(Assembler::greaterEqual, L_loop);

      __ addptr(qword_count, 4);  // add(8) and sub(4)
      __ jccb(Assembler::less, L_end);
      // Copy trailing 32 bytes
      if (UseAVX >= 2) {
        __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
        __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
      } else {
        __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
        __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
        __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
        __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
      }
      __ subptr(qword_count, 4);
      __ BIND(L_end);
      if (UseAVX >= 2) {
        // clean upper bits of YMM registers
        __ vpxor(xmm0, xmm0);
        __ vpxor(xmm1, xmm1);
      }
    } else {
      // Copy 32-bytes per iteration
      __ BIND(L_loop);
      __ movq(to, Address(from, qword_count, Address::times_8, 24));
      __ movq(Address(dest, qword_count, Address::times_8, 24), to);
      __ movq(to, Address(from, qword_count, Address::times_8, 16));
      __ movq(Address(dest, qword_count, Address::times_8, 16), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  8));
      __ movq(Address(dest, qword_count, Address::times_8,  8), to);
      __ movq(to, Address(from, qword_count, Address::times_8,  0));
      __ movq(Address(dest, qword_count, Address::times_8,  0), to);

      __ BIND(L_copy_bytes);
      __ subptr(qword_count, 4);
      __ jcc(Assembler::greaterEqual, L_loop);
    }
    __ addptr(qword_count, 4);
    __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
  }
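
  // Note on the two helpers above: the forward copy expects 'end_from'/'end_to'
  // to point at the last qword and 'qword_count' to be negative, while the
  // backward copy uses the array base addresses with a positive, decreasing
  // count.  In both cases
  //   Address(base, qword_count, Address::times_8, disp)
  // walks the data while the add/sub of the count also serves as the loop
  // condition (see generate_disjoint_byte_copy() below for the forward setup).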
vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0); 1713 } else { 1714 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16)); 1715 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0); 1716 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); 1717 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); 1718 } 1719 __ subptr(qword_count, 4); 1720 __ BIND(L_end); 1721 if (UseAVX >= 2) { 1722 // clean upper bits of YMM registers 1723 __ vpxor(xmm0, xmm0); 1724 __ vpxor(xmm1, xmm1); 1725 } 1726 } else { 1727 // Copy 32-bytes per iteration 1728 __ BIND(L_loop); 1729 __ movq(to, Address(from, qword_count, Address::times_8, 24)); 1730 __ movq(Address(dest, qword_count, Address::times_8, 24), to); 1731 __ movq(to, Address(from, qword_count, Address::times_8, 16)); 1732 __ movq(Address(dest, qword_count, Address::times_8, 16), to); 1733 __ movq(to, Address(from, qword_count, Address::times_8, 8)); 1734 __ movq(Address(dest, qword_count, Address::times_8, 8), to); 1735 __ movq(to, Address(from, qword_count, Address::times_8, 0)); 1736 __ movq(Address(dest, qword_count, Address::times_8, 0), to); 1737 1738 __ BIND(L_copy_bytes); 1739 __ subptr(qword_count, 4); 1740 __ jcc(Assembler::greaterEqual, L_loop); 1741 } 1742 __ addptr(qword_count, 4); 1743 __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords 1744 } 1745 1746 1747 // Arguments: 1748 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1749 // ignored 1750 // name - stub name string 1751 // 1752 // Inputs: 1753 // c_rarg0 - source array address 1754 // c_rarg1 - destination array address 1755 // c_rarg2 - element count, treated as ssize_t, can be zero 1756 // 1757 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1758 // we let the hardware handle it. The one to eight bytes within words, 1759 // dwords or qwords that span cache line boundaries will still be loaded 1760 // and stored atomically. 1761 // 1762 // Side Effects: 1763 // disjoint_byte_copy_entry is set to the no-overlap entry point 1764 // used by generate_conjoint_byte_copy(). 1765 // 1766 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1767 __ align(CodeEntryAlignment); 1768 StubCodeMark mark(this, "StubRoutines", name); 1769 address start = __ pc(); 1770 1771 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; 1772 Label L_copy_byte, L_exit; 1773 const Register from = rdi; // source array address 1774 const Register to = rsi; // destination array address 1775 const Register count = rdx; // elements count 1776 const Register byte_count = rcx; 1777 const Register qword_count = count; 1778 const Register end_from = from; // source array end address 1779 const Register end_to = to; // destination array end address 1780 // End pointers are inclusive, and if count is not zero they point 1781 // to the last unit copied: end_to[0] := end_from[0] 1782 1783 __ enter(); // required for proper stackwalking of RuntimeStub frame 1784 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
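    // Copy strategy (illustrative C sketch of the control structure, not the
    // generated code): the bulk moves as qwords indexed by a negative count
    // off the inclusive end pointers, then the 0..7 leftover bytes are peeled
    // off using the low bits of the original byte count:
    //
    //   qword_count = byte_count >> 3;
    //   for (i = -qword_count; i != 0; i++)          // L_copy_bytes / L_copy_8_bytes
    //     *(jlong*)(end_to + 8*i + 8) = *(jlong*)(end_from + 8*i + 8);
    //   if (byte_count & 4) copy trailing dword;     // L_copy_4_bytes
    //   if (byte_count & 2) copy trailing word;      // L_copy_2_bytes
    //   if (byte_count & 1) copy trailing byte;      // L_copy_byte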
1785 1786 if (entry != NULL) { 1787 *entry = __ pc(); 1788 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1789 BLOCK_COMMENT("Entry:"); 1790 } 1791 1792 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1793 // r9 and r10 may be used to save non-volatile registers 1794 1795 // 'from', 'to' and 'count' are now valid 1796 __ movptr(byte_count, count); 1797 __ shrptr(count, 3); // count => qword_count 1798 1799 // Copy from low to high addresses. Use 'to' as scratch. 1800 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 1801 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 1802 __ negptr(qword_count); // make the count negative 1803 __ jmp(L_copy_bytes); 1804 1805 // Copy trailing qwords 1806 __ BIND(L_copy_8_bytes); 1807 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 1808 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 1809 __ increment(qword_count); 1810 __ jcc(Assembler::notZero, L_copy_8_bytes); 1811 1812 // Check for and copy trailing dword 1813 __ BIND(L_copy_4_bytes); 1814 __ testl(byte_count, 4); 1815 __ jccb(Assembler::zero, L_copy_2_bytes); 1816 __ movl(rax, Address(end_from, 8)); 1817 __ movl(Address(end_to, 8), rax); 1818 1819 __ addptr(end_from, 4); 1820 __ addptr(end_to, 4); 1821 1822 // Check for and copy trailing word 1823 __ BIND(L_copy_2_bytes); 1824 __ testl(byte_count, 2); 1825 __ jccb(Assembler::zero, L_copy_byte); 1826 __ movw(rax, Address(end_from, 8)); 1827 __ movw(Address(end_to, 8), rax); 1828 1829 __ addptr(end_from, 2); 1830 __ addptr(end_to, 2); 1831 1832 // Check for and copy trailing byte 1833 __ BIND(L_copy_byte); 1834 __ testl(byte_count, 1); 1835 __ jccb(Assembler::zero, L_exit); 1836 __ movb(rax, Address(end_from, 8)); 1837 __ movb(Address(end_to, 8), rax); 1838 1839 __ BIND(L_exit); 1840 restore_arg_regs(); 1841 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free 1842 __ xorptr(rax, rax); // return 0 1843 __ vzeroupper(); 1844 __ leave(); // required for proper stackwalking of RuntimeStub frame 1845 __ ret(0); 1846 1847 // Copy in multi-bytes chunks 1848 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 1849 __ jmp(L_copy_4_bytes); 1850 1851 return start; 1852 } 1853 1854 // Arguments: 1855 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1856 // ignored 1857 // name - stub name string 1858 // 1859 // Inputs: 1860 // c_rarg0 - source array address 1861 // c_rarg1 - destination array address 1862 // c_rarg2 - element count, treated as ssize_t, can be zero 1863 // 1864 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1865 // we let the hardware handle it. The one to eight bytes within words, 1866 // dwords or qwords that span cache line boundaries will still be loaded 1867 // and stored atomically. 
1868 // 1869 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1870 address* entry, const char *name) { 1871 __ align(CodeEntryAlignment); 1872 StubCodeMark mark(this, "StubRoutines", name); 1873 address start = __ pc(); 1874 1875 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes; 1876 const Register from = rdi; // source array address 1877 const Register to = rsi; // destination array address 1878 const Register count = rdx; // elements count 1879 const Register byte_count = rcx; 1880 const Register qword_count = count; 1881 1882 __ enter(); // required for proper stackwalking of RuntimeStub frame 1883 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1884 1885 if (entry != NULL) { 1886 *entry = __ pc(); 1887 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1888 BLOCK_COMMENT("Entry:"); 1889 } 1890 1891 array_overlap_test(nooverlap_target, Address::times_1); 1892 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1893 // r9 and r10 may be used to save non-volatile registers 1894 1895 // 'from', 'to' and 'count' are now valid 1896 __ movptr(byte_count, count); 1897 __ shrptr(count, 3); // count => qword_count 1898 1899 // Copy from high to low addresses. 1900 1901 // Check for and copy trailing byte 1902 __ testl(byte_count, 1); 1903 __ jcc(Assembler::zero, L_copy_2_bytes); 1904 __ movb(rax, Address(from, byte_count, Address::times_1, -1)); 1905 __ movb(Address(to, byte_count, Address::times_1, -1), rax); 1906 __ decrement(byte_count); // Adjust for possible trailing word 1907 1908 // Check for and copy trailing word 1909 __ BIND(L_copy_2_bytes); 1910 __ testl(byte_count, 2); 1911 __ jcc(Assembler::zero, L_copy_4_bytes); 1912 __ movw(rax, Address(from, byte_count, Address::times_1, -2)); 1913 __ movw(Address(to, byte_count, Address::times_1, -2), rax); 1914 1915 // Check for and copy trailing dword 1916 __ BIND(L_copy_4_bytes); 1917 __ testl(byte_count, 4); 1918 __ jcc(Assembler::zero, L_copy_bytes); 1919 __ movl(rax, Address(from, qword_count, Address::times_8)); 1920 __ movl(Address(to, qword_count, Address::times_8), rax); 1921 __ jmp(L_copy_bytes); 1922 1923 // Copy trailing qwords 1924 __ BIND(L_copy_8_bytes); 1925 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 1926 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 1927 __ decrement(qword_count); 1928 __ jcc(Assembler::notZero, L_copy_8_bytes); 1929 1930 restore_arg_regs(); 1931 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free 1932 __ xorptr(rax, rax); // return 0 1933 __ vzeroupper(); 1934 __ leave(); // required for proper stackwalking of RuntimeStub frame 1935 __ ret(0); 1936 1937 // Copy in multi-bytes chunks 1938 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 1939 1940 restore_arg_regs(); 1941 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free 1942 __ xorptr(rax, rax); // return 0 1943 __ vzeroupper(); 1944 __ leave(); // required for proper stackwalking of RuntimeStub frame 1945 __ ret(0); 1946 1947 return start; 1948 } 1949 1950 // Arguments: 1951 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1952 // ignored 1953 // name - stub name string 1954 // 1955 // Inputs: 1956 // c_rarg0 - source array address 1957 // c_rarg1 - destination array address 1958 // c_rarg2 - element count, treated as ssize_t, can be zero 1959 // 1960 // If 'from' and/or 'to' are aligned on 
4- or 2-byte boundaries, we 1961 // let the hardware handle it. The two or four words within dwords 1962 // or qwords that span cache line boundaries will still be loaded 1963 // and stored atomically. 1964 // 1965 // Side Effects: 1966 // disjoint_short_copy_entry is set to the no-overlap entry point 1967 // used by generate_conjoint_short_copy(). 1968 // 1969 address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) { 1970 __ align(CodeEntryAlignment); 1971 StubCodeMark mark(this, "StubRoutines", name); 1972 address start = __ pc(); 1973 1974 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit; 1975 const Register from = rdi; // source array address 1976 const Register to = rsi; // destination array address 1977 const Register count = rdx; // elements count 1978 const Register word_count = rcx; 1979 const Register qword_count = count; 1980 const Register end_from = from; // source array end address 1981 const Register end_to = to; // destination array end address 1982 // End pointers are inclusive, and if count is not zero they point 1983 // to the last unit copied: end_to[0] := end_from[0] 1984 1985 __ enter(); // required for proper stackwalking of RuntimeStub frame 1986 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 1987 1988 if (entry != NULL) { 1989 *entry = __ pc(); 1990 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1991 BLOCK_COMMENT("Entry:"); 1992 } 1993 1994 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 1995 // r9 and r10 may be used to save non-volatile registers 1996 1997 // 'from', 'to' and 'count' are now valid 1998 __ movptr(word_count, count); 1999 __ shrptr(count, 2); // count => qword_count 2000 2001 // Copy from low to high addresses. Use 'to' as scratch. 
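    // Tail handling (illustrative sketch): after the qword loop the low bits
    // of the original element count select the leftovers, roughly
    //   if (word_count & 2) copy one dword;   // L_copy_4_bytes
    //   if (word_count & 1) copy one word;    // L_copy_2_bytes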
2002 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 2003 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 2004 __ negptr(qword_count); 2005 __ jmp(L_copy_bytes); 2006 2007 // Copy trailing qwords 2008 __ BIND(L_copy_8_bytes); 2009 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 2010 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 2011 __ increment(qword_count); 2012 __ jcc(Assembler::notZero, L_copy_8_bytes); 2013 2014 // Original 'dest' is trashed, so we can't use it as a 2015 // base register for a possible trailing word copy 2016 2017 // Check for and copy trailing dword 2018 __ BIND(L_copy_4_bytes); 2019 __ testl(word_count, 2); 2020 __ jccb(Assembler::zero, L_copy_2_bytes); 2021 __ movl(rax, Address(end_from, 8)); 2022 __ movl(Address(end_to, 8), rax); 2023 2024 __ addptr(end_from, 4); 2025 __ addptr(end_to, 4); 2026 2027 // Check for and copy trailing word 2028 __ BIND(L_copy_2_bytes); 2029 __ testl(word_count, 1); 2030 __ jccb(Assembler::zero, L_exit); 2031 __ movw(rax, Address(end_from, 8)); 2032 __ movw(Address(end_to, 8), rax); 2033 2034 __ BIND(L_exit); 2035 restore_arg_regs(); 2036 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free 2037 __ xorptr(rax, rax); // return 0 2038 __ vzeroupper(); 2039 __ leave(); // required for proper stackwalking of RuntimeStub frame 2040 __ ret(0); 2041 2042 // Copy in multi-bytes chunks 2043 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2044 __ jmp(L_copy_4_bytes); 2045 2046 return start; 2047 } 2048 2049 address generate_fill(BasicType t, bool aligned, const char *name) { 2050 __ align(CodeEntryAlignment); 2051 StubCodeMark mark(this, "StubRoutines", name); 2052 address start = __ pc(); 2053 2054 BLOCK_COMMENT("Entry:"); 2055 2056 const Register to = c_rarg0; // source array address 2057 const Register value = c_rarg1; // value 2058 const Register count = c_rarg2; // elements count 2059 2060 __ enter(); // required for proper stackwalking of RuntimeStub frame 2061 2062 __ generate_fill(t, aligned, to, value, count, rax, xmm0); 2063 2064 __ vzeroupper(); 2065 __ leave(); // required for proper stackwalking of RuntimeStub frame 2066 __ ret(0); 2067 return start; 2068 } 2069 2070 // Arguments: 2071 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 2072 // ignored 2073 // name - stub name string 2074 // 2075 // Inputs: 2076 // c_rarg0 - source array address 2077 // c_rarg1 - destination array address 2078 // c_rarg2 - element count, treated as ssize_t, can be zero 2079 // 2080 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 2081 // let the hardware handle it. The two or four words within dwords 2082 // or qwords that span cache line boundaries will still be loaded 2083 // and stored atomically. 
2084 // 2085 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 2086 address *entry, const char *name) { 2087 __ align(CodeEntryAlignment); 2088 StubCodeMark mark(this, "StubRoutines", name); 2089 address start = __ pc(); 2090 2091 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes; 2092 const Register from = rdi; // source array address 2093 const Register to = rsi; // destination array address 2094 const Register count = rdx; // elements count 2095 const Register word_count = rcx; 2096 const Register qword_count = count; 2097 2098 __ enter(); // required for proper stackwalking of RuntimeStub frame 2099 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2100 2101 if (entry != NULL) { 2102 *entry = __ pc(); 2103 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2104 BLOCK_COMMENT("Entry:"); 2105 } 2106 2107 array_overlap_test(nooverlap_target, Address::times_2); 2108 setup_arg_regs(); // from => rdi, to => rsi, count => rdx 2109 // r9 and r10 may be used to save non-volatile registers 2110 2111 // 'from', 'to' and 'count' are now valid 2112 __ movptr(word_count, count); 2113 __ shrptr(count, 2); // count => qword_count 2114 2115 // Copy from high to low addresses. Use 'to' as scratch. 2116 2117 // Check for and copy trailing word 2118 __ testl(word_count, 1); 2119 __ jccb(Assembler::zero, L_copy_4_bytes); 2120 __ movw(rax, Address(from, word_count, Address::times_2, -2)); 2121 __ movw(Address(to, word_count, Address::times_2, -2), rax); 2122 2123 // Check for and copy trailing dword 2124 __ BIND(L_copy_4_bytes); 2125 __ testl(word_count, 2); 2126 __ jcc(Assembler::zero, L_copy_bytes); 2127 __ movl(rax, Address(from, qword_count, Address::times_8)); 2128 __ movl(Address(to, qword_count, Address::times_8), rax); 2129 __ jmp(L_copy_bytes); 2130 2131 // Copy trailing qwords 2132 __ BIND(L_copy_8_bytes); 2133 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 2134 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 2135 __ decrement(qword_count); 2136 __ jcc(Assembler::notZero, L_copy_8_bytes); 2137 2138 restore_arg_regs(); 2139 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free 2140 __ xorptr(rax, rax); // return 0 2141 __ vzeroupper(); 2142 __ leave(); // required for proper stackwalking of RuntimeStub frame 2143 __ ret(0); 2144 2145 // Copy in multi-bytes chunks 2146 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2147 2148 restore_arg_regs(); 2149 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free 2150 __ xorptr(rax, rax); // return 0 2151 __ vzeroupper(); 2152 __ leave(); // required for proper stackwalking of RuntimeStub frame 2153 __ ret(0); 2154 2155 return start; 2156 } 2157 2158 // Arguments: 2159 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 2160 // ignored 2161 // is_oop - true => oop array, so generate store check code 2162 // name - stub name string 2163 // 2164 // Inputs: 2165 // c_rarg0 - source array address 2166 // c_rarg1 - destination array address 2167 // c_rarg2 - element count, treated as ssize_t, can be zero 2168 // 2169 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 2170 // the hardware handle it. The two dwords within qwords that span 2171 // cache line boundaries will still be loaded and stored atomicly. 
2172 // 2173 // Side Effects: 2174 // disjoint_int_copy_entry is set to the no-overlap entry point 2175 // used by generate_conjoint_int_oop_copy(). 2176 // 2177 address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry, 2178 const char *name, bool dest_uninitialized = false) { 2179 __ align(CodeEntryAlignment); 2180 StubCodeMark mark(this, "StubRoutines", name); 2181 address start = __ pc(); 2182 2183 Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit; 2184 const Register from = rdi; // source array address 2185 const Register to = rsi; // destination array address 2186 const Register count = rdx; // elements count 2187 const Register dword_count = rcx; 2188 const Register qword_count = count; 2189 const Register end_from = from; // source array end address 2190 const Register end_to = to; // destination array end address 2191 // End pointers are inclusive, and if count is not zero they point 2192 // to the last unit copied: end_to[0] := end_from[0] 2193 2194 __ enter(); // required for proper stackwalking of RuntimeStub frame 2195 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2196 2197 if (entry != NULL) { 2198 *entry = __ pc(); 2199 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2200 BLOCK_COMMENT("Entry:"); 2201 } 2202 2203 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2204 // r9 is used to save r15_thread 2205 2206 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 2207 if (dest_uninitialized) { 2208 decorators |= IS_DEST_UNINITIALIZED; 2209 } 2210 if (aligned) { 2211 decorators |= ARRAYCOPY_ALIGNED; 2212 } 2213 2214 BasicType type = is_oop ? T_OBJECT : T_INT; 2215 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2216 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 2217 2218 // 'from', 'to' and 'count' are now valid 2219 __ movptr(dword_count, count); 2220 __ shrptr(count, 1); // count => qword_count 2221 2222 // Copy from low to high addresses. Use 'to' as scratch. 
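    // Tail handling (illustrative sketch): count was halved to qwords above,
    // so at most one trailing 4-byte element remains, roughly
    //   if (dword_count & 1) copy one dword;   // L_copy_4_bytes
    // With compressed oops this stub also serves oop arrays (see
    // generate_arraycopy_stubs), each narrow oop being a single dword.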
2223 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 2224 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 2225 __ negptr(qword_count); 2226 __ jmp(L_copy_bytes); 2227 2228 // Copy trailing qwords 2229 __ BIND(L_copy_8_bytes); 2230 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 2231 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 2232 __ increment(qword_count); 2233 __ jcc(Assembler::notZero, L_copy_8_bytes); 2234 2235 // Check for and copy trailing dword 2236 __ BIND(L_copy_4_bytes); 2237 __ testl(dword_count, 1); // Only byte test since the value is 0 or 1 2238 __ jccb(Assembler::zero, L_exit); 2239 __ movl(rax, Address(end_from, 8)); 2240 __ movl(Address(end_to, 8), rax); 2241 2242 __ BIND(L_exit); 2243 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 2244 restore_arg_regs_using_thread(); 2245 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free 2246 __ vzeroupper(); 2247 __ xorptr(rax, rax); // return 0 2248 __ leave(); // required for proper stackwalking of RuntimeStub frame 2249 __ ret(0); 2250 2251 // Copy in multi-bytes chunks 2252 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2253 __ jmp(L_copy_4_bytes); 2254 2255 return start; 2256 } 2257 2258 // Arguments: 2259 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 2260 // ignored 2261 // is_oop - true => oop array, so generate store check code 2262 // name - stub name string 2263 // 2264 // Inputs: 2265 // c_rarg0 - source array address 2266 // c_rarg1 - destination array address 2267 // c_rarg2 - element count, treated as ssize_t, can be zero 2268 // 2269 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 2270 // the hardware handle it. The two dwords within qwords that span 2271 // cache line boundaries will still be loaded and stored atomicly. 2272 // 2273 address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target, 2274 address *entry, const char *name, 2275 bool dest_uninitialized = false) { 2276 __ align(CodeEntryAlignment); 2277 StubCodeMark mark(this, "StubRoutines", name); 2278 address start = __ pc(); 2279 2280 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2281 const Register from = rdi; // source array address 2282 const Register to = rsi; // destination array address 2283 const Register count = rdx; // elements count 2284 const Register dword_count = rcx; 2285 const Register qword_count = count; 2286 2287 __ enter(); // required for proper stackwalking of RuntimeStub frame 2288 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 2289 2290 if (entry != NULL) { 2291 *entry = __ pc(); 2292 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2293 BLOCK_COMMENT("Entry:"); 2294 } 2295 2296 array_overlap_test(nooverlap_target, Address::times_4); 2297 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2298 // r9 is used to save r15_thread 2299 2300 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 2301 if (dest_uninitialized) { 2302 decorators |= IS_DEST_UNINITIALIZED; 2303 } 2304 if (aligned) { 2305 decorators |= ARRAYCOPY_ALIGNED; 2306 } 2307 2308 BasicType type = is_oop ? 
T_OBJECT : T_INT; 2309 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2310 // no registers are destroyed by this call 2311 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 2312 2313 assert_clean_int(count, rax); // Make sure 'count' is clean int. 2314 // 'from', 'to' and 'count' are now valid 2315 __ movptr(dword_count, count); 2316 __ shrptr(count, 1); // count => qword_count 2317 2318 // Copy from high to low addresses. Use 'to' as scratch. 2319 2320 // Check for and copy trailing dword 2321 __ testl(dword_count, 1); 2322 __ jcc(Assembler::zero, L_copy_bytes); 2323 __ movl(rax, Address(from, dword_count, Address::times_4, -4)); 2324 __ movl(Address(to, dword_count, Address::times_4, -4), rax); 2325 __ jmp(L_copy_bytes); 2326 2327 // Copy trailing qwords 2328 __ BIND(L_copy_8_bytes); 2329 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 2330 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 2331 __ decrement(qword_count); 2332 __ jcc(Assembler::notZero, L_copy_8_bytes); 2333 2334 if (is_oop) { 2335 __ jmp(L_exit); 2336 } 2337 restore_arg_regs_using_thread(); 2338 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free 2339 __ xorptr(rax, rax); // return 0 2340 __ vzeroupper(); 2341 __ leave(); // required for proper stackwalking of RuntimeStub frame 2342 __ ret(0); 2343 2344 // Copy in multi-bytes chunks 2345 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2346 2347 __ BIND(L_exit); 2348 bs->arraycopy_epilogue(_masm, decorators, type, from, to, dword_count); 2349 restore_arg_regs_using_thread(); 2350 inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free 2351 __ xorptr(rax, rax); // return 0 2352 __ vzeroupper(); 2353 __ leave(); // required for proper stackwalking of RuntimeStub frame 2354 __ ret(0); 2355 2356 return start; 2357 } 2358 2359 // Arguments: 2360 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 2361 // ignored 2362 // is_oop - true => oop array, so generate store check code 2363 // name - stub name string 2364 // 2365 // Inputs: 2366 // c_rarg0 - source array address 2367 // c_rarg1 - destination array address 2368 // c_rarg2 - element count, treated as ssize_t, can be zero 2369 // 2370 // Side Effects: 2371 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 2372 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 2373 // 2374 address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry, 2375 const char *name, bool dest_uninitialized = false) { 2376 __ align(CodeEntryAlignment); 2377 StubCodeMark mark(this, "StubRoutines", name); 2378 address start = __ pc(); 2379 2380 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2381 const Register from = rdi; // source array address 2382 const Register to = rsi; // destination array address 2383 const Register qword_count = rdx; // elements count 2384 const Register end_from = from; // source array end address 2385 const Register end_to = rcx; // destination array end address 2386 const Register saved_count = r11; 2387 // End pointers are inclusive, and if count is not zero they point 2388 // to the last unit copied: end_to[0] := end_from[0] 2389 2390 __ enter(); // required for proper stackwalking of RuntimeStub frame 2391 // Save no-overlap entry point for generate_conjoint_long_oop_copy() 2392 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
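    // Barrier bracketing (illustrative sketch): for oop arrays the copy is
    // wrapped by the GC's arraycopy hooks, roughly
    //   bs->arraycopy_prologue(decorators, T_OBJECT, from, to, count);   // e.g. pre-barrier / SATB
    //   <copy qwords from low to high addresses>
    //   bs->arraycopy_epilogue(decorators, T_OBJECT, from, to, count);   // e.g. card marks / post-barrier
    // For the plain T_LONG case the barrier set typically emits nothing.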
2393 2394 if (entry != NULL) { 2395 *entry = __ pc(); 2396 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2397 BLOCK_COMMENT("Entry:"); 2398 } 2399 2400 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2401 // r9 is used to save r15_thread 2402 // 'from', 'to' and 'qword_count' are now valid 2403 2404 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 2405 if (dest_uninitialized) { 2406 decorators |= IS_DEST_UNINITIALIZED; 2407 } 2408 if (aligned) { 2409 decorators |= ARRAYCOPY_ALIGNED; 2410 } 2411 2412 BasicType type = is_oop ? T_OBJECT : T_LONG; 2413 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2414 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 2415 2416 // Copy from low to high addresses. Use 'to' as scratch. 2417 __ lea(end_from, Address(from, qword_count, Address::times_8, -8)); 2418 __ lea(end_to, Address(to, qword_count, Address::times_8, -8)); 2419 __ negptr(qword_count); 2420 __ jmp(L_copy_bytes); 2421 2422 // Copy trailing qwords 2423 __ BIND(L_copy_8_bytes); 2424 __ movq(rax, Address(end_from, qword_count, Address::times_8, 8)); 2425 __ movq(Address(end_to, qword_count, Address::times_8, 8), rax); 2426 __ increment(qword_count); 2427 __ jcc(Assembler::notZero, L_copy_8_bytes); 2428 2429 if (is_oop) { 2430 __ jmp(L_exit); 2431 } else { 2432 restore_arg_regs_using_thread(); 2433 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free 2434 __ xorptr(rax, rax); // return 0 2435 __ vzeroupper(); 2436 __ leave(); // required for proper stackwalking of RuntimeStub frame 2437 __ ret(0); 2438 } 2439 2440 // Copy in multi-bytes chunks 2441 copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2442 2443 __ BIND(L_exit); 2444 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 2445 restore_arg_regs_using_thread(); 2446 if (is_oop) { 2447 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free 2448 } else { 2449 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free 2450 } 2451 __ vzeroupper(); 2452 __ xorptr(rax, rax); // return 0 2453 __ leave(); // required for proper stackwalking of RuntimeStub frame 2454 __ ret(0); 2455 2456 return start; 2457 } 2458 2459 // Arguments: 2460 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 2461 // ignored 2462 // is_oop - true => oop array, so generate store check code 2463 // name - stub name string 2464 // 2465 // Inputs: 2466 // c_rarg0 - source array address 2467 // c_rarg1 - destination array address 2468 // c_rarg2 - element count, treated as ssize_t, can be zero 2469 // 2470 address generate_conjoint_long_oop_copy(bool aligned, bool is_oop, 2471 address nooverlap_target, address *entry, 2472 const char *name, bool dest_uninitialized = false) { 2473 __ align(CodeEntryAlignment); 2474 StubCodeMark mark(this, "StubRoutines", name); 2475 address start = __ pc(); 2476 2477 Label L_copy_bytes, L_copy_8_bytes, L_exit; 2478 const Register from = rdi; // source array address 2479 const Register to = rsi; // destination array address 2480 const Register qword_count = rdx; // elements count 2481 const Register saved_count = rcx; 2482 2483 __ enter(); // required for proper stackwalking of RuntimeStub frame 2484 assert_clean_int(c_rarg2, rax); // Make sure 'count' is clean int. 
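    // Overlap handling (illustrative sketch): array_overlap_test() below
    // dispatches to the disjoint stub whenever a forward copy is safe, roughly
    //   if (to <= from || to >= from + count * 8)
    //     goto nooverlap_target;          // forward (disjoint) copy
    //   // otherwise fall through and copy from high addresses to low,
    //   // so no source qword is overwritten before it has been read.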
2485 2486 if (entry != NULL) { 2487 *entry = __ pc(); 2488 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2489 BLOCK_COMMENT("Entry:"); 2490 } 2491 2492 array_overlap_test(nooverlap_target, Address::times_8); 2493 setup_arg_regs_using_thread(); // from => rdi, to => rsi, count => rdx 2494 // r9 is used to save r15_thread 2495 // 'from', 'to' and 'qword_count' are now valid 2496 2497 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 2498 if (dest_uninitialized) { 2499 decorators |= IS_DEST_UNINITIALIZED; 2500 } 2501 if (aligned) { 2502 decorators |= ARRAYCOPY_ALIGNED; 2503 } 2504 2505 BasicType type = is_oop ? T_OBJECT : T_LONG; 2506 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2507 bs->arraycopy_prologue(_masm, decorators, type, from, to, qword_count); 2508 2509 __ jmp(L_copy_bytes); 2510 2511 // Copy trailing qwords 2512 __ BIND(L_copy_8_bytes); 2513 __ movq(rax, Address(from, qword_count, Address::times_8, -8)); 2514 __ movq(Address(to, qword_count, Address::times_8, -8), rax); 2515 __ decrement(qword_count); 2516 __ jcc(Assembler::notZero, L_copy_8_bytes); 2517 2518 if (is_oop) { 2519 __ jmp(L_exit); 2520 } else { 2521 restore_arg_regs_using_thread(); 2522 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free 2523 __ xorptr(rax, rax); // return 0 2524 __ vzeroupper(); 2525 __ leave(); // required for proper stackwalking of RuntimeStub frame 2526 __ ret(0); 2527 } 2528 2529 // Copy in multi-bytes chunks 2530 copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes); 2531 2532 __ BIND(L_exit); 2533 bs->arraycopy_epilogue(_masm, decorators, type, from, to, qword_count); 2534 restore_arg_regs_using_thread(); 2535 if (is_oop) { 2536 inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free 2537 } else { 2538 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free 2539 } 2540 __ vzeroupper(); 2541 __ xorptr(rax, rax); // return 0 2542 __ leave(); // required for proper stackwalking of RuntimeStub frame 2543 __ ret(0); 2544 2545 return start; 2546 } 2547 2548 2549 // Helper for generating a dynamic type check. 2550 // Smashes no registers. 2551 void generate_type_check(Register sub_klass, 2552 Register super_check_offset, 2553 Register super_klass, 2554 Label& L_success) { 2555 assert_different_registers(sub_klass, super_check_offset, super_klass); 2556 2557 BLOCK_COMMENT("type_check:"); 2558 2559 Label L_miss; 2560 2561 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 2562 super_check_offset); 2563 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 2564 2565 // Fall through on failure! 
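  // (Illustrative) the fast/slow path pair above amounts to, roughly:
  //   if (*(sub_klass + super_check_offset) == super_klass)   goto L_success;  // cache/primary hit
  //   if (secondary supers of sub_klass contain super_klass)  goto L_success;  // linear scan
  //   // otherwise fall through to L_miss below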
2566 __ BIND(L_miss); 2567 } 2568 2569 // 2570 // Generate checkcasting array copy stub 2571 // 2572 // Input: 2573 // c_rarg0 - source array address 2574 // c_rarg1 - destination array address 2575 // c_rarg2 - element count, treated as ssize_t, can be zero 2576 // c_rarg3 - size_t ckoff (super_check_offset) 2577 // not Win64 2578 // c_rarg4 - oop ckval (super_klass) 2579 // Win64 2580 // rsp+40 - oop ckval (super_klass) 2581 // 2582 // Output: 2583 // rax == 0 - success 2584 // rax == -1^K - failure, where K is partial transfer count 2585 // 2586 address generate_checkcast_copy(const char *name, address *entry, 2587 bool dest_uninitialized = false) { 2588 2589 Label L_load_element, L_store_element, L_do_card_marks, L_done; 2590 2591 // Input registers (after setup_arg_regs) 2592 const Register from = rdi; // source array address 2593 const Register to = rsi; // destination array address 2594 const Register length = rdx; // elements count 2595 const Register ckoff = rcx; // super_check_offset 2596 const Register ckval = r8; // super_klass 2597 2598 // Registers used as temps (r13, r14 are save-on-entry) 2599 const Register end_from = from; // source array end address 2600 const Register end_to = r13; // destination array end address 2601 const Register count = rdx; // -(count_remaining) 2602 const Register r14_length = r14; // saved copy of length 2603 // End pointers are inclusive, and if length is not zero they point 2604 // to the last unit copied: end_to[0] := end_from[0] 2605 2606 const Register rax_oop = rax; // actual oop copied 2607 const Register r11_klass = r11; // oop._klass 2608 2609 //--------------------------------------------------------------- 2610 // Assembler stub will be used for this call to arraycopy 2611 // if the two arrays are subtypes of Object[] but the 2612 // destination array type is not equal to or a supertype 2613 // of the source type. Each element must be separately 2614 // checked. 2615 2616 __ align(CodeEntryAlignment); 2617 StubCodeMark mark(this, "StubRoutines", name); 2618 address start = __ pc(); 2619 2620 __ enter(); // required for proper stackwalking of RuntimeStub frame 2621 2622 #ifdef ASSERT 2623 // caller guarantees that the arrays really are different 2624 // otherwise, we would have to make conjoint checks 2625 { Label L; 2626 array_overlap_test(L, TIMES_OOP); 2627 __ stop("checkcast_copy within a single array"); 2628 __ bind(L); 2629 } 2630 #endif //ASSERT 2631 2632 setup_arg_regs(4); // from => rdi, to => rsi, length => rdx 2633 // ckoff => rcx, ckval => r8 2634 // r9 and r10 may be used to save non-volatile registers 2635 #ifdef _WIN64 2636 // last argument (#4) is on stack on Win64 2637 __ movptr(ckval, Address(rsp, 6 * wordSize)); 2638 #endif 2639 2640 // Caller of this entry point must set up the argument registers. 
2641 if (entry != NULL) { 2642 *entry = __ pc(); 2643 BLOCK_COMMENT("Entry:"); 2644 } 2645 2646 // allocate spill slots for r13, r14 2647 enum { 2648 saved_r13_offset, 2649 saved_r14_offset, 2650 saved_r10_offset, 2651 saved_rbp_offset 2652 }; 2653 __ subptr(rsp, saved_rbp_offset * wordSize); 2654 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); 2655 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); 2656 __ movptr(Address(rsp, saved_r10_offset * wordSize), r10); 2657 2658 #ifdef ASSERT 2659 Label L2; 2660 __ get_thread(r14); 2661 __ cmpptr(r15_thread, r14); 2662 __ jcc(Assembler::equal, L2); 2663 __ stop("StubRoutines::call_stub: r15_thread is modified by call"); 2664 __ bind(L2); 2665 #endif // ASSERT 2666 2667 // check that int operands are properly extended to size_t 2668 assert_clean_int(length, rax); 2669 assert_clean_int(ckoff, rax); 2670 2671 #ifdef ASSERT 2672 BLOCK_COMMENT("assert consistent ckoff/ckval"); 2673 // The ckoff and ckval must be mutually consistent, 2674 // even though caller generates both. 2675 { Label L; 2676 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2677 __ cmpl(ckoff, Address(ckval, sco_offset)); 2678 __ jcc(Assembler::equal, L); 2679 __ stop("super_check_offset inconsistent"); 2680 __ bind(L); 2681 } 2682 #endif //ASSERT 2683 2684 // Loop-invariant addresses. They are exclusive end pointers. 2685 Address end_from_addr(from, length, TIMES_OOP, 0); 2686 Address end_to_addr(to, length, TIMES_OOP, 0); 2687 // Loop-variant addresses. They assume post-incremented count < 0. 2688 Address from_element_addr(end_from, count, TIMES_OOP, 0); 2689 Address to_element_addr(end_to, count, TIMES_OOP, 0); 2690 2691 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 2692 if (dest_uninitialized) { 2693 decorators |= IS_DEST_UNINITIALIZED; 2694 } 2695 2696 BasicType type = T_OBJECT; 2697 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2698 bs->arraycopy_prologue(_masm, decorators, type, from, to, count); 2699 2700 // Copy from low to high addresses, indexed from the end of each array. 2701 __ lea(end_from, end_from_addr); 2702 __ lea(end_to, end_to_addr); 2703 __ movptr(r14_length, length); // save a copy of the length 2704 assert(length == count, ""); // else fix next line: 2705 __ negptr(count); // negate and test the length 2706 __ jcc(Assembler::notZero, L_load_element); 2707 2708 // Empty array: Nothing to do. 2709 __ xorptr(rax, rax); // return 0 on (trivial) success 2710 __ jmp(L_done); 2711 2712 // ======== begin loop ======== 2713 // (Loop is rotated; its entry is L_load_element.) 2714 // Loop control: 2715 // for (count = -count; count != 0; count++) 2716 // Base pointers src, dst are biased by 8*(count-1),to last element. 2717 __ align(OptoLoopAlignment); 2718 2719 __ BIND(L_store_element); 2720 __ store_heap_oop(to_element_addr, rax_oop, noreg, noreg, AS_RAW); // store the oop 2721 __ increment(count); // increment the count toward zero 2722 __ jcc(Assembler::zero, L_do_card_marks); 2723 2724 // ======== loop entry is here ======== 2725 __ BIND(L_load_element); 2726 __ load_heap_oop(rax_oop, from_element_addr, noreg, noreg, AS_RAW); // load the oop 2727 __ testptr(rax_oop, rax_oop); 2728 __ jcc(Assembler::zero, L_store_element); 2729 2730 __ load_klass(r11_klass, rax_oop);// query the object klass 2731 generate_type_check(r11_klass, ckoff, ckval, L_store_element); 2732 // ======== end loop ======== 2733 2734 // It was a real error; we must depend on the caller to finish the job. 
2735 // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops. 2736 // Emit GC store barriers for the oops we have copied (r14 + rdx), 2737 // and report their number to the caller. 2738 assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1); 2739 Label L_post_barrier; 2740 __ addptr(r14_length, count); // K = (original - remaining) oops 2741 __ movptr(rax, r14_length); // save the value 2742 __ notptr(rax); // report (-1^K) to caller (does not affect flags) 2743 __ jccb(Assembler::notZero, L_post_barrier); 2744 __ jmp(L_done); // K == 0, nothing was copied, skip post barrier 2745 2746 // Come here on success only. 2747 __ BIND(L_do_card_marks); 2748 __ xorptr(rax, rax); // return 0 on success 2749 2750 __ BIND(L_post_barrier); 2751 bs->arraycopy_epilogue(_masm, decorators, type, from, to, r14_length); 2752 2753 // Common exit point (success or failure). 2754 __ BIND(L_done); 2755 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); 2756 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); 2757 __ movptr(r10, Address(rsp, saved_r10_offset * wordSize)); 2758 restore_arg_regs(); 2759 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free 2760 __ leave(); // required for proper stackwalking of RuntimeStub frame 2761 __ ret(0); 2762 2763 return start; 2764 } 2765 2766 // 2767 // Generate 'unsafe' array copy stub 2768 // Though just as safe as the other stubs, it takes an unscaled 2769 // size_t argument instead of an element count. 2770 // 2771 // Input: 2772 // c_rarg0 - source array address 2773 // c_rarg1 - destination array address 2774 // c_rarg2 - byte count, treated as ssize_t, can be zero 2775 // 2776 // Examines the alignment of the operands and dispatches 2777 // to a long, int, short, or byte copy loop. 
2778 // 2779 address generate_unsafe_copy(const char *name, 2780 address byte_copy_entry, address short_copy_entry, 2781 address int_copy_entry, address long_copy_entry) { 2782 2783 Label L_long_aligned, L_int_aligned, L_short_aligned; 2784 2785 // Input registers (before setup_arg_regs) 2786 const Register from = c_rarg0; // source array address 2787 const Register to = c_rarg1; // destination array address 2788 const Register size = c_rarg2; // byte count (size_t) 2789 2790 // Register used as a temp 2791 const Register bits = rax; // test copy of low bits 2792 2793 __ align(CodeEntryAlignment); 2794 StubCodeMark mark(this, "StubRoutines", name); 2795 address start = __ pc(); 2796 2797 __ enter(); // required for proper stackwalking of RuntimeStub frame 2798 2799 // bump this on entry, not on exit: 2800 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2801 2802 __ mov(bits, from); 2803 __ orptr(bits, to); 2804 __ orptr(bits, size); 2805 2806 __ testb(bits, BytesPerLong-1); 2807 __ jccb(Assembler::zero, L_long_aligned); 2808 2809 __ testb(bits, BytesPerInt-1); 2810 __ jccb(Assembler::zero, L_int_aligned); 2811 2812 __ testb(bits, BytesPerShort-1); 2813 __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry)); 2814 2815 __ BIND(L_short_aligned); 2816 __ shrptr(size, LogBytesPerShort); // size => short_count 2817 __ jump(RuntimeAddress(short_copy_entry)); 2818 2819 __ BIND(L_int_aligned); 2820 __ shrptr(size, LogBytesPerInt); // size => int_count 2821 __ jump(RuntimeAddress(int_copy_entry)); 2822 2823 __ BIND(L_long_aligned); 2824 __ shrptr(size, LogBytesPerLong); // size => qword_count 2825 __ jump(RuntimeAddress(long_copy_entry)); 2826 2827 return start; 2828 } 2829 2830 // Perform range checks on the proposed arraycopy. 2831 // Kills temp, but nothing else. 2832 // Also, clean the sign bits of src_pos and dst_pos. 2833 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 2834 Register src_pos, // source position (c_rarg1) 2835 Register dst, // destination array oo (c_rarg2) 2836 Register dst_pos, // destination position (c_rarg3) 2837 Register length, 2838 Register temp, 2839 Label& L_failed) { 2840 BLOCK_COMMENT("arraycopy_range_checks:"); 2841 2842 // if (src_pos + length > arrayOop(src)->length()) FAIL; 2843 __ movl(temp, length); 2844 __ addl(temp, src_pos); // src_pos + length 2845 __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes())); 2846 __ jcc(Assembler::above, L_failed); 2847 2848 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 2849 __ movl(temp, length); 2850 __ addl(temp, dst_pos); // dst_pos + length 2851 __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2852 __ jcc(Assembler::above, L_failed); 2853 2854 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2855 // Move with sign extension can be used since they are positive. 
2856 __ movslq(src_pos, src_pos); 2857 __ movslq(dst_pos, dst_pos); 2858 2859 BLOCK_COMMENT("arraycopy_range_checks done"); 2860 } 2861 2862 // 2863 // Generate generic array copy stubs 2864 // 2865 // Input: 2866 // c_rarg0 - src oop 2867 // c_rarg1 - src_pos (32-bits) 2868 // c_rarg2 - dst oop 2869 // c_rarg3 - dst_pos (32-bits) 2870 // not Win64 2871 // c_rarg4 - element count (32-bits) 2872 // Win64 2873 // rsp+40 - element count (32-bits) 2874 // 2875 // Output: 2876 // rax == 0 - success 2877 // rax == -1^K - failure, where K is partial transfer count 2878 // 2879 address generate_generic_copy(const char *name, 2880 address byte_copy_entry, address short_copy_entry, 2881 address int_copy_entry, address oop_copy_entry, 2882 address long_copy_entry, address checkcast_copy_entry) { 2883 2884 Label L_failed, L_failed_0, L_objArray; 2885 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2886 2887 // Input registers 2888 const Register src = c_rarg0; // source array oop 2889 const Register src_pos = c_rarg1; // source position 2890 const Register dst = c_rarg2; // destination array oop 2891 const Register dst_pos = c_rarg3; // destination position 2892 #ifndef _WIN64 2893 const Register length = c_rarg4; 2894 #else 2895 const Address length(rsp, 6 * wordSize); // elements count is on stack on Win64 2896 #endif 2897 2898 { int modulus = CodeEntryAlignment; 2899 int target = modulus - 5; // 5 = sizeof jmp(L_failed) 2900 int advance = target - (__ offset() % modulus); 2901 if (advance < 0) advance += modulus; 2902 if (advance > 0) __ nop(advance); 2903 } 2904 StubCodeMark mark(this, "StubRoutines", name); 2905 2906 // Short-hop target to L_failed. Makes for denser prologue code. 2907 __ BIND(L_failed_0); 2908 __ jmp(L_failed); 2909 assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed"); 2910 2911 __ align(CodeEntryAlignment); 2912 address start = __ pc(); 2913 2914 __ enter(); // required for proper stackwalking of RuntimeStub frame 2915 2916 // bump this on entry, not on exit: 2917 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2918 2919 //----------------------------------------------------------------------- 2920 // Assembler stub will be used for this call to arraycopy 2921 // if the following conditions are met: 2922 // 2923 // (1) src and dst must not be null. 2924 // (2) src_pos must not be negative. 2925 // (3) dst_pos must not be negative. 2926 // (4) length must not be negative. 2927 // (5) src klass and dst klass should be the same and not NULL. 2928 // (6) src and dst should be arrays. 2929 // (7) src_pos + length must not exceed length of src. 2930 // (8) dst_pos + length must not exceed length of dst. 2931 // 2932 2933 // if (src == NULL) return -1; 2934 __ testptr(src, src); // src oop 2935 size_t j1off = __ offset(); 2936 __ jccb(Assembler::zero, L_failed_0); 2937 2938 // if (src_pos < 0) return -1; 2939 __ testl(src_pos, src_pos); // src_pos (32-bits) 2940 __ jccb(Assembler::negative, L_failed_0); 2941 2942 // if (dst == NULL) return -1; 2943 __ testptr(dst, dst); // dst oop 2944 __ jccb(Assembler::zero, L_failed_0); 2945 2946 // if (dst_pos < 0) return -1; 2947 __ testl(dst_pos, dst_pos); // dst_pos (32-bits) 2948 size_t j4off = __ offset(); 2949 __ jccb(Assembler::negative, L_failed_0); 2950 2951 // The first four tests are very dense code, 2952 // but not quite dense enough to put four 2953 // jumps in a 16-byte instruction fetch buffer. 
2954 // That's good, because some branch predicters 2955 // do not like jumps so close together. 2956 // Make sure of this. 2957 guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps"); 2958 2959 // registers used as temp 2960 const Register r11_length = r11; // elements count to copy 2961 const Register r10_src_klass = r10; // array klass 2962 2963 // if (length < 0) return -1; 2964 __ movl(r11_length, length); // length (elements count, 32-bits value) 2965 __ testl(r11_length, r11_length); 2966 __ jccb(Assembler::negative, L_failed_0); 2967 2968 __ load_klass(r10_src_klass, src); 2969 #ifdef ASSERT 2970 // assert(src->klass() != NULL); 2971 { 2972 BLOCK_COMMENT("assert klasses not null {"); 2973 Label L1, L2; 2974 __ testptr(r10_src_klass, r10_src_klass); 2975 __ jcc(Assembler::notZero, L2); // it is broken if klass is NULL 2976 __ bind(L1); 2977 __ stop("broken null klass"); 2978 __ bind(L2); 2979 __ load_klass(rax, dst); 2980 __ cmpq(rax, 0); 2981 __ jcc(Assembler::equal, L1); // this would be broken also 2982 BLOCK_COMMENT("} assert klasses not null done"); 2983 } 2984 #endif 2985 2986 // Load layout helper (32-bits) 2987 // 2988 // |array_tag| | header_size | element_type | |log2_element_size| 2989 // 32 30 24 16 8 2 0 2990 // 2991 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2992 // 2993 2994 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2995 2996 // Handle objArrays completely differently... 2997 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2998 __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh); 2999 __ jcc(Assembler::equal, L_objArray); 3000 3001 // if (src->klass() != dst->klass()) return -1; 3002 __ load_klass(rax, dst); 3003 __ cmpq(r10_src_klass, rax); 3004 __ jcc(Assembler::notEqual, L_failed); 3005 3006 const Register rax_lh = rax; // layout helper 3007 __ movl(rax_lh, Address(r10_src_klass, lh_offset)); 3008 3009 // if (!src->is_Array()) return -1; 3010 __ cmpl(rax_lh, Klass::_lh_neutral_value); 3011 __ jcc(Assembler::greaterEqual, L_failed); 3012 3013 // At this point, it is known to be a typeArray (array_tag 0x3). 
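  // Layout helper decoding used below (illustrative): for a typeArray the
  // header size and element size are unpacked from the layout helper as roughly
  //   header   = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
  //   elsize   = lh & _lh_log2_element_size_mask;     // log2 of element size
  //   src_addr = src + header + (src_pos << elsize);
  //   dst_addr = dst + header + (dst_pos << elsize);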
3014 #ifdef ASSERT 3015 { 3016 BLOCK_COMMENT("assert primitive array {"); 3017 Label L; 3018 __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift)); 3019 __ jcc(Assembler::greaterEqual, L); 3020 __ stop("must be a primitive array"); 3021 __ bind(L); 3022 BLOCK_COMMENT("} assert primitive array done"); 3023 } 3024 #endif 3025 3026 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 3027 r10, L_failed); 3028 3029 // TypeArrayKlass 3030 // 3031 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 3032 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 3033 // 3034 3035 const Register r10_offset = r10; // array offset 3036 const Register rax_elsize = rax_lh; // element size 3037 3038 __ movl(r10_offset, rax_lh); 3039 __ shrl(r10_offset, Klass::_lh_header_size_shift); 3040 __ andptr(r10_offset, Klass::_lh_header_size_mask); // array_offset 3041 __ addptr(src, r10_offset); // src array offset 3042 __ addptr(dst, r10_offset); // dst array offset 3043 BLOCK_COMMENT("choose copy loop based on element size"); 3044 __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize 3045 3046 // next registers should be set before the jump to corresponding stub 3047 const Register from = c_rarg0; // source array address 3048 const Register to = c_rarg1; // destination array address 3049 const Register count = c_rarg2; // elements count 3050 3051 // 'from', 'to', 'count' registers should be set in such order 3052 // since they are the same as 'src', 'src_pos', 'dst'. 3053 3054 __ BIND(L_copy_bytes); 3055 __ cmpl(rax_elsize, 0); 3056 __ jccb(Assembler::notEqual, L_copy_shorts); 3057 __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr 3058 __ lea(to, Address(dst, dst_pos, Address::times_1, 0));// dst_addr 3059 __ movl2ptr(count, r11_length); // length 3060 __ jump(RuntimeAddress(byte_copy_entry)); 3061 3062 __ BIND(L_copy_shorts); 3063 __ cmpl(rax_elsize, LogBytesPerShort); 3064 __ jccb(Assembler::notEqual, L_copy_ints); 3065 __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr 3066 __ lea(to, Address(dst, dst_pos, Address::times_2, 0));// dst_addr 3067 __ movl2ptr(count, r11_length); // length 3068 __ jump(RuntimeAddress(short_copy_entry)); 3069 3070 __ BIND(L_copy_ints); 3071 __ cmpl(rax_elsize, LogBytesPerInt); 3072 __ jccb(Assembler::notEqual, L_copy_longs); 3073 __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr 3074 __ lea(to, Address(dst, dst_pos, Address::times_4, 0));// dst_addr 3075 __ movl2ptr(count, r11_length); // length 3076 __ jump(RuntimeAddress(int_copy_entry)); 3077 3078 __ BIND(L_copy_longs); 3079 #ifdef ASSERT 3080 { 3081 BLOCK_COMMENT("assert long copy {"); 3082 Label L; 3083 __ cmpl(rax_elsize, LogBytesPerLong); 3084 __ jcc(Assembler::equal, L); 3085 __ stop("must be long copy, but elsize is wrong"); 3086 __ bind(L); 3087 BLOCK_COMMENT("} assert long copy done"); 3088 } 3089 #endif 3090 __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr 3091 __ lea(to, Address(dst, dst_pos, Address::times_8, 0));// dst_addr 3092 __ movl2ptr(count, r11_length); // length 3093 __ jump(RuntimeAddress(long_copy_entry)); 3094 3095 // ObjArrayKlass 3096 __ BIND(L_objArray); 3097 // live at this point: r10_src_klass, r11_length, src[_pos], dst[_pos] 3098 3099 Label L_plain_copy, L_checkcast_copy; 3100 // test array classes for subtyping 3101 __ load_klass(rax, dst); 3102 __ cmpq(r10_src_klass, rax); // usual case is exact equality 3103 __ 
jcc(Assembler::notEqual, L_checkcast_copy); 3104 3105 // Identically typed arrays can be copied without element-wise checks. 3106 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 3107 r10, L_failed); 3108 3109 __ lea(from, Address(src, src_pos, TIMES_OOP, 3110 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr 3111 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 3112 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr 3113 __ movl2ptr(count, r11_length); // length 3114 __ BIND(L_plain_copy); 3115 __ jump(RuntimeAddress(oop_copy_entry)); 3116 3117 __ BIND(L_checkcast_copy); 3118 // live at this point: r10_src_klass, r11_length, rax (dst_klass) 3119 { 3120 // Before looking at dst.length, make sure dst is also an objArray. 3121 __ cmpl(Address(rax, lh_offset), objArray_lh); 3122 __ jcc(Assembler::notEqual, L_failed); 3123 3124 // It is safe to examine both src.length and dst.length. 3125 arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length, 3126 rax, L_failed); 3127 3128 const Register r11_dst_klass = r11; 3129 __ load_klass(r11_dst_klass, dst); // reload 3130 3131 // Marshal the base address arguments now, freeing registers. 3132 __ lea(from, Address(src, src_pos, TIMES_OOP, 3133 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 3134 __ lea(to, Address(dst, dst_pos, TIMES_OOP, 3135 arrayOopDesc::base_offset_in_bytes(T_OBJECT))); 3136 __ movl(count, length); // length (reloaded) 3137 Register sco_temp = c_rarg3; // this register is free now 3138 assert_different_registers(from, to, count, sco_temp, 3139 r11_dst_klass, r10_src_klass); 3140 assert_clean_int(count, sco_temp); 3141 3142 // Generate the type check. 3143 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 3144 __ movl(sco_temp, Address(r11_dst_klass, sco_offset)); 3145 assert_clean_int(sco_temp, rax); 3146 generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy); 3147 3148 // Fetch destination element klass from the ObjArrayKlass header. 3149 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 3150 __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset)); 3151 __ movl( sco_temp, Address(r11_dst_klass, sco_offset)); 3152 assert_clean_int(sco_temp, rax); 3153 3154 // the checkcast_copy loop needs two extra arguments: 3155 assert(c_rarg3 == sco_temp, "#3 already in place"); 3156 // Set up arguments for checkcast_copy_entry. 
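    // (Illustrative) the checkcast stub's registered entry expects
    //   c_rarg0 = from, c_rarg1 = to, c_rarg2 = count,
    //   c_rarg3 = super_check_offset, and the destination element klass in r8,
    // which is what setup_arg_regs(4) plus the movptr below establish.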
3157 setup_arg_regs(4); 3158 __ movptr(r8, r11_dst_klass); // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris 3159 __ jump(RuntimeAddress(checkcast_copy_entry)); 3160 } 3161 3162 __ BIND(L_failed); 3163 __ xorptr(rax, rax); 3164 __ notptr(rax); // return -1 3165 __ leave(); // required for proper stackwalking of RuntimeStub frame 3166 __ ret(0); 3167 3168 return start; 3169 } 3170 3171 void generate_arraycopy_stubs() { 3172 address entry; 3173 address entry_jbyte_arraycopy; 3174 address entry_jshort_arraycopy; 3175 address entry_jint_arraycopy; 3176 address entry_oop_arraycopy; 3177 address entry_jlong_arraycopy; 3178 address entry_checkcast_arraycopy; 3179 3180 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 3181 "jbyte_disjoint_arraycopy"); 3182 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy, 3183 "jbyte_arraycopy"); 3184 3185 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 3186 "jshort_disjoint_arraycopy"); 3187 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy, 3188 "jshort_arraycopy"); 3189 3190 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, false, &entry, 3191 "jint_disjoint_arraycopy"); 3192 StubRoutines::_jint_arraycopy = generate_conjoint_int_oop_copy(false, false, entry, 3193 &entry_jint_arraycopy, "jint_arraycopy"); 3194 3195 StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, false, &entry, 3196 "jlong_disjoint_arraycopy"); 3197 StubRoutines::_jlong_arraycopy = generate_conjoint_long_oop_copy(false, false, entry, 3198 &entry_jlong_arraycopy, "jlong_arraycopy"); 3199 3200 3201 if (UseCompressedOops) { 3202 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_int_oop_copy(false, true, &entry, 3203 "oop_disjoint_arraycopy"); 3204 StubRoutines::_oop_arraycopy = generate_conjoint_int_oop_copy(false, true, entry, 3205 &entry_oop_arraycopy, "oop_arraycopy"); 3206 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_int_oop_copy(false, true, &entry, 3207 "oop_disjoint_arraycopy_uninit", 3208 /*dest_uninitialized*/true); 3209 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_int_oop_copy(false, true, entry, 3210 NULL, "oop_arraycopy_uninit", 3211 /*dest_uninitialized*/true); 3212 } else { 3213 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_long_oop_copy(false, true, &entry, 3214 "oop_disjoint_arraycopy"); 3215 StubRoutines::_oop_arraycopy = generate_conjoint_long_oop_copy(false, true, entry, 3216 &entry_oop_arraycopy, "oop_arraycopy"); 3217 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_long_oop_copy(false, true, &entry, 3218 "oop_disjoint_arraycopy_uninit", 3219 /*dest_uninitialized*/true); 3220 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_long_oop_copy(false, true, entry, 3221 NULL, "oop_arraycopy_uninit", 3222 /*dest_uninitialized*/true); 3223 } 3224 3225 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 3226 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 3227 /*dest_uninitialized*/true); 3228 3229 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 3230 entry_jbyte_arraycopy, 3231 entry_jshort_arraycopy, 3232 entry_jint_arraycopy, 3233 entry_jlong_arraycopy); 3234 StubRoutines::_generic_arraycopy = 
generate_generic_copy("generic_arraycopy", 3235 entry_jbyte_arraycopy, 3236 entry_jshort_arraycopy, 3237 entry_jint_arraycopy, 3238 entry_oop_arraycopy, 3239 entry_jlong_arraycopy, 3240 entry_checkcast_arraycopy); 3241 3242 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 3243 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 3244 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 3245 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 3246 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 3247 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 3248 3249 // We don't generate specialized code for HeapWord-aligned source 3250 // arrays, so just use the code we've already generated 3251 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = StubRoutines::_jbyte_disjoint_arraycopy; 3252 StubRoutines::_arrayof_jbyte_arraycopy = StubRoutines::_jbyte_arraycopy; 3253 3254 StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy; 3255 StubRoutines::_arrayof_jshort_arraycopy = StubRoutines::_jshort_arraycopy; 3256 3257 StubRoutines::_arrayof_jint_disjoint_arraycopy = StubRoutines::_jint_disjoint_arraycopy; 3258 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy; 3259 3260 StubRoutines::_arrayof_jlong_disjoint_arraycopy = StubRoutines::_jlong_disjoint_arraycopy; 3261 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy; 3262 3263 StubRoutines::_arrayof_oop_disjoint_arraycopy = StubRoutines::_oop_disjoint_arraycopy; 3264 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy; 3265 3266 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = StubRoutines::_oop_disjoint_arraycopy_uninit; 3267 StubRoutines::_arrayof_oop_arraycopy_uninit = StubRoutines::_oop_arraycopy_uninit; 3268 } 3269 3270 // AES intrinsic stubs 3271 enum {AESBlockSize = 16}; 3272 3273 address generate_key_shuffle_mask() { 3274 __ align(16); 3275 StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask"); 3276 address start = __ pc(); 3277 __ emit_data64( 0x0405060700010203, relocInfo::none ); 3278 __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none ); 3279 return start; 3280 } 3281 3282 address generate_counter_shuffle_mask() { 3283 __ align(16); 3284 StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask"); 3285 address start = __ pc(); 3286 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 3287 __ emit_data64(0x0001020304050607, relocInfo::none); 3288 return start; 3289 } 3290 3291 // Utility routine for loading a 128-bit key word in little endian format 3292 // can optionally specify that the shuffle mask is already in an xmmregister 3293 void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) { 3294 __ movdqu(xmmdst, Address(key, offset)); 3295 if (xmm_shuf_mask != NULL) { 3296 __ pshufb(xmmdst, xmm_shuf_mask); 3297 } else { 3298 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3299 } 3300 } 3301 3302 // Utility routine for increase 128bit counter (iv in CTR mode) 3303 void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) { 3304 __ pextrq(reg, xmmdst, 0x0); 3305 __ addq(reg, inc_delta); 3306 __ pinsrq(xmmdst, reg, 0x0); 3307 __ jcc(Assembler::carryClear, next_block); // jump if no carry 3308 __ pextrq(reg, xmmdst, 0x01); // Carry 3309 __ addq(reg, 0x01); 3310 __ 
pinsrq(xmmdst, reg, 0x01); //Carry end 3311 __ BIND(next_block); // next instruction 3312 } 3313 3314 // Arguments: 3315 // 3316 // Inputs: 3317 // c_rarg0 - source byte array address 3318 // c_rarg1 - destination byte array address 3319 // c_rarg2 - K (key) in little endian int array 3320 // 3321 address generate_aescrypt_encryptBlock() { 3322 assert(UseAES, "need AES instructions and misaligned SSE support"); 3323 __ align(CodeEntryAlignment); 3324 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 3325 Label L_doLast; 3326 address start = __ pc(); 3327 3328 const Register from = c_rarg0; // source array address 3329 const Register to = c_rarg1; // destination array address 3330 const Register key = c_rarg2; // key array address 3331 const Register keylen = rax; 3332 3333 const XMMRegister xmm_result = xmm0; 3334 const XMMRegister xmm_key_shuf_mask = xmm1; 3335 // On win64 xmm6-xmm15 must be preserved so don't use them. 3336 const XMMRegister xmm_temp1 = xmm2; 3337 const XMMRegister xmm_temp2 = xmm3; 3338 const XMMRegister xmm_temp3 = xmm4; 3339 const XMMRegister xmm_temp4 = xmm5; 3340 3341 __ enter(); // required for proper stackwalking of RuntimeStub frame 3342 3343 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} 3344 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3345 3346 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3347 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input 3348 3349 // For encryption, the java expanded key ordering is just what we need 3350 // we don't know if the key is aligned, hence not using load-execute form 3351 3352 load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask); 3353 __ pxor(xmm_result, xmm_temp1); 3354 3355 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); 3356 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); 3357 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); 3358 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); 3359 3360 __ aesenc(xmm_result, xmm_temp1); 3361 __ aesenc(xmm_result, xmm_temp2); 3362 __ aesenc(xmm_result, xmm_temp3); 3363 __ aesenc(xmm_result, xmm_temp4); 3364 3365 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); 3366 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); 3367 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); 3368 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); 3369 3370 __ aesenc(xmm_result, xmm_temp1); 3371 __ aesenc(xmm_result, xmm_temp2); 3372 __ aesenc(xmm_result, xmm_temp3); 3373 __ aesenc(xmm_result, xmm_temp4); 3374 3375 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); 3376 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); 3377 3378 __ cmpl(keylen, 44); 3379 __ jccb(Assembler::equal, L_doLast); 3380 3381 __ aesenc(xmm_result, xmm_temp1); 3382 __ aesenc(xmm_result, xmm_temp2); 3383 3384 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); 3385 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); 3386 3387 __ cmpl(keylen, 52); 3388 __ jccb(Assembler::equal, L_doLast); 3389 3390 __ aesenc(xmm_result, xmm_temp1); 3391 __ aesenc(xmm_result, xmm_temp2); 3392 3393 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); 3394 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); 3395 3396 __ BIND(L_doLast); 3397 __ aesenc(xmm_result, xmm_temp1); 3398 __ aesenclast(xmm_result, xmm_temp2); 3399 __ movdqu(Address(to, 0), xmm_result); // store the result 3400 __ xorptr(rax, rax); // return 0 3401 __ leave(); // required for proper stackwalking of RuntimeStub frame 3402 __ 
ret(0); 3403 3404 return start; 3405 } 3406 3407 3408 // Arguments: 3409 // 3410 // Inputs: 3411 // c_rarg0 - source byte array address 3412 // c_rarg1 - destination byte array address 3413 // c_rarg2 - K (key) in little endian int array 3414 // 3415 address generate_aescrypt_decryptBlock() { 3416 assert(UseAES, "need AES instructions and misaligned SSE support"); 3417 __ align(CodeEntryAlignment); 3418 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 3419 Label L_doLast; 3420 address start = __ pc(); 3421 3422 const Register from = c_rarg0; // source array address 3423 const Register to = c_rarg1; // destination array address 3424 const Register key = c_rarg2; // key array address 3425 const Register keylen = rax; 3426 3427 const XMMRegister xmm_result = xmm0; 3428 const XMMRegister xmm_key_shuf_mask = xmm1; 3429 // On win64 xmm6-xmm15 must be preserved so don't use them. 3430 const XMMRegister xmm_temp1 = xmm2; 3431 const XMMRegister xmm_temp2 = xmm3; 3432 const XMMRegister xmm_temp3 = xmm4; 3433 const XMMRegister xmm_temp4 = xmm5; 3434 3435 __ enter(); // required for proper stackwalking of RuntimeStub frame 3436 3437 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} 3438 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3439 3440 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3441 __ movdqu(xmm_result, Address(from, 0)); 3442 3443 // for decryption java expanded key ordering is rotated one position from what we want 3444 // so we start from 0x10 here and hit 0x00 last 3445 // we don't know if the key is aligned, hence not using load-execute form 3446 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); 3447 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); 3448 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); 3449 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); 3450 3451 __ pxor (xmm_result, xmm_temp1); 3452 __ aesdec(xmm_result, xmm_temp2); 3453 __ aesdec(xmm_result, xmm_temp3); 3454 __ aesdec(xmm_result, xmm_temp4); 3455 3456 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); 3457 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); 3458 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); 3459 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); 3460 3461 __ aesdec(xmm_result, xmm_temp1); 3462 __ aesdec(xmm_result, xmm_temp2); 3463 __ aesdec(xmm_result, xmm_temp3); 3464 __ aesdec(xmm_result, xmm_temp4); 3465 3466 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); 3467 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); 3468 load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask); 3469 3470 __ cmpl(keylen, 44); 3471 __ jccb(Assembler::equal, L_doLast); 3472 3473 __ aesdec(xmm_result, xmm_temp1); 3474 __ aesdec(xmm_result, xmm_temp2); 3475 3476 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); 3477 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); 3478 3479 __ cmpl(keylen, 52); 3480 __ jccb(Assembler::equal, L_doLast); 3481 3482 __ aesdec(xmm_result, xmm_temp1); 3483 __ aesdec(xmm_result, xmm_temp2); 3484 3485 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); 3486 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); 3487 3488 __ BIND(L_doLast); 3489 __ aesdec(xmm_result, xmm_temp1); 3490 __ aesdec(xmm_result, xmm_temp2); 3491 3492 // for decryption the aesdeclast operation is always on key+0x00 3493 __ aesdeclast(xmm_result, xmm_temp3); 3494 __ movdqu(Address(to, 0), xmm_result); // store the result 3495 __ xorptr(rax, rax); // return 0 3496 __ leave(); // 
required for proper stackwalking of RuntimeStub frame 3497 __ ret(0); 3498 3499 return start; 3500 } 3501 3502 3503 // Arguments: 3504 // 3505 // Inputs: 3506 // c_rarg0 - source byte array address 3507 // c_rarg1 - destination byte array address 3508 // c_rarg2 - K (key) in little endian int array 3509 // c_rarg3 - r vector byte array address 3510 // c_rarg4 - input length 3511 // 3512 // Output: 3513 // rax - input length 3514 // 3515 address generate_cipherBlockChaining_encryptAESCrypt() { 3516 assert(UseAES, "need AES instructions and misaligned SSE support"); 3517 __ align(CodeEntryAlignment); 3518 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 3519 address start = __ pc(); 3520 3521 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; 3522 const Register from = c_rarg0; // source array address 3523 const Register to = c_rarg1; // destination array address 3524 const Register key = c_rarg2; // key array address 3525 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3526 // and left with the results of the last encryption block 3527 #ifndef _WIN64 3528 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3529 #else 3530 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 3531 const Register len_reg = r11; // pick the volatile windows register 3532 #endif 3533 const Register pos = rax; 3534 3535 // xmm register assignments for the loops below 3536 const XMMRegister xmm_result = xmm0; 3537 const XMMRegister xmm_temp = xmm1; 3538 // keys 0-10 preloaded into xmm2-xmm12 3539 const int XMM_REG_NUM_KEY_FIRST = 2; 3540 const int XMM_REG_NUM_KEY_LAST = 15; 3541 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3542 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10); 3543 const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11); 3544 const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12); 3545 const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13); 3546 3547 __ enter(); // required for proper stackwalking of RuntimeStub frame 3548 3549 #ifdef _WIN64 3550 // on win64, fill len_reg from stack position 3551 __ movl(len_reg, len_mem); 3552 #else 3553 __ push(len_reg); // Save 3554 #endif 3555 3556 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front 3557 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3558 // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0 3559 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) { 3560 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3561 offset += 0x10; 3562 } 3563 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec 3564 3565 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3566 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3567 __ cmpl(rax, 44); 3568 __ jcc(Assembler::notEqual, L_key_192_256); 3569 3570 // 128 bit code follows here 3571 __ movptr(pos, 0); 3572 __ align(OptoLoopAlignment); 3573 3574 __ BIND(L_loopTop_128); 3575 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3576 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3577 __ pxor (xmm_result, xmm_key0); // do the 
aes rounds 3578 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) { 3579 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3580 } 3581 __ aesenclast(xmm_result, xmm_key10); 3582 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3583 // no need to store r to memory until we exit 3584 __ addptr(pos, AESBlockSize); 3585 __ subptr(len_reg, AESBlockSize); 3586 __ jcc(Assembler::notEqual, L_loopTop_128); 3587 3588 __ BIND(L_exit); 3589 __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object 3590 3591 #ifdef _WIN64 3592 __ movl(rax, len_mem); 3593 #else 3594 __ pop(rax); // return length 3595 #endif 3596 __ leave(); // required for proper stackwalking of RuntimeStub frame 3597 __ ret(0); 3598 3599 __ BIND(L_key_192_256); 3600 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 3601 load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask); 3602 load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask); 3603 __ cmpl(rax, 52); 3604 __ jcc(Assembler::notEqual, L_key_256); 3605 3606 // 192-bit code follows here (could be changed to use more xmm registers) 3607 __ movptr(pos, 0); 3608 __ align(OptoLoopAlignment); 3609 3610 __ BIND(L_loopTop_192); 3611 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3612 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3613 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3614 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) { 3615 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3616 } 3617 __ aesenclast(xmm_result, xmm_key12); 3618 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3619 // no need to store r to memory until we exit 3620 __ addptr(pos, AESBlockSize); 3621 __ subptr(len_reg, AESBlockSize); 3622 __ jcc(Assembler::notEqual, L_loopTop_192); 3623 __ jmp(L_exit); 3624 3625 __ BIND(L_key_256); 3626 // 256-bit code follows here (could be changed to use more xmm registers) 3627 load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask); 3628 __ movptr(pos, 0); 3629 __ align(OptoLoopAlignment); 3630 3631 __ BIND(L_loopTop_256); 3632 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3633 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3634 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3635 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) { 3636 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3637 } 3638 load_key(xmm_temp, key, 0xe0); 3639 __ aesenclast(xmm_result, xmm_temp); 3640 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3641 // no need to store r to memory until we exit 3642 __ addptr(pos, AESBlockSize); 3643 __ subptr(len_reg, AESBlockSize); 3644 __ jcc(Assembler::notEqual, L_loopTop_256); 3645 __ jmp(L_exit); 3646 3647 return start; 3648 } 3649 3650 // Safefetch stubs. 
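  // These stubs perform loads that may legally fault: the VM records the
  // faulting pc and, if a trap happens there, the signal handler resumes at
  // the continuation pc with errValue as the result. A rough usage sketch
  // (illustrative only; the actual entry points are installed through the
  // entry/fault_pc/continuation_pc out-parameters below):
  //
  //   int      v = SafeFetch32((int*)maybe_bad_addr,      /*errValue=*/ -1);
  //   intptr_t p = SafeFetchN ((intptr_t*)maybe_bad_addr, /*errValue=*/  0);
  //
  // Either call returns *adr when the address is readable, else errValue.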
3651 void generate_safefetch(const char* name, int size, address* entry, 3652 address* fault_pc, address* continuation_pc) { 3653 // safefetch signatures: 3654 // int SafeFetch32(int* adr, int errValue); 3655 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3656 // 3657 // arguments: 3658 // c_rarg0 = adr 3659 // c_rarg1 = errValue 3660 // 3661 // result: 3662 // PPC_RET = *adr or errValue 3663 3664 StubCodeMark mark(this, "StubRoutines", name); 3665 3666 // Entry point, pc or function descriptor. 3667 *entry = __ pc(); 3668 3669 // Load *adr into c_rarg1, may fault. 3670 *fault_pc = __ pc(); 3671 switch (size) { 3672 case 4: 3673 // int32_t 3674 __ movl(c_rarg1, Address(c_rarg0, 0)); 3675 break; 3676 case 8: 3677 // int64_t 3678 __ movq(c_rarg1, Address(c_rarg0, 0)); 3679 break; 3680 default: 3681 ShouldNotReachHere(); 3682 } 3683 3684 // return errValue or *adr 3685 *continuation_pc = __ pc(); 3686 __ movq(rax, c_rarg1); 3687 __ ret(0); 3688 } 3689 3690 // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time 3691 // to hide instruction latency 3692 // 3693 // Arguments: 3694 // 3695 // Inputs: 3696 // c_rarg0 - source byte array address 3697 // c_rarg1 - destination byte array address 3698 // c_rarg2 - K (key) in little endian int array 3699 // c_rarg3 - r vector byte array address 3700 // c_rarg4 - input length 3701 // 3702 // Output: 3703 // rax - input length 3704 // 3705 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 3706 assert(UseAES, "need AES instructions and misaligned SSE support"); 3707 __ align(CodeEntryAlignment); 3708 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 3709 address start = __ pc(); 3710 3711 const Register from = c_rarg0; // source array address 3712 const Register to = c_rarg1; // destination array address 3713 const Register key = c_rarg2; // key array address 3714 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3715 // and left with the results of the last encryption block 3716 #ifndef _WIN64 3717 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3718 #else 3719 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 3720 const Register len_reg = r11; // pick the volatile windows register 3721 #endif 3722 const Register pos = rax; 3723 3724 const int PARALLEL_FACTOR = 4; 3725 const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256 3726 3727 Label L_exit; 3728 Label L_singleBlock_loopTopHead[3]; // 128, 192, 256 3729 Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256 3730 Label L_singleBlock_loopTop[3]; // 128, 192, 256 3731 Label L_multiBlock_loopTopHead[3]; // 128, 192, 256 3732 Label L_multiBlock_loopTop[3]; // 128, 192, 256 3733 3734 // keys 0-10 preloaded into xmm5-xmm15 3735 const int XMM_REG_NUM_KEY_FIRST = 5; 3736 const int XMM_REG_NUM_KEY_LAST = 15; 3737 const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3738 const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); 3739 3740 __ enter(); // required for proper stackwalking of RuntimeStub frame 3741 3742 #ifdef _WIN64 3743 // on win64, fill len_reg from stack position 3744 __ movl(len_reg, len_mem); 3745 #else 3746 __ push(len_reg); // Save 3747 #endif 3748 __ push(rbx); 3749 // the java expanded key ordering is rotated one position from what we want 3750 // so we start from 0x10 here and hit 0x00 last 3751 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily 
to swap key bytes up front 3752 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3753 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 3754 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) { 3755 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3756 offset += 0x10; 3757 } 3758 load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask); 3759 3760 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block 3761 3762 // registers holding the four results in the parallelized loop 3763 const XMMRegister xmm_result0 = xmm0; 3764 const XMMRegister xmm_result1 = xmm2; 3765 const XMMRegister xmm_result2 = xmm3; 3766 const XMMRegister xmm_result3 = xmm4; 3767 3768 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec 3769 3770 __ xorptr(pos, pos); 3771 3772 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3773 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3774 __ cmpl(rbx, 52); 3775 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]); 3776 __ cmpl(rbx, 60); 3777 __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]); 3778 3779 #define DoFour(opc, src_reg) \ 3780 __ opc(xmm_result0, src_reg); \ 3781 __ opc(xmm_result1, src_reg); \ 3782 __ opc(xmm_result2, src_reg); \ 3783 __ opc(xmm_result3, src_reg); \ 3784 3785 for (int k = 0; k < 3; ++k) { 3786 __ BIND(L_multiBlock_loopTopHead[k]); 3787 if (k != 0) { 3788 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left 3789 __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]); 3790 } 3791 if (k == 1) { 3792 __ subptr(rsp, 6 * wordSize); 3793 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15 3794 load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0 3795 __ movdqu(Address(rsp, 2 * wordSize), xmm15); 3796 load_key(xmm1, key, 0xc0); // 0xc0; 3797 __ movdqu(Address(rsp, 4 * wordSize), xmm1); 3798 } else if (k == 2) { 3799 __ subptr(rsp, 10 * wordSize); 3800 __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15 3801 load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes upto 0xe0 3802 __ movdqu(Address(rsp, 6 * wordSize), xmm15); 3803 load_key(xmm1, key, 0xe0); // 0xe0; 3804 __ movdqu(Address(rsp, 8 * wordSize), xmm1); 3805 load_key(xmm15, key, 0xb0); // 0xb0; 3806 __ movdqu(Address(rsp, 2 * wordSize), xmm15); 3807 load_key(xmm1, key, 0xc0); // 0xc0; 3808 __ movdqu(Address(rsp, 4 * wordSize), xmm1); 3809 } 3810 __ align(OptoLoopAlignment); 3811 __ BIND(L_multiBlock_loopTop[k]); 3812 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left 3813 __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]); 3814 3815 if (k != 0) { 3816 __ movdqu(xmm15, Address(rsp, 2 * wordSize)); 3817 __ movdqu(xmm1, Address(rsp, 4 * wordSize)); 3818 } 3819 3820 __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmmresult registers 3821 __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 3822 __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 3823 __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); 3824 3825 DoFour(pxor, xmm_key_first); 3826 if (k == 0) { 3827 for (int rnum = 1; rnum < ROUNDS[k]; rnum++) { 3828 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3829 } 
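          // (k == 0, AES-128: the loop above ran rounds 1..9; the final round
          //  below uses aesdeclast with the 0x00 round key, which the rotated
          //  Java key layout left preloaded in xmm15 / xmm_key_last.)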
3830 DoFour(aesdeclast, xmm_key_last); 3831 } else if (k == 1) { 3832 for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) { 3833 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3834 } 3835 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again. 3836 DoFour(aesdec, xmm1); // key : 0xc0 3837 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again 3838 DoFour(aesdeclast, xmm_key_last); 3839 } else if (k == 2) { 3840 for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) { 3841 DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3842 } 3843 DoFour(aesdec, xmm1); // key : 0xc0 3844 __ movdqu(xmm15, Address(rsp, 6 * wordSize)); 3845 __ movdqu(xmm1, Address(rsp, 8 * wordSize)); 3846 DoFour(aesdec, xmm15); // key : 0xd0 3847 __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again. 3848 DoFour(aesdec, xmm1); // key : 0xe0 3849 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // xmm1 needs to be loaded again 3850 DoFour(aesdeclast, xmm_key_last); 3851 } 3852 3853 // for each result, xor with the r vector of previous cipher block 3854 __ pxor(xmm_result0, xmm_prev_block_cipher); 3855 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 3856 __ pxor(xmm_result1, xmm_prev_block_cipher); 3857 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 3858 __ pxor(xmm_result2, xmm_prev_block_cipher); 3859 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 3860 __ pxor(xmm_result3, xmm_prev_block_cipher); 3861 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize)); // this will carry over to next set of blocks 3862 if (k != 0) { 3863 __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher); 3864 } 3865 3866 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output 3867 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); 3868 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); 3869 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); 3870 3871 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); 3872 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); 3873 __ jmp(L_multiBlock_loopTop[k]); 3874 3875 // registers used in the non-parallelized loops 3876 // xmm register assignments for the loops below 3877 const XMMRegister xmm_result = xmm0; 3878 const XMMRegister xmm_prev_block_cipher_save = xmm2; 3879 const XMMRegister xmm_key11 = xmm3; 3880 const XMMRegister xmm_key12 = xmm4; 3881 const XMMRegister key_tmp = xmm4; 3882 3883 __ BIND(L_singleBlock_loopTopHead[k]); 3884 if (k == 1) { 3885 __ addptr(rsp, 6 * wordSize); 3886 } else if (k == 2) { 3887 __ addptr(rsp, 10 * wordSize); 3888 } 3889 __ cmpptr(len_reg, 0); // any blocks left?? 
3890 __ jcc(Assembler::equal, L_exit); 3891 __ BIND(L_singleBlock_loopTopHead2[k]); 3892 if (k == 1) { 3893 load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes upto 0xc0 3894 load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes upto 0xc0 3895 } 3896 if (k == 2) { 3897 load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes upto 0xe0 3898 } 3899 __ align(OptoLoopAlignment); 3900 __ BIND(L_singleBlock_loopTop[k]); 3901 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 3902 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 3903 __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds 3904 for (int rnum = 1; rnum <= 9 ; rnum++) { 3905 __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST)); 3906 } 3907 if (k == 1) { 3908 __ aesdec(xmm_result, xmm_key11); 3909 __ aesdec(xmm_result, xmm_key12); 3910 } 3911 if (k == 2) { 3912 __ aesdec(xmm_result, xmm_key11); 3913 load_key(key_tmp, key, 0xc0); 3914 __ aesdec(xmm_result, key_tmp); 3915 load_key(key_tmp, key, 0xd0); 3916 __ aesdec(xmm_result, key_tmp); 3917 load_key(key_tmp, key, 0xe0); 3918 __ aesdec(xmm_result, key_tmp); 3919 } 3920 3921 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 3922 __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3923 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3924 // no need to store r to memory until we exit 3925 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3926 __ addptr(pos, AESBlockSize); 3927 __ subptr(len_reg, AESBlockSize); 3928 __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]); 3929 if (k != 2) { 3930 __ jmp(L_exit); 3931 } 3932 } //for 128/192/256 3933 3934 __ BIND(L_exit); 3935 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object 3936 __ pop(rbx); 3937 #ifdef _WIN64 3938 __ movl(rax, len_mem); 3939 #else 3940 __ pop(rax); // return length 3941 #endif 3942 __ leave(); // required for proper stackwalking of RuntimeStub frame 3943 __ ret(0); 3944 return start; 3945 } 3946 3947 address generate_upper_word_mask() { 3948 __ align(64); 3949 StubCodeMark mark(this, "StubRoutines", "upper_word_mask"); 3950 address start = __ pc(); 3951 __ emit_data64(0x0000000000000000, relocInfo::none); 3952 __ emit_data64(0xFFFFFFFF00000000, relocInfo::none); 3953 return start; 3954 } 3955 3956 address generate_shuffle_byte_flip_mask() { 3957 __ align(64); 3958 StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask"); 3959 address start = __ pc(); 3960 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 3961 __ emit_data64(0x0001020304050607, relocInfo::none); 3962 return start; 3963 } 3964 3965 // ofs and limit are use for multi-block byte array. 
3966 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 3967 address generate_sha1_implCompress(bool multi_block, const char *name) { 3968 __ align(CodeEntryAlignment); 3969 StubCodeMark mark(this, "StubRoutines", name); 3970 address start = __ pc(); 3971 3972 Register buf = c_rarg0; 3973 Register state = c_rarg1; 3974 Register ofs = c_rarg2; 3975 Register limit = c_rarg3; 3976 3977 const XMMRegister abcd = xmm0; 3978 const XMMRegister e0 = xmm1; 3979 const XMMRegister e1 = xmm2; 3980 const XMMRegister msg0 = xmm3; 3981 3982 const XMMRegister msg1 = xmm4; 3983 const XMMRegister msg2 = xmm5; 3984 const XMMRegister msg3 = xmm6; 3985 const XMMRegister shuf_mask = xmm7; 3986 3987 __ enter(); 3988 3989 __ subptr(rsp, 4 * wordSize); 3990 3991 __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask, 3992 buf, state, ofs, limit, rsp, multi_block); 3993 3994 __ addptr(rsp, 4 * wordSize); 3995 3996 __ leave(); 3997 __ ret(0); 3998 return start; 3999 } 4000 4001 address generate_pshuffle_byte_flip_mask() { 4002 __ align(64); 4003 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask"); 4004 address start = __ pc(); 4005 __ emit_data64(0x0405060700010203, relocInfo::none); 4006 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); 4007 4008 if (VM_Version::supports_avx2()) { 4009 __ emit_data64(0x0405060700010203, relocInfo::none); // second copy 4010 __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); 4011 // _SHUF_00BA 4012 __ emit_data64(0x0b0a090803020100, relocInfo::none); 4013 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 4014 __ emit_data64(0x0b0a090803020100, relocInfo::none); 4015 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 4016 // _SHUF_DC00 4017 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 4018 __ emit_data64(0x0b0a090803020100, relocInfo::none); 4019 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 4020 __ emit_data64(0x0b0a090803020100, relocInfo::none); 4021 } 4022 4023 return start; 4024 } 4025 4026 //Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb. 4027 address generate_pshuffle_byte_flip_mask_sha512() { 4028 __ align(32); 4029 StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512"); 4030 address start = __ pc(); 4031 if (VM_Version::supports_avx2()) { 4032 __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK 4033 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); 4034 __ emit_data64(0x1011121314151617, relocInfo::none); 4035 __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none); 4036 __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO 4037 __ emit_data64(0x0000000000000000, relocInfo::none); 4038 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 4039 __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none); 4040 } 4041 4042 return start; 4043 } 4044 4045 // ofs and limit are use for multi-block byte array. 
4046 // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit) 4047 address generate_sha256_implCompress(bool multi_block, const char *name) { 4048 assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), ""); 4049 __ align(CodeEntryAlignment); 4050 StubCodeMark mark(this, "StubRoutines", name); 4051 address start = __ pc(); 4052 4053 Register buf = c_rarg0; 4054 Register state = c_rarg1; 4055 Register ofs = c_rarg2; 4056 Register limit = c_rarg3; 4057 4058 const XMMRegister msg = xmm0; 4059 const XMMRegister state0 = xmm1; 4060 const XMMRegister state1 = xmm2; 4061 const XMMRegister msgtmp0 = xmm3; 4062 4063 const XMMRegister msgtmp1 = xmm4; 4064 const XMMRegister msgtmp2 = xmm5; 4065 const XMMRegister msgtmp3 = xmm6; 4066 const XMMRegister msgtmp4 = xmm7; 4067 4068 const XMMRegister shuf_mask = xmm8; 4069 4070 __ enter(); 4071 4072 __ subptr(rsp, 4 * wordSize); 4073 4074 if (VM_Version::supports_sha()) { 4075 __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, 4076 buf, state, ofs, limit, rsp, multi_block, shuf_mask); 4077 } else if (VM_Version::supports_avx2()) { 4078 __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, 4079 buf, state, ofs, limit, rsp, multi_block, shuf_mask); 4080 } 4081 __ addptr(rsp, 4 * wordSize); 4082 __ vzeroupper(); 4083 __ leave(); 4084 __ ret(0); 4085 return start; 4086 } 4087 4088 address generate_sha512_implCompress(bool multi_block, const char *name) { 4089 assert(VM_Version::supports_avx2(), ""); 4090 assert(VM_Version::supports_bmi2(), ""); 4091 __ align(CodeEntryAlignment); 4092 StubCodeMark mark(this, "StubRoutines", name); 4093 address start = __ pc(); 4094 4095 Register buf = c_rarg0; 4096 Register state = c_rarg1; 4097 Register ofs = c_rarg2; 4098 Register limit = c_rarg3; 4099 4100 const XMMRegister msg = xmm0; 4101 const XMMRegister state0 = xmm1; 4102 const XMMRegister state1 = xmm2; 4103 const XMMRegister msgtmp0 = xmm3; 4104 const XMMRegister msgtmp1 = xmm4; 4105 const XMMRegister msgtmp2 = xmm5; 4106 const XMMRegister msgtmp3 = xmm6; 4107 const XMMRegister msgtmp4 = xmm7; 4108 4109 const XMMRegister shuf_mask = xmm8; 4110 4111 __ enter(); 4112 4113 __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4, 4114 buf, state, ofs, limit, rsp, multi_block, shuf_mask); 4115 4116 __ vzeroupper(); 4117 __ leave(); 4118 __ ret(0); 4119 return start; 4120 } 4121 4122 // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time 4123 // to hide instruction latency 4124 // 4125 // Arguments: 4126 // 4127 // Inputs: 4128 // c_rarg0 - source byte array address 4129 // c_rarg1 - destination byte array address 4130 // c_rarg2 - K (key) in little endian int array 4131 // c_rarg3 - counter vector byte array address 4132 // Linux 4133 // c_rarg4 - input length 4134 // c_rarg5 - saved encryptedCounter start 4135 // rbp + 6 * wordSize - saved used length 4136 // Windows 4137 // rbp + 6 * wordSize - input length 4138 // rbp + 7 * wordSize - saved encryptedCounter start 4139 // rbp + 8 * wordSize - saved used length 4140 // 4141 // Output: 4142 // rax - input length 4143 // 4144 address generate_counterMode_AESCrypt_Parallel() { 4145 assert(UseAES, "need AES instructions and misaligned SSE support"); 4146 __ align(CodeEntryAlignment); 4147 StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt"); 4148 address start = __ pc(); 4149 const Register from = c_rarg0; // source array address 4150 const Register to = c_rarg1; 
// destination array address 4151 const Register key = c_rarg2; // key array address 4152 const Register counter = c_rarg3; // counter byte array initialized from counter array address 4153 // and updated with the incremented counter in the end 4154 #ifndef _WIN64 4155 const Register len_reg = c_rarg4; 4156 const Register saved_encCounter_start = c_rarg5; 4157 const Register used_addr = r10; 4158 const Address used_mem(rbp, 2 * wordSize); 4159 const Register used = r11; 4160 #else 4161 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 4162 const Address saved_encCounter_mem(rbp, 7 * wordSize); // length is on stack on Win64 4163 const Address used_mem(rbp, 8 * wordSize); // length is on stack on Win64 4164 const Register len_reg = r10; // pick the first volatile windows register 4165 const Register saved_encCounter_start = r11; 4166 const Register used_addr = r13; 4167 const Register used = r14; 4168 #endif 4169 const Register pos = rax; 4170 4171 const int PARALLEL_FACTOR = 6; 4172 const XMMRegister xmm_counter_shuf_mask = xmm0; 4173 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front 4174 const XMMRegister xmm_curr_counter = xmm2; 4175 4176 const XMMRegister xmm_key_tmp0 = xmm3; 4177 const XMMRegister xmm_key_tmp1 = xmm4; 4178 4179 // registers holding the four results in the parallelized loop 4180 const XMMRegister xmm_result0 = xmm5; 4181 const XMMRegister xmm_result1 = xmm6; 4182 const XMMRegister xmm_result2 = xmm7; 4183 const XMMRegister xmm_result3 = xmm8; 4184 const XMMRegister xmm_result4 = xmm9; 4185 const XMMRegister xmm_result5 = xmm10; 4186 4187 const XMMRegister xmm_from0 = xmm11; 4188 const XMMRegister xmm_from1 = xmm12; 4189 const XMMRegister xmm_from2 = xmm13; 4190 const XMMRegister xmm_from3 = xmm14; //the last one is xmm14. we have to preserve it on WIN64. 4191 const XMMRegister xmm_from4 = xmm3; //reuse xmm3~4. 
Because xmm_key_tmp0~1 are useless when loading input text 4192 const XMMRegister xmm_from5 = xmm4; 4193 4194 //for key_128, key_192, key_256 4195 const int rounds[3] = {10, 12, 14}; 4196 Label L_exit_preLoop, L_preLoop_start; 4197 Label L_multiBlock_loopTop[3]; 4198 Label L_singleBlockLoopTop[3]; 4199 Label L__incCounter[3][6]; //for 6 blocks 4200 Label L__incCounter_single[3]; //for single block, key128, key192, key256 4201 Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3]; 4202 Label L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3]; 4203 4204 Label L_exit; 4205 4206 __ enter(); // required for proper stackwalking of RuntimeStub frame 4207 4208 #ifdef _WIN64 4209 // allocate spill slots for r13, r14 4210 enum { 4211 saved_r13_offset, 4212 saved_r14_offset 4213 }; 4214 __ subptr(rsp, 2 * wordSize); 4215 __ movptr(Address(rsp, saved_r13_offset * wordSize), r13); 4216 __ movptr(Address(rsp, saved_r14_offset * wordSize), r14); 4217 4218 // on win64, fill len_reg from stack position 4219 __ movl(len_reg, len_mem); 4220 __ movptr(saved_encCounter_start, saved_encCounter_mem); 4221 __ movptr(used_addr, used_mem); 4222 __ movl(used, Address(used_addr, 0)); 4223 #else 4224 __ push(len_reg); // Save 4225 __ movptr(used_addr, used_mem); 4226 __ movl(used, Address(used_addr, 0)); 4227 #endif 4228 4229 __ push(rbx); // Save RBX 4230 __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter 4231 __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()), pos); // pos as scratch 4232 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled 4233 __ movptr(pos, 0); 4234 4235 // Use the partially used encrpyted counter from last invocation 4236 __ BIND(L_preLoop_start); 4237 __ cmpptr(used, 16); 4238 __ jcc(Assembler::aboveEqual, L_exit_preLoop); 4239 __ cmpptr(len_reg, 0); 4240 __ jcc(Assembler::lessEqual, L_exit_preLoop); 4241 __ movb(rbx, Address(saved_encCounter_start, used)); 4242 __ xorb(rbx, Address(from, pos)); 4243 __ movb(Address(to, pos), rbx); 4244 __ addptr(pos, 1); 4245 __ addptr(used, 1); 4246 __ subptr(len_reg, 1); 4247 4248 __ jmp(L_preLoop_start); 4249 4250 __ BIND(L_exit_preLoop); 4251 __ movl(Address(used_addr, 0), used); 4252 4253 // key length could be only {11, 13, 15} * 4 = {44, 52, 60} 4254 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()), rbx); // rbx as scratch 4255 __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4256 __ cmpl(rbx, 52); 4257 __ jcc(Assembler::equal, L_multiBlock_loopTop[1]); 4258 __ cmpl(rbx, 60); 4259 __ jcc(Assembler::equal, L_multiBlock_loopTop[2]); 4260 4261 #define CTR_DoSix(opc, src_reg) \ 4262 __ opc(xmm_result0, src_reg); \ 4263 __ opc(xmm_result1, src_reg); \ 4264 __ opc(xmm_result2, src_reg); \ 4265 __ opc(xmm_result3, src_reg); \ 4266 __ opc(xmm_result4, src_reg); \ 4267 __ opc(xmm_result5, src_reg); 4268 4269 // k == 0 : generate code for key_128 4270 // k == 1 : generate code for key_192 4271 // k == 2 : generate code for key_256 4272 for (int k = 0; k < 3; ++k) { 4273 //multi blocks starts here 4274 __ align(OptoLoopAlignment); 4275 __ BIND(L_multiBlock_loopTop[k]); 4276 __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left 4277 __ jcc(Assembler::less, L_singleBlockLoopTop[k]); 
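      // Six-wide block body follows. A rough scalar model of what it computes
      // (pseudo-code, illustrative only; in the real code the counter lives
      // byte-shuffled in xmm_curr_counter and the round keys are reloaded from
      // 'key' on every iteration):
      //
      //   for (int b = 0; b < 6; b++) {
      //     uint128 ctr       = counter + b;                  // 128-bit add, see inc_counter()
      //     out[pos + 16 * b] = in[pos + 16 * b] ^ AES_encrypt(ctr, key);
      //   }
      //   counter += 6;  pos += 6 * 16;  len -= 6 * 16;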
4278 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); 4279 4280 //load, then increase counters 4281 CTR_DoSix(movdqa, xmm_curr_counter); 4282 inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]); 4283 inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]); 4284 inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]); 4285 inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]); 4286 inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]); 4287 inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]); 4288 CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR 4289 CTR_DoSix(pxor, xmm_key_tmp0); //PXOR with Round 0 key 4290 4291 //load two ROUND_KEYs at a time 4292 for (int i = 1; i < rounds[k]; ) { 4293 load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask); 4294 load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask); 4295 CTR_DoSix(aesenc, xmm_key_tmp1); 4296 i++; 4297 if (i != rounds[k]) { 4298 CTR_DoSix(aesenc, xmm_key_tmp0); 4299 } else { 4300 CTR_DoSix(aesenclast, xmm_key_tmp0); 4301 } 4302 i++; 4303 } 4304 4305 // get next PARALLEL_FACTOR blocks into xmm_result registers 4306 __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 4307 __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize)); 4308 __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize)); 4309 __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize)); 4310 __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize)); 4311 __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize)); 4312 4313 __ pxor(xmm_result0, xmm_from0); 4314 __ pxor(xmm_result1, xmm_from1); 4315 __ pxor(xmm_result2, xmm_from2); 4316 __ pxor(xmm_result3, xmm_from3); 4317 __ pxor(xmm_result4, xmm_from4); 4318 __ pxor(xmm_result5, xmm_from5); 4319 4320 // store 6 results into the next 64 bytes of output 4321 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); 4322 __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1); 4323 __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2); 4324 __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3); 4325 __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4); 4326 __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5); 4327 4328 __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // increase the length of crypt text 4329 __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length 4330 __ jmp(L_multiBlock_loopTop[k]); 4331 4332 // singleBlock starts here 4333 __ align(OptoLoopAlignment); 4334 __ BIND(L_singleBlockLoopTop[k]); 4335 __ cmpptr(len_reg, 0); 4336 __ jcc(Assembler::lessEqual, L_exit); 4337 load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask); 4338 __ movdqa(xmm_result0, xmm_curr_counter); 4339 inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]); 4340 __ pshufb(xmm_result0, xmm_counter_shuf_mask); 4341 __ pxor(xmm_result0, xmm_key_tmp0); 4342 for (int i = 1; i < rounds[k]; i++) { 4343 load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask); 4344 __ aesenc(xmm_result0, xmm_key_tmp0); 4345 } 4346 load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask); 4347 __ aesenclast(xmm_result0, xmm_key_tmp0); 4348 __ cmpptr(len_reg, AESBlockSize); 4349 __ jcc(Assembler::less, L_processTail_insr[k]); 4350 __ movdqu(xmm_from0, Address(from, pos, 
Address::times_1, 0 * AESBlockSize)); 4351 __ pxor(xmm_result0, xmm_from0); 4352 __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0); 4353 __ addptr(pos, AESBlockSize); 4354 __ subptr(len_reg, AESBlockSize); 4355 __ jmp(L_singleBlockLoopTop[k]); 4356 __ BIND(L_processTail_insr[k]); // Process the tail part of the input array 4357 __ addptr(pos, len_reg); // 1. Insert bytes from src array into xmm_from0 register 4358 __ testptr(len_reg, 8); 4359 __ jcc(Assembler::zero, L_processTail_4_insr[k]); 4360 __ subptr(pos,8); 4361 __ pinsrq(xmm_from0, Address(from, pos), 0); 4362 __ BIND(L_processTail_4_insr[k]); 4363 __ testptr(len_reg, 4); 4364 __ jcc(Assembler::zero, L_processTail_2_insr[k]); 4365 __ subptr(pos,4); 4366 __ pslldq(xmm_from0, 4); 4367 __ pinsrd(xmm_from0, Address(from, pos), 0); 4368 __ BIND(L_processTail_2_insr[k]); 4369 __ testptr(len_reg, 2); 4370 __ jcc(Assembler::zero, L_processTail_1_insr[k]); 4371 __ subptr(pos, 2); 4372 __ pslldq(xmm_from0, 2); 4373 __ pinsrw(xmm_from0, Address(from, pos), 0); 4374 __ BIND(L_processTail_1_insr[k]); 4375 __ testptr(len_reg, 1); 4376 __ jcc(Assembler::zero, L_processTail_exit_insr[k]); 4377 __ subptr(pos, 1); 4378 __ pslldq(xmm_from0, 1); 4379 __ pinsrb(xmm_from0, Address(from, pos), 0); 4380 __ BIND(L_processTail_exit_insr[k]); 4381 4382 __ movdqu(Address(saved_encCounter_start, 0), xmm_result0); // 2. Perform pxor of the encrypted counter and plaintext Bytes. 4383 __ pxor(xmm_result0, xmm_from0); // Also the encrypted counter is saved for next invocation. 4384 4385 __ testptr(len_reg, 8); 4386 __ jcc(Assembler::zero, L_processTail_4_extr[k]); // 3. Extract bytes from xmm_result0 into the dest. array 4387 __ pextrq(Address(to, pos), xmm_result0, 0); 4388 __ psrldq(xmm_result0, 8); 4389 __ addptr(pos, 8); 4390 __ BIND(L_processTail_4_extr[k]); 4391 __ testptr(len_reg, 4); 4392 __ jcc(Assembler::zero, L_processTail_2_extr[k]); 4393 __ pextrd(Address(to, pos), xmm_result0, 0); 4394 __ psrldq(xmm_result0, 4); 4395 __ addptr(pos, 4); 4396 __ BIND(L_processTail_2_extr[k]); 4397 __ testptr(len_reg, 2); 4398 __ jcc(Assembler::zero, L_processTail_1_extr[k]); 4399 __ pextrw(Address(to, pos), xmm_result0, 0); 4400 __ psrldq(xmm_result0, 2); 4401 __ addptr(pos, 2); 4402 __ BIND(L_processTail_1_extr[k]); 4403 __ testptr(len_reg, 1); 4404 __ jcc(Assembler::zero, L_processTail_exit_extr[k]); 4405 __ pextrb(Address(to, pos), xmm_result0, 0); 4406 4407 __ BIND(L_processTail_exit_extr[k]); 4408 __ movl(Address(used_addr, 0), len_reg); 4409 __ jmp(L_exit); 4410 4411 } 4412 4413 __ BIND(L_exit); 4414 __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back. 4415 __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back 4416 __ pop(rbx); // pop the saved RBX. 
4417 #ifdef _WIN64 4418 __ movl(rax, len_mem); 4419 __ movptr(r13, Address(rsp, saved_r13_offset * wordSize)); 4420 __ movptr(r14, Address(rsp, saved_r14_offset * wordSize)); 4421 __ addptr(rsp, 2 * wordSize); 4422 #else 4423 __ pop(rax); // return 'len' 4424 #endif 4425 __ leave(); // required for proper stackwalking of RuntimeStub frame 4426 __ ret(0); 4427 return start; 4428 } 4429 4430 void roundDec(XMMRegister xmm_reg) { 4431 __ vaesdec(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit); 4432 __ vaesdec(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit); 4433 __ vaesdec(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit); 4434 __ vaesdec(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit); 4435 __ vaesdec(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit); 4436 __ vaesdec(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit); 4437 __ vaesdec(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit); 4438 __ vaesdec(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit); 4439 } 4440 4441 void roundDeclast(XMMRegister xmm_reg) { 4442 __ vaesdeclast(xmm1, xmm1, xmm_reg, Assembler::AVX_512bit); 4443 __ vaesdeclast(xmm2, xmm2, xmm_reg, Assembler::AVX_512bit); 4444 __ vaesdeclast(xmm3, xmm3, xmm_reg, Assembler::AVX_512bit); 4445 __ vaesdeclast(xmm4, xmm4, xmm_reg, Assembler::AVX_512bit); 4446 __ vaesdeclast(xmm5, xmm5, xmm_reg, Assembler::AVX_512bit); 4447 __ vaesdeclast(xmm6, xmm6, xmm_reg, Assembler::AVX_512bit); 4448 __ vaesdeclast(xmm7, xmm7, xmm_reg, Assembler::AVX_512bit); 4449 __ vaesdeclast(xmm8, xmm8, xmm_reg, Assembler::AVX_512bit); 4450 } 4451 4452 void ev_load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask = NULL) { 4453 __ movdqu(xmmdst, Address(key, offset)); 4454 if (xmm_shuf_mask != NULL) { 4455 __ pshufb(xmmdst, xmm_shuf_mask); 4456 } else { 4457 __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 4458 } 4459 __ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit); 4460 4461 } 4462 4463 address generate_cipherBlockChaining_decryptVectorAESCrypt() { 4464 assert(VM_Version::supports_vaes(), "need AES instructions and misaligned SSE support"); 4465 __ align(CodeEntryAlignment); 4466 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 4467 address start = __ pc(); 4468 4469 const Register from = c_rarg0; // source array address 4470 const Register to = c_rarg1; // destination array address 4471 const Register key = c_rarg2; // key array address 4472 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 4473 // and left with the results of the last encryption block 4474 #ifndef _WIN64 4475 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 4476 #else 4477 const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64 4478 const Register len_reg = r11; // pick the volatile windows register 4479 #endif 4480 4481 Label Loop, Loop1, L_128, L_256, L_192, KEY_192, KEY_256, Loop2, Lcbc_dec_rem_loop, 4482 Lcbc_dec_rem_last, Lcbc_dec_ret, Lcbc_dec_rem, Lcbc_exit; 4483 4484 __ enter(); 4485 4486 #ifdef _WIN64 4487 // on win64, fill len_reg from stack position 4488 __ movl(len_reg, len_mem); 4489 #else 4490 __ push(len_reg); // Save 4491 #endif 4492 __ push(rbx); 4493 __ vzeroupper(); 4494 4495 // Temporary variable declaration for swapping key bytes 4496 const XMMRegister xmm_key_shuf_mask = xmm1; 4497 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 4498 4499 // Calculate number of rounds from key size: 44 for 10-rounds, 52 for 12-rounds, 60 for 
14-rounds 4500 const Register rounds = rbx; 4501 __ movl(rounds, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4502 4503 const XMMRegister IV = xmm0; 4504 // Load IV and broadcast value to 512-bits 4505 __ evbroadcasti64x2(IV, Address(rvec, 0), Assembler::AVX_512bit); 4506 4507 // Temporary variables for storing round keys 4508 const XMMRegister RK0 = xmm30; 4509 const XMMRegister RK1 = xmm9; 4510 const XMMRegister RK2 = xmm18; 4511 const XMMRegister RK3 = xmm19; 4512 const XMMRegister RK4 = xmm20; 4513 const XMMRegister RK5 = xmm21; 4514 const XMMRegister RK6 = xmm22; 4515 const XMMRegister RK7 = xmm23; 4516 const XMMRegister RK8 = xmm24; 4517 const XMMRegister RK9 = xmm25; 4518 const XMMRegister RK10 = xmm26; 4519 4520 // Load and shuffle key 4521 // the java expanded key ordering is rotated one position from what we want 4522 // so we start from 1*16 here and hit 0*16 last 4523 ev_load_key(RK1, key, 1 * 16, xmm_key_shuf_mask); 4524 ev_load_key(RK2, key, 2 * 16, xmm_key_shuf_mask); 4525 ev_load_key(RK3, key, 3 * 16, xmm_key_shuf_mask); 4526 ev_load_key(RK4, key, 4 * 16, xmm_key_shuf_mask); 4527 ev_load_key(RK5, key, 5 * 16, xmm_key_shuf_mask); 4528 ev_load_key(RK6, key, 6 * 16, xmm_key_shuf_mask); 4529 ev_load_key(RK7, key, 7 * 16, xmm_key_shuf_mask); 4530 ev_load_key(RK8, key, 8 * 16, xmm_key_shuf_mask); 4531 ev_load_key(RK9, key, 9 * 16, xmm_key_shuf_mask); 4532 ev_load_key(RK10, key, 10 * 16, xmm_key_shuf_mask); 4533 ev_load_key(RK0, key, 0*16, xmm_key_shuf_mask); 4534 4535 // Variables for storing source cipher text 4536 const XMMRegister S0 = xmm10; 4537 const XMMRegister S1 = xmm11; 4538 const XMMRegister S2 = xmm12; 4539 const XMMRegister S3 = xmm13; 4540 const XMMRegister S4 = xmm14; 4541 const XMMRegister S5 = xmm15; 4542 const XMMRegister S6 = xmm16; 4543 const XMMRegister S7 = xmm17; 4544 4545 // Variables for storing decrypted text 4546 const XMMRegister B0 = xmm1; 4547 const XMMRegister B1 = xmm2; 4548 const XMMRegister B2 = xmm3; 4549 const XMMRegister B3 = xmm4; 4550 const XMMRegister B4 = xmm5; 4551 const XMMRegister B5 = xmm6; 4552 const XMMRegister B6 = xmm7; 4553 const XMMRegister B7 = xmm8; 4554 4555 __ cmpl(rounds, 44); 4556 __ jcc(Assembler::greater, KEY_192); 4557 __ jmp(Loop); 4558 4559 __ BIND(KEY_192); 4560 const XMMRegister RK11 = xmm27; 4561 const XMMRegister RK12 = xmm28; 4562 ev_load_key(RK11, key, 11*16, xmm_key_shuf_mask); 4563 ev_load_key(RK12, key, 12*16, xmm_key_shuf_mask); 4564 4565 __ cmpl(rounds, 52); 4566 __ jcc(Assembler::greater, KEY_256); 4567 __ jmp(Loop); 4568 4569 __ BIND(KEY_256); 4570 const XMMRegister RK13 = xmm29; 4571 const XMMRegister RK14 = xmm31; 4572 ev_load_key(RK13, key, 13*16, xmm_key_shuf_mask); 4573 ev_load_key(RK14, key, 14*16, xmm_key_shuf_mask); 4574 4575 __ BIND(Loop); 4576 __ cmpl(len_reg, 512); 4577 __ jcc(Assembler::below, Lcbc_dec_rem); 4578 __ BIND(Loop1); 4579 __ subl(len_reg, 512); 4580 __ evmovdquq(S0, Address(from, 0 * 64), Assembler::AVX_512bit); 4581 __ evmovdquq(S1, Address(from, 1 * 64), Assembler::AVX_512bit); 4582 __ evmovdquq(S2, Address(from, 2 * 64), Assembler::AVX_512bit); 4583 __ evmovdquq(S3, Address(from, 3 * 64), Assembler::AVX_512bit); 4584 __ evmovdquq(S4, Address(from, 4 * 64), Assembler::AVX_512bit); 4585 __ evmovdquq(S5, Address(from, 5 * 64), Assembler::AVX_512bit); 4586 __ evmovdquq(S6, Address(from, 6 * 64), Assembler::AVX_512bit); 4587 __ evmovdquq(S7, Address(from, 7 * 64), Assembler::AVX_512bit); 4588 __ leaq(from, Address(from, 8 * 64)); 
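    // The code below decrypts the 32 blocks just loaded (8 zmm registers x 4
    // blocks each) using the CBC recurrence, shown here in scalar form for
    // reference (illustrative only):
    //
    //   plain[i] = AES_decrypt(cipher[i], key) ^ cipher[i - 1];   // cipher[-1] == IV
    //
    // CBC decryption parallelizes freely since every cipher[i - 1] is already
    // available as input; the evalignq chain below shifts the loaded ciphertext
    // by one 128-bit block so each lane can be xored with its predecessor.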
4589 4590 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit); 4591 __ evpxorq(B1, S1, RK1, Assembler::AVX_512bit); 4592 __ evpxorq(B2, S2, RK1, Assembler::AVX_512bit); 4593 __ evpxorq(B3, S3, RK1, Assembler::AVX_512bit); 4594 __ evpxorq(B4, S4, RK1, Assembler::AVX_512bit); 4595 __ evpxorq(B5, S5, RK1, Assembler::AVX_512bit); 4596 __ evpxorq(B6, S6, RK1, Assembler::AVX_512bit); 4597 __ evpxorq(B7, S7, RK1, Assembler::AVX_512bit); 4598 4599 __ evalignq(IV, S0, IV, 0x06); 4600 __ evalignq(S0, S1, S0, 0x06); 4601 __ evalignq(S1, S2, S1, 0x06); 4602 __ evalignq(S2, S3, S2, 0x06); 4603 __ evalignq(S3, S4, S3, 0x06); 4604 __ evalignq(S4, S5, S4, 0x06); 4605 __ evalignq(S5, S6, S5, 0x06); 4606 __ evalignq(S6, S7, S6, 0x06); 4607 4608 roundDec(RK2); 4609 roundDec(RK3); 4610 roundDec(RK4); 4611 roundDec(RK5); 4612 roundDec(RK6); 4613 roundDec(RK7); 4614 roundDec(RK8); 4615 roundDec(RK9); 4616 roundDec(RK10); 4617 4618 __ cmpl(rounds, 44); 4619 __ jcc(Assembler::belowEqual, L_128); 4620 roundDec(RK11); 4621 roundDec(RK12); 4622 4623 __ cmpl(rounds, 52); 4624 __ jcc(Assembler::belowEqual, L_192); 4625 roundDec(RK13); 4626 roundDec(RK14); 4627 4628 __ BIND(L_256); 4629 roundDeclast(RK0); 4630 __ jmp(Loop2); 4631 4632 __ BIND(L_128); 4633 roundDeclast(RK0); 4634 __ jmp(Loop2); 4635 4636 __ BIND(L_192); 4637 roundDeclast(RK0); 4638 4639 __ BIND(Loop2); 4640 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit); 4641 __ evpxorq(B1, B1, S0, Assembler::AVX_512bit); 4642 __ evpxorq(B2, B2, S1, Assembler::AVX_512bit); 4643 __ evpxorq(B3, B3, S2, Assembler::AVX_512bit); 4644 __ evpxorq(B4, B4, S3, Assembler::AVX_512bit); 4645 __ evpxorq(B5, B5, S4, Assembler::AVX_512bit); 4646 __ evpxorq(B6, B6, S5, Assembler::AVX_512bit); 4647 __ evpxorq(B7, B7, S6, Assembler::AVX_512bit); 4648 __ evmovdquq(IV, S7, Assembler::AVX_512bit); 4649 4650 __ evmovdquq(Address(to, 0 * 64), B0, Assembler::AVX_512bit); 4651 __ evmovdquq(Address(to, 1 * 64), B1, Assembler::AVX_512bit); 4652 __ evmovdquq(Address(to, 2 * 64), B2, Assembler::AVX_512bit); 4653 __ evmovdquq(Address(to, 3 * 64), B3, Assembler::AVX_512bit); 4654 __ evmovdquq(Address(to, 4 * 64), B4, Assembler::AVX_512bit); 4655 __ evmovdquq(Address(to, 5 * 64), B5, Assembler::AVX_512bit); 4656 __ evmovdquq(Address(to, 6 * 64), B6, Assembler::AVX_512bit); 4657 __ evmovdquq(Address(to, 7 * 64), B7, Assembler::AVX_512bit); 4658 __ leaq(to, Address(to, 8 * 64)); 4659 __ jmp(Loop); 4660 4661 __ BIND(Lcbc_dec_rem); 4662 __ evshufi64x2(IV, IV, IV, 0x03, Assembler::AVX_512bit); 4663 4664 __ BIND(Lcbc_dec_rem_loop); 4665 __ subl(len_reg, 16); 4666 __ jcc(Assembler::carrySet, Lcbc_dec_ret); 4667 4668 __ movdqu(S0, Address(from, 0)); 4669 __ evpxorq(B0, S0, RK1, Assembler::AVX_512bit); 4670 __ vaesdec(B0, B0, RK2, Assembler::AVX_512bit); 4671 __ vaesdec(B0, B0, RK3, Assembler::AVX_512bit); 4672 __ vaesdec(B0, B0, RK4, Assembler::AVX_512bit); 4673 __ vaesdec(B0, B0, RK5, Assembler::AVX_512bit); 4674 __ vaesdec(B0, B0, RK6, Assembler::AVX_512bit); 4675 __ vaesdec(B0, B0, RK7, Assembler::AVX_512bit); 4676 __ vaesdec(B0, B0, RK8, Assembler::AVX_512bit); 4677 __ vaesdec(B0, B0, RK9, Assembler::AVX_512bit); 4678 __ vaesdec(B0, B0, RK10, Assembler::AVX_512bit); 4679 __ cmpl(rounds, 44); 4680 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last); 4681 4682 __ vaesdec(B0, B0, RK11, Assembler::AVX_512bit); 4683 __ vaesdec(B0, B0, RK12, Assembler::AVX_512bit); 4684 __ cmpl(rounds, 52); 4685 __ jcc(Assembler::belowEqual, Lcbc_dec_rem_last); 4686 4687 __ vaesdec(B0, B0, RK13, Assembler::AVX_512bit); 4688 __ 
vaesdec(B0, B0, RK14, Assembler::AVX_512bit); 4689 4690 __ BIND(Lcbc_dec_rem_last); 4691 __ vaesdeclast(B0, B0, RK0, Assembler::AVX_512bit); 4692 4693 __ evpxorq(B0, B0, IV, Assembler::AVX_512bit); 4694 __ evmovdquq(IV, S0, Assembler::AVX_512bit); 4695 __ movdqu(Address(to, 0), B0); 4696 __ leaq(from, Address(from, 16)); 4697 __ leaq(to, Address(to, 16)); 4698 __ jmp(Lcbc_dec_rem_loop); 4699 4700 __ BIND(Lcbc_dec_ret); 4701 __ movdqu(Address(rvec, 0), IV); 4702 4703 // Zero out the round keys 4704 __ evpxorq(RK0, RK0, RK0, Assembler::AVX_512bit); 4705 __ evpxorq(RK1, RK1, RK1, Assembler::AVX_512bit); 4706 __ evpxorq(RK2, RK2, RK2, Assembler::AVX_512bit); 4707 __ evpxorq(RK3, RK3, RK3, Assembler::AVX_512bit); 4708 __ evpxorq(RK4, RK4, RK4, Assembler::AVX_512bit); 4709 __ evpxorq(RK5, RK5, RK5, Assembler::AVX_512bit); 4710 __ evpxorq(RK6, RK6, RK6, Assembler::AVX_512bit); 4711 __ evpxorq(RK7, RK7, RK7, Assembler::AVX_512bit); 4712 __ evpxorq(RK8, RK8, RK8, Assembler::AVX_512bit); 4713 __ evpxorq(RK9, RK9, RK9, Assembler::AVX_512bit); 4714 __ evpxorq(RK10, RK10, RK10, Assembler::AVX_512bit); 4715 __ cmpl(rounds, 44); 4716 __ jcc(Assembler::belowEqual, Lcbc_exit); 4717 __ evpxorq(RK11, RK11, RK11, Assembler::AVX_512bit); 4718 __ evpxorq(RK12, RK12, RK12, Assembler::AVX_512bit); 4719 __ cmpl(rounds, 52); 4720 __ jcc(Assembler::belowEqual, Lcbc_exit); 4721 __ evpxorq(RK13, RK13, RK13, Assembler::AVX_512bit); 4722 __ evpxorq(RK14, RK14, RK14, Assembler::AVX_512bit); 4723 4724 __ BIND(Lcbc_exit); 4725 __ pop(rbx); 4726 #ifdef _WIN64 4727 __ movl(rax, len_mem); 4728 #else 4729 __ pop(rax); // return length 4730 #endif 4731 __ leave(); // required for proper stackwalking of RuntimeStub frame 4732 __ ret(0); 4733 return start; 4734 } 4735 4736 // Polynomial x^128+x^127+x^126+x^121+1 4737 address ghash_polynomial_addr() { 4738 __ align(CodeEntryAlignment); 4739 StubCodeMark mark(this, "StubRoutines", "_ghash_poly_addr"); 4740 address start = __ pc(); 4741 __ emit_data64(0x0000000000000001, relocInfo::none); 4742 __ emit_data64(0xc200000000000000, relocInfo::none); 4743 return start; 4744 } 4745 4746 address ghash_shufflemask_addr() { 4747 __ align(CodeEntryAlignment); 4748 StubCodeMark mark(this, "StubRoutines", "_ghash_shuffmask_addr"); 4749 address start = __ pc(); 4750 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none); 4751 __ emit_data64(0x0f0f0f0f0f0f0f0f, relocInfo::none); 4752 return start; 4753 } 4754 4755 // Ghash single and multi block operations using AVX instructions 4756 address generate_avx_ghash_processBlocks() { 4757 __ align(CodeEntryAlignment); 4758 4759 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4760 address start = __ pc(); 4761 4762 // arguments 4763 const Register state = c_rarg0; 4764 const Register htbl = c_rarg1; 4765 const Register data = c_rarg2; 4766 const Register blocks = c_rarg3; 4767 __ enter(); 4768 // Save state before entering routine 4769 __ avx_ghash(state, htbl, data, blocks); 4770 __ leave(); // required for proper stackwalking of RuntimeStub frame 4771 __ ret(0); 4772 return start; 4773 } 4774 4775 // byte swap x86 long 4776 address generate_ghash_long_swap_mask() { 4777 __ align(CodeEntryAlignment); 4778 StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); 4779 address start = __ pc(); 4780 __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none ); 4781 __ emit_data64(0x0706050403020100, relocInfo::none ); 4782 return start; 4783 } 4784 4785 // byte swap x86 byte array 4786 address generate_ghash_byte_swap_mask() { 4787 __ 
align(CodeEntryAlignment); 4788 StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask"); 4789 address start = __ pc(); 4790 __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none ); 4791 __ emit_data64(0x0001020304050607, relocInfo::none ); 4792 return start; 4793 } 4794 4795 /* Single and multi-block ghash operations */ 4796 address generate_ghash_processBlocks() { 4797 __ align(CodeEntryAlignment); 4798 Label L_ghash_loop, L_exit; 4799 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4800 address start = __ pc(); 4801 4802 const Register state = c_rarg0; 4803 const Register subkeyH = c_rarg1; 4804 const Register data = c_rarg2; 4805 const Register blocks = c_rarg3; 4806 4807 const XMMRegister xmm_temp0 = xmm0; 4808 const XMMRegister xmm_temp1 = xmm1; 4809 const XMMRegister xmm_temp2 = xmm2; 4810 const XMMRegister xmm_temp3 = xmm3; 4811 const XMMRegister xmm_temp4 = xmm4; 4812 const XMMRegister xmm_temp5 = xmm5; 4813 const XMMRegister xmm_temp6 = xmm6; 4814 const XMMRegister xmm_temp7 = xmm7; 4815 const XMMRegister xmm_temp8 = xmm8; 4816 const XMMRegister xmm_temp9 = xmm9; 4817 const XMMRegister xmm_temp10 = xmm10; 4818 4819 __ enter(); 4820 4821 __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); 4822 4823 __ movdqu(xmm_temp0, Address(state, 0)); 4824 __ pshufb(xmm_temp0, xmm_temp10); 4825 4826 4827 __ BIND(L_ghash_loop); 4828 __ movdqu(xmm_temp2, Address(data, 0)); 4829 __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); 4830 4831 __ movdqu(xmm_temp1, Address(subkeyH, 0)); 4832 __ pshufb(xmm_temp1, xmm_temp10); 4833 4834 __ pxor(xmm_temp0, xmm_temp2); 4835 4836 // 4837 // Multiply with the hash key 4838 // 4839 __ movdqu(xmm_temp3, xmm_temp0); 4840 __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0 4841 __ movdqu(xmm_temp4, xmm_temp0); 4842 __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1 4843 4844 __ movdqu(xmm_temp5, xmm_temp0); 4845 __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0 4846 __ movdqu(xmm_temp6, xmm_temp0); 4847 __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1 4848 4849 __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0 4850 4851 __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5 4852 __ psrldq(xmm_temp4, 8); // shift by xmm4 64 bits to the right 4853 __ pslldq(xmm_temp5, 8); // shift by xmm5 64 bits to the left 4854 __ pxor(xmm_temp3, xmm_temp5); 4855 __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result 4856 // of the carry-less multiplication of 4857 // xmm0 by xmm1. 4858 4859 // We shift the result of the multiplication by one bit position 4860 // to the left to cope for the fact that the bits are reversed. 4861 __ movdqu(xmm_temp7, xmm_temp3); 4862 __ movdqu(xmm_temp8, xmm_temp6); 4863 __ pslld(xmm_temp3, 1); 4864 __ pslld(xmm_temp6, 1); 4865 __ psrld(xmm_temp7, 31); 4866 __ psrld(xmm_temp8, 31); 4867 __ movdqu(xmm_temp9, xmm_temp7); 4868 __ pslldq(xmm_temp8, 4); 4869 __ pslldq(xmm_temp7, 4); 4870 __ psrldq(xmm_temp9, 12); 4871 __ por(xmm_temp3, xmm_temp7); 4872 __ por(xmm_temp6, xmm_temp8); 4873 __ por(xmm_temp6, xmm_temp9); 4874 4875 // 4876 // First phase of the reduction 4877 // 4878 // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts 4879 // independently. 
4880 __ movdqu(xmm_temp7, xmm_temp3); 4881 __ movdqu(xmm_temp8, xmm_temp3); 4882 __ movdqu(xmm_temp9, xmm_temp3); 4883 __ pslld(xmm_temp7, 31); // packed right shift shifting << 31 4884 __ pslld(xmm_temp8, 30); // packed right shift shifting << 30 4885 __ pslld(xmm_temp9, 25); // packed right shift shifting << 25 4886 __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions 4887 __ pxor(xmm_temp7, xmm_temp9); 4888 __ movdqu(xmm_temp8, xmm_temp7); 4889 __ pslldq(xmm_temp7, 12); 4890 __ psrldq(xmm_temp8, 4); 4891 __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete 4892 4893 // 4894 // Second phase of the reduction 4895 // 4896 // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these 4897 // shift operations. 4898 __ movdqu(xmm_temp2, xmm_temp3); 4899 __ movdqu(xmm_temp4, xmm_temp3); 4900 __ movdqu(xmm_temp5, xmm_temp3); 4901 __ psrld(xmm_temp2, 1); // packed left shifting >> 1 4902 __ psrld(xmm_temp4, 2); // packed left shifting >> 2 4903 __ psrld(xmm_temp5, 7); // packed left shifting >> 7 4904 __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions 4905 __ pxor(xmm_temp2, xmm_temp5); 4906 __ pxor(xmm_temp2, xmm_temp8); 4907 __ pxor(xmm_temp3, xmm_temp2); 4908 __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6 4909 4910 __ decrement(blocks); 4911 __ jcc(Assembler::zero, L_exit); 4912 __ movdqu(xmm_temp0, xmm_temp6); 4913 __ addptr(data, 16); 4914 __ jmp(L_ghash_loop); 4915 4916 __ BIND(L_exit); 4917 __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result 4918 __ movdqu(Address(state, 0), xmm_temp6); // store the result 4919 __ leave(); 4920 __ ret(0); 4921 return start; 4922 } 4923 4924 //base64 character set 4925 address base64_charset_addr() { 4926 __ align(CodeEntryAlignment); 4927 StubCodeMark mark(this, "StubRoutines", "base64_charset"); 4928 address start = __ pc(); 4929 __ emit_data64(0x0000004200000041, relocInfo::none); 4930 __ emit_data64(0x0000004400000043, relocInfo::none); 4931 __ emit_data64(0x0000004600000045, relocInfo::none); 4932 __ emit_data64(0x0000004800000047, relocInfo::none); 4933 __ emit_data64(0x0000004a00000049, relocInfo::none); 4934 __ emit_data64(0x0000004c0000004b, relocInfo::none); 4935 __ emit_data64(0x0000004e0000004d, relocInfo::none); 4936 __ emit_data64(0x000000500000004f, relocInfo::none); 4937 __ emit_data64(0x0000005200000051, relocInfo::none); 4938 __ emit_data64(0x0000005400000053, relocInfo::none); 4939 __ emit_data64(0x0000005600000055, relocInfo::none); 4940 __ emit_data64(0x0000005800000057, relocInfo::none); 4941 __ emit_data64(0x0000005a00000059, relocInfo::none); 4942 __ emit_data64(0x0000006200000061, relocInfo::none); 4943 __ emit_data64(0x0000006400000063, relocInfo::none); 4944 __ emit_data64(0x0000006600000065, relocInfo::none); 4945 __ emit_data64(0x0000006800000067, relocInfo::none); 4946 __ emit_data64(0x0000006a00000069, relocInfo::none); 4947 __ emit_data64(0x0000006c0000006b, relocInfo::none); 4948 __ emit_data64(0x0000006e0000006d, relocInfo::none); 4949 __ emit_data64(0x000000700000006f, relocInfo::none); 4950 __ emit_data64(0x0000007200000071, relocInfo::none); 4951 __ emit_data64(0x0000007400000073, relocInfo::none); 4952 __ emit_data64(0x0000007600000075, relocInfo::none); 4953 __ emit_data64(0x0000007800000077, relocInfo::none); 4954 __ emit_data64(0x0000007a00000079, relocInfo::none); 4955 __ emit_data64(0x0000003100000030, relocInfo::none); 4956 __ emit_data64(0x0000003300000032, relocInfo::none); 4957 __ emit_data64(0x0000003500000034, relocInfo::none); 4958 __ 
emit_data64(0x0000003700000036, relocInfo::none); 4959 __ emit_data64(0x0000003900000038, relocInfo::none); 4960 __ emit_data64(0x0000002f0000002b, relocInfo::none); 4961 return start; 4962 } 4963 4964 //base64 url character set 4965 address base64url_charset_addr() { 4966 __ align(CodeEntryAlignment); 4967 StubCodeMark mark(this, "StubRoutines", "base64url_charset"); 4968 address start = __ pc(); 4969 __ emit_data64(0x0000004200000041, relocInfo::none); 4970 __ emit_data64(0x0000004400000043, relocInfo::none); 4971 __ emit_data64(0x0000004600000045, relocInfo::none); 4972 __ emit_data64(0x0000004800000047, relocInfo::none); 4973 __ emit_data64(0x0000004a00000049, relocInfo::none); 4974 __ emit_data64(0x0000004c0000004b, relocInfo::none); 4975 __ emit_data64(0x0000004e0000004d, relocInfo::none); 4976 __ emit_data64(0x000000500000004f, relocInfo::none); 4977 __ emit_data64(0x0000005200000051, relocInfo::none); 4978 __ emit_data64(0x0000005400000053, relocInfo::none); 4979 __ emit_data64(0x0000005600000055, relocInfo::none); 4980 __ emit_data64(0x0000005800000057, relocInfo::none); 4981 __ emit_data64(0x0000005a00000059, relocInfo::none); 4982 __ emit_data64(0x0000006200000061, relocInfo::none); 4983 __ emit_data64(0x0000006400000063, relocInfo::none); 4984 __ emit_data64(0x0000006600000065, relocInfo::none); 4985 __ emit_data64(0x0000006800000067, relocInfo::none); 4986 __ emit_data64(0x0000006a00000069, relocInfo::none); 4987 __ emit_data64(0x0000006c0000006b, relocInfo::none); 4988 __ emit_data64(0x0000006e0000006d, relocInfo::none); 4989 __ emit_data64(0x000000700000006f, relocInfo::none); 4990 __ emit_data64(0x0000007200000071, relocInfo::none); 4991 __ emit_data64(0x0000007400000073, relocInfo::none); 4992 __ emit_data64(0x0000007600000075, relocInfo::none); 4993 __ emit_data64(0x0000007800000077, relocInfo::none); 4994 __ emit_data64(0x0000007a00000079, relocInfo::none); 4995 __ emit_data64(0x0000003100000030, relocInfo::none); 4996 __ emit_data64(0x0000003300000032, relocInfo::none); 4997 __ emit_data64(0x0000003500000034, relocInfo::none); 4998 __ emit_data64(0x0000003700000036, relocInfo::none); 4999 __ emit_data64(0x0000003900000038, relocInfo::none); 5000 __ emit_data64(0x0000005f0000002d, relocInfo::none); 5001 5002 return start; 5003 } 5004 5005 address base64_bswap_mask_addr() { 5006 __ align(CodeEntryAlignment); 5007 StubCodeMark mark(this, "StubRoutines", "bswap_mask_base64"); 5008 address start = __ pc(); 5009 __ emit_data64(0x0504038002010080, relocInfo::none); 5010 __ emit_data64(0x0b0a098008070680, relocInfo::none); 5011 __ emit_data64(0x0908078006050480, relocInfo::none); 5012 __ emit_data64(0x0f0e0d800c0b0a80, relocInfo::none); 5013 __ emit_data64(0x0605048003020180, relocInfo::none); 5014 __ emit_data64(0x0c0b0a8009080780, relocInfo::none); 5015 __ emit_data64(0x0504038002010080, relocInfo::none); 5016 __ emit_data64(0x0b0a098008070680, relocInfo::none); 5017 5018 return start; 5019 } 5020 5021 address base64_right_shift_mask_addr() { 5022 __ align(CodeEntryAlignment); 5023 StubCodeMark mark(this, "StubRoutines", "right_shift_mask"); 5024 address start = __ pc(); 5025 __ emit_data64(0x0006000400020000, relocInfo::none); 5026 __ emit_data64(0x0006000400020000, relocInfo::none); 5027 __ emit_data64(0x0006000400020000, relocInfo::none); 5028 __ emit_data64(0x0006000400020000, relocInfo::none); 5029 __ emit_data64(0x0006000400020000, relocInfo::none); 5030 __ emit_data64(0x0006000400020000, relocInfo::none); 5031 __ emit_data64(0x0006000400020000, relocInfo::none); 5032 
__ emit_data64(0x0006000400020000, relocInfo::none); 5033 5034 return start; 5035 } 5036 5037 address base64_left_shift_mask_addr() { 5038 __ align(CodeEntryAlignment); 5039 StubCodeMark mark(this, "StubRoutines", "left_shift_mask"); 5040 address start = __ pc(); 5041 __ emit_data64(0x0000000200040000, relocInfo::none); 5042 __ emit_data64(0x0000000200040000, relocInfo::none); 5043 __ emit_data64(0x0000000200040000, relocInfo::none); 5044 __ emit_data64(0x0000000200040000, relocInfo::none); 5045 __ emit_data64(0x0000000200040000, relocInfo::none); 5046 __ emit_data64(0x0000000200040000, relocInfo::none); 5047 __ emit_data64(0x0000000200040000, relocInfo::none); 5048 __ emit_data64(0x0000000200040000, relocInfo::none); 5049 5050 return start; 5051 } 5052 5053 address base64_and_mask_addr() { 5054 __ align(CodeEntryAlignment); 5055 StubCodeMark mark(this, "StubRoutines", "and_mask"); 5056 address start = __ pc(); 5057 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5058 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5059 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5060 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5061 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5062 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5063 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5064 __ emit_data64(0x3f003f003f000000, relocInfo::none); 5065 return start; 5066 } 5067 5068 address base64_gather_mask_addr() { 5069 __ align(CodeEntryAlignment); 5070 StubCodeMark mark(this, "StubRoutines", "gather_mask"); 5071 address start = __ pc(); 5072 __ emit_data64(0xffffffffffffffff, relocInfo::none); 5073 return start; 5074 } 5075 5076 // Code for generating Base64 encoding. 5077 // Intrinsic function prototype in Base64.java: 5078 // private void encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL) { 5079 address generate_base64_encodeBlock() { 5080 __ align(CodeEntryAlignment); 5081 StubCodeMark mark(this, "StubRoutines", "implEncode"); 5082 address start = __ pc(); 5083 __ enter(); 5084 5085 // Save callee-saved registers before using them 5086 __ push(r12); 5087 __ push(r13); 5088 __ push(r14); 5089 __ push(r15); 5090 5091 // arguments 5092 const Register source = c_rarg0; // Source Array 5093 const Register start_offset = c_rarg1; // start offset 5094 const Register end_offset = c_rarg2; // end offset 5095 const Register dest = c_rarg3; // destination array 5096 5097 #ifndef _WIN64 5098 const Register dp = c_rarg4; // Position for writing to dest array 5099 const Register isURL = c_rarg5;// Base64 or URL character set 5100 #else 5101 const Address dp_mem(rbp, 6 * wordSize); // length is on stack on Win64 5102 const Address isURL_mem(rbp, 7 * wordSize); 5103 const Register isURL = r10; // pick the volatile windows register 5104 const Register dp = r12; 5105 __ movl(dp, dp_mem); 5106 __ movl(isURL, isURL_mem); 5107 #endif 5108 5109 const Register length = r14; 5110 Label L_process80, L_process32, L_process3, L_exit, L_processdata; 5111 5112 // calculate length from offsets 5113 __ movl(length, end_offset); 5114 __ subl(length, start_offset); 5115 __ cmpl(length, 0); 5116 __ jcc(Assembler::lessEqual, L_exit); 5117 5118 __ lea(r11, ExternalAddress(StubRoutines::x86::base64_charset_addr())); 5119 // check if base64 charset(isURL=0) or base64 url charset(isURL=1) needs to be loaded 5120 __ cmpl(isURL, 0); 5121 __ jcc(Assembler::equal, L_processdata); 5122 __ lea(r11, ExternalAddress(StubRoutines::x86::base64url_charset_addr())); 5123 5124 // load masks 
required for encoding data 5125 __ BIND(L_processdata); 5126 __ movdqu(xmm16, ExternalAddress(StubRoutines::x86::base64_gather_mask_addr())); 5127 // Set 64 bits of K register. 5128 __ evpcmpeqb(k3, xmm16, xmm16, Assembler::AVX_512bit); 5129 __ evmovdquq(xmm12, ExternalAddress(StubRoutines::x86::base64_bswap_mask_addr()), Assembler::AVX_256bit, r13); 5130 __ evmovdquq(xmm13, ExternalAddress(StubRoutines::x86::base64_right_shift_mask_addr()), Assembler::AVX_512bit, r13); 5131 __ evmovdquq(xmm14, ExternalAddress(StubRoutines::x86::base64_left_shift_mask_addr()), Assembler::AVX_512bit, r13); 5132 __ evmovdquq(xmm15, ExternalAddress(StubRoutines::x86::base64_and_mask_addr()), Assembler::AVX_512bit, r13); 5133 5134 // Vector Base64 implementation, producing 96 bytes of encoded data 5135 __ BIND(L_process80); 5136 __ cmpl(length, 80); 5137 __ jcc(Assembler::below, L_process32); 5138 __ evmovdquq(xmm0, Address(source, start_offset, Address::times_1, 0), Assembler::AVX_256bit); 5139 __ evmovdquq(xmm1, Address(source, start_offset, Address::times_1, 24), Assembler::AVX_256bit); 5140 __ evmovdquq(xmm2, Address(source, start_offset, Address::times_1, 48), Assembler::AVX_256bit); 5141 5142 //permute the input data in such a manner that we have continuity of the source 5143 __ vpermq(xmm3, xmm0, 148, Assembler::AVX_256bit); 5144 __ vpermq(xmm4, xmm1, 148, Assembler::AVX_256bit); 5145 __ vpermq(xmm5, xmm2, 148, Assembler::AVX_256bit); 5146 5147 //shuffle input and group 3 bytes of data and to it add 0 as the 4th byte. 5148 //we can deal with 12 bytes at a time in a 128 bit register 5149 __ vpshufb(xmm3, xmm3, xmm12, Assembler::AVX_256bit); 5150 __ vpshufb(xmm4, xmm4, xmm12, Assembler::AVX_256bit); 5151 __ vpshufb(xmm5, xmm5, xmm12, Assembler::AVX_256bit); 5152 5153 //convert byte to word. 
Each 128 bit register will have 6 bytes for processing 5154 __ vpmovzxbw(xmm3, xmm3, Assembler::AVX_512bit); 5155 __ vpmovzxbw(xmm4, xmm4, Assembler::AVX_512bit); 5156 __ vpmovzxbw(xmm5, xmm5, Assembler::AVX_512bit); 5157 5158 // Extract bits in the following pattern 6, 4+2, 2+4, 6 to convert 3, 8 bit numbers to 4, 6 bit numbers 5159 __ evpsrlvw(xmm0, xmm3, xmm13, Assembler::AVX_512bit); 5160 __ evpsrlvw(xmm1, xmm4, xmm13, Assembler::AVX_512bit); 5161 __ evpsrlvw(xmm2, xmm5, xmm13, Assembler::AVX_512bit); 5162 5163 __ evpsllvw(xmm3, xmm3, xmm14, Assembler::AVX_512bit); 5164 __ evpsllvw(xmm4, xmm4, xmm14, Assembler::AVX_512bit); 5165 __ evpsllvw(xmm5, xmm5, xmm14, Assembler::AVX_512bit); 5166 5167 __ vpsrlq(xmm0, xmm0, 8, Assembler::AVX_512bit); 5168 __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit); 5169 __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit); 5170 5171 __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit); 5172 __ vpsllq(xmm4, xmm4, 8, Assembler::AVX_512bit); 5173 __ vpsllq(xmm5, xmm5, 8, Assembler::AVX_512bit); 5174 5175 __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit); 5176 __ vpandq(xmm4, xmm4, xmm15, Assembler::AVX_512bit); 5177 __ vpandq(xmm5, xmm5, xmm15, Assembler::AVX_512bit); 5178 5179 // Get the final 4*6 bits base64 encoding 5180 __ vporq(xmm3, xmm3, xmm0, Assembler::AVX_512bit); 5181 __ vporq(xmm4, xmm4, xmm1, Assembler::AVX_512bit); 5182 __ vporq(xmm5, xmm5, xmm2, Assembler::AVX_512bit); 5183 5184 // Shift 5185 __ vpsrlq(xmm3, xmm3, 8, Assembler::AVX_512bit); 5186 __ vpsrlq(xmm4, xmm4, 8, Assembler::AVX_512bit); 5187 __ vpsrlq(xmm5, xmm5, 8, Assembler::AVX_512bit); 5188 5189 // look up 6 bits in the base64 character set to fetch the encoding 5190 // we are converting word to dword as gather instructions need dword indices for looking up encoding 5191 __ vextracti64x4(xmm6, xmm3, 0); 5192 __ vpmovzxwd(xmm0, xmm6, Assembler::AVX_512bit); 5193 __ vextracti64x4(xmm6, xmm3, 1); 5194 __ vpmovzxwd(xmm1, xmm6, Assembler::AVX_512bit); 5195 5196 __ vextracti64x4(xmm6, xmm4, 0); 5197 __ vpmovzxwd(xmm2, xmm6, Assembler::AVX_512bit); 5198 __ vextracti64x4(xmm6, xmm4, 1); 5199 __ vpmovzxwd(xmm3, xmm6, Assembler::AVX_512bit); 5200 5201 __ vextracti64x4(xmm4, xmm5, 0); 5202 __ vpmovzxwd(xmm6, xmm4, Assembler::AVX_512bit); 5203 5204 __ vextracti64x4(xmm4, xmm5, 1); 5205 __ vpmovzxwd(xmm7, xmm4, Assembler::AVX_512bit); 5206 5207 __ kmovql(k2, k3); 5208 __ evpgatherdd(xmm4, k2, Address(r11, xmm0, Address::times_4, 0), Assembler::AVX_512bit); 5209 __ kmovql(k2, k3); 5210 __ evpgatherdd(xmm5, k2, Address(r11, xmm1, Address::times_4, 0), Assembler::AVX_512bit); 5211 __ kmovql(k2, k3); 5212 __ evpgatherdd(xmm8, k2, Address(r11, xmm2, Address::times_4, 0), Assembler::AVX_512bit); 5213 __ kmovql(k2, k3); 5214 __ evpgatherdd(xmm9, k2, Address(r11, xmm3, Address::times_4, 0), Assembler::AVX_512bit); 5215 __ kmovql(k2, k3); 5216 __ evpgatherdd(xmm10, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit); 5217 __ kmovql(k2, k3); 5218 __ evpgatherdd(xmm11, k2, Address(r11, xmm7, Address::times_4, 0), Assembler::AVX_512bit); 5219 5220 //Down convert dword to byte. 
Final output is 16*6 = 96 bytes long 5221 __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm4, Assembler::AVX_512bit); 5222 __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm5, Assembler::AVX_512bit); 5223 __ evpmovdb(Address(dest, dp, Address::times_1, 32), xmm8, Assembler::AVX_512bit); 5224 __ evpmovdb(Address(dest, dp, Address::times_1, 48), xmm9, Assembler::AVX_512bit); 5225 __ evpmovdb(Address(dest, dp, Address::times_1, 64), xmm10, Assembler::AVX_512bit); 5226 __ evpmovdb(Address(dest, dp, Address::times_1, 80), xmm11, Assembler::AVX_512bit); 5227 5228 __ addq(dest, 96); 5229 __ addq(source, 72); 5230 __ subq(length, 72); 5231 __ jmp(L_process80); 5232 5233 // Vector Base64 implementation generating 32 bytes of encoded data 5234 __ BIND(L_process32); 5235 __ cmpl(length, 32); 5236 __ jcc(Assembler::below, L_process3); 5237 __ evmovdquq(xmm0, Address(source, start_offset), Assembler::AVX_256bit); 5238 __ vpermq(xmm0, xmm0, 148, Assembler::AVX_256bit); 5239 __ vpshufb(xmm6, xmm0, xmm12, Assembler::AVX_256bit); 5240 __ vpmovzxbw(xmm6, xmm6, Assembler::AVX_512bit); 5241 __ evpsrlvw(xmm2, xmm6, xmm13, Assembler::AVX_512bit); 5242 __ evpsllvw(xmm3, xmm6, xmm14, Assembler::AVX_512bit); 5243 5244 __ vpsrlq(xmm2, xmm2, 8, Assembler::AVX_512bit); 5245 __ vpsllq(xmm3, xmm3, 8, Assembler::AVX_512bit); 5246 __ vpandq(xmm3, xmm3, xmm15, Assembler::AVX_512bit); 5247 __ vporq(xmm1, xmm2, xmm3, Assembler::AVX_512bit); 5248 __ vpsrlq(xmm1, xmm1, 8, Assembler::AVX_512bit); 5249 __ vextracti64x4(xmm9, xmm1, 0); 5250 __ vpmovzxwd(xmm6, xmm9, Assembler::AVX_512bit); 5251 __ vextracti64x4(xmm9, xmm1, 1); 5252 __ vpmovzxwd(xmm5, xmm9, Assembler::AVX_512bit); 5253 __ kmovql(k2, k3); 5254 __ evpgatherdd(xmm8, k2, Address(r11, xmm6, Address::times_4, 0), Assembler::AVX_512bit); 5255 __ kmovql(k2, k3); 5256 __ evpgatherdd(xmm10, k2, Address(r11, xmm5, Address::times_4, 0), Assembler::AVX_512bit); 5257 __ evpmovdb(Address(dest, dp, Address::times_1, 0), xmm8, Assembler::AVX_512bit); 5258 __ evpmovdb(Address(dest, dp, Address::times_1, 16), xmm10, Assembler::AVX_512bit); 5259 __ subq(length, 24); 5260 __ addq(dest, 32); 5261 __ addq(source, 24); 5262 __ jmp(L_process32); 5263 5264 // Scalar data processing takes 3 bytes at a time and produces 4 bytes of encoded data 5265 /* This code corresponds to the scalar version of the following snippet in Base64.java 5266 ** int bits = (src[sp0++] & 0xff) << 16 |(src[sp0++] & 0xff) << 8 |(src[sp0++] & 0xff); 5267 ** dst[dp0++] = (byte)base64[(bits >> > 18) & 0x3f]; 5268 ** dst[dp0++] = (byte)base64[(bits >> > 12) & 0x3f]; 5269 ** dst[dp0++] = (byte)base64[(bits >> > 6) & 0x3f]; 5270 ** dst[dp0++] = (byte)base64[bits & 0x3f];*/ 5271 __ BIND(L_process3); 5272 __ cmpl(length, 3); 5273 __ jcc(Assembler::below, L_exit); 5274 // Read 1 byte at a time 5275 __ movzbl(rax, Address(source, start_offset)); 5276 __ shll(rax, 0x10); 5277 __ movl(r15, rax); 5278 __ movzbl(rax, Address(source, start_offset, Address::times_1, 1)); 5279 __ shll(rax, 0x8); 5280 __ movzwl(rax, rax); 5281 __ orl(r15, rax); 5282 __ movzbl(rax, Address(source, start_offset, Address::times_1, 2)); 5283 __ orl(rax, r15); 5284 // Save 3 bytes read in r15 5285 __ movl(r15, rax); 5286 __ shrl(rax, 0x12); 5287 __ andl(rax, 0x3f); 5288 // rax contains the index, r11 contains base64 lookup table 5289 __ movb(rax, Address(r11, rax, Address::times_4)); 5290 // Write the encoded byte to destination 5291 __ movb(Address(dest, dp, Address::times_1, 0), rax); 5292 __ movl(rax, r15); 5293 __ shrl(rax, 0xc); 5294 __ 
andl(rax, 0x3f); 5295 __ movb(rax, Address(r11, rax, Address::times_4)); 5296 __ movb(Address(dest, dp, Address::times_1, 1), rax); 5297 __ movl(rax, r15); 5298 __ shrl(rax, 0x6); 5299 __ andl(rax, 0x3f); 5300 __ movb(rax, Address(r11, rax, Address::times_4)); 5301 __ movb(Address(dest, dp, Address::times_1, 2), rax); 5302 __ movl(rax, r15); 5303 __ andl(rax, 0x3f); 5304 __ movb(rax, Address(r11, rax, Address::times_4)); 5305 __ movb(Address(dest, dp, Address::times_1, 3), rax); 5306 __ subl(length, 3); 5307 __ addq(dest, 4); 5308 __ addq(source, 3); 5309 __ jmp(L_process3); 5310 __ BIND(L_exit); 5311 __ pop(r15); 5312 __ pop(r14); 5313 __ pop(r13); 5314 __ pop(r12); 5315 __ leave(); 5316 __ ret(0); 5317 return start; 5318 } 5319 5320 /** 5321 * Arguments: 5322 * 5323 * Inputs: 5324 * c_rarg0 - int crc 5325 * c_rarg1 - byte* buf 5326 * c_rarg2 - int length 5327 * 5328 * Ouput: 5329 * rax - int crc result 5330 */ 5331 address generate_updateBytesCRC32() { 5332 assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions"); 5333 5334 __ align(CodeEntryAlignment); 5335 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 5336 5337 address start = __ pc(); 5338 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5339 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 5340 // rscratch1: r10 5341 const Register crc = c_rarg0; // crc 5342 const Register buf = c_rarg1; // source java byte array address 5343 const Register len = c_rarg2; // length 5344 const Register table = c_rarg3; // crc_table address (reuse register) 5345 const Register tmp = r11; 5346 assert_different_registers(crc, buf, len, table, tmp, rax); 5347 5348 BLOCK_COMMENT("Entry:"); 5349 __ enter(); // required for proper stackwalking of RuntimeStub frame 5350 5351 __ kernel_crc32(crc, buf, len, table, tmp); 5352 5353 __ movl(rax, crc); 5354 __ vzeroupper(); 5355 __ leave(); // required for proper stackwalking of RuntimeStub frame 5356 __ ret(0); 5357 5358 return start; 5359 } 5360 5361 /** 5362 * Arguments: 5363 * 5364 * Inputs: 5365 * c_rarg0 - int crc 5366 * c_rarg1 - byte* buf 5367 * c_rarg2 - long length 5368 * c_rarg3 - table_start - optional (present only when doing a library_call, 5369 * not used by x86 algorithm) 5370 * 5371 * Ouput: 5372 * rax - int crc result 5373 */ 5374 address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) { 5375 assert(UseCRC32CIntrinsics, "need SSE4_2"); 5376 __ align(CodeEntryAlignment); 5377 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 5378 address start = __ pc(); 5379 //reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs 5380 //Windows RCX RDX R8 R9 none none XMM0..XMM3 5381 //Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7 5382 const Register crc = c_rarg0; // crc 5383 const Register buf = c_rarg1; // source java byte array address 5384 const Register len = c_rarg2; // length 5385 const Register a = rax; 5386 const Register j = r9; 5387 const Register k = r10; 5388 const Register l = r11; 5389 #ifdef _WIN64 5390 const Register y = rdi; 5391 const Register z = rsi; 5392 #else 5393 const Register y = rcx; 5394 const Register z = r8; 5395 #endif 5396 assert_different_registers(crc, buf, len, a, j, k, l, y, z); 5397 5398 BLOCK_COMMENT("Entry:"); 5399 __ enter(); // required for proper stackwalking of RuntimeStub frame 5400 #ifdef _WIN64 5401 __ push(y); 5402 __ push(z); 5403 #endif 5404 __ crc32c_ipl_alg2_alt2(crc, buf, len, 5405 a, j, k, 5406 l, y, z, 5407 c_farg0, c_farg1, c_farg2, 5408 is_pclmulqdq_supported); 5409 __ movl(rax, crc); 5410 #ifdef _WIN64 5411 
__ pop(z); 5412 __ pop(y); 5413 #endif 5414 __ vzeroupper(); 5415 __ leave(); // required for proper stackwalking of RuntimeStub frame 5416 __ ret(0); 5417 5418 return start; 5419 } 5420 5421 /** 5422 * Arguments: 5423 * 5424 * Input: 5425 * c_rarg0 - x address 5426 * c_rarg1 - x length 5427 * c_rarg2 - y address 5428 * c_rarg3 - y length 5429 * not Win64 5430 * c_rarg4 - z address 5431 * c_rarg5 - z length 5432 * Win64 5433 * rsp+40 - z address 5434 * rsp+48 - z length 5435 */ 5436 address generate_multiplyToLen() { 5437 __ align(CodeEntryAlignment); 5438 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 5439 5440 address start = __ pc(); 5441 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5442 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 5443 const Register x = rdi; 5444 const Register xlen = rax; 5445 const Register y = rsi; 5446 const Register ylen = rcx; 5447 const Register z = r8; 5448 const Register zlen = r11; 5449 5450 // Next registers will be saved on stack in multiply_to_len(). 5451 const Register tmp1 = r12; 5452 const Register tmp2 = r13; 5453 const Register tmp3 = r14; 5454 const Register tmp4 = r15; 5455 const Register tmp5 = rbx; 5456 5457 BLOCK_COMMENT("Entry:"); 5458 __ enter(); // required for proper stackwalking of RuntimeStub frame 5459 5460 #ifndef _WIN64 5461 __ movptr(zlen, r9); // Save r9 in r11 - zlen 5462 #endif 5463 setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx 5464 // ylen => rcx, z => r8, zlen => r11 5465 // r9 and r10 may be used to save non-volatile registers 5466 #ifdef _WIN64 5467 // last 2 arguments (#4, #5) are on stack on Win64 5468 __ movptr(z, Address(rsp, 6 * wordSize)); 5469 __ movptr(zlen, Address(rsp, 7 * wordSize)); 5470 #endif 5471 5472 __ movptr(xlen, rsi); 5473 __ movptr(y, rdx); 5474 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5); 5475 5476 restore_arg_regs(); 5477 5478 __ leave(); // required for proper stackwalking of RuntimeStub frame 5479 __ ret(0); 5480 5481 return start; 5482 } 5483 5484 /** 5485 * Arguments: 5486 * 5487 * Input: 5488 * c_rarg0 - obja address 5489 * c_rarg1 - objb address 5490 * c_rarg3 - length length 5491 * c_rarg4 - scale log2_array_indxscale 5492 * 5493 * Output: 5494 * rax - int >= mismatched index, < 0 bitwise complement of tail 5495 */ 5496 address generate_vectorizedMismatch() { 5497 __ align(CodeEntryAlignment); 5498 StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch"); 5499 address start = __ pc(); 5500 5501 BLOCK_COMMENT("Entry:"); 5502 __ enter(); 5503 5504 #ifdef _WIN64 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5505 const Register scale = c_rarg0; //rcx, will exchange with r9 5506 const Register objb = c_rarg1; //rdx 5507 const Register length = c_rarg2; //r8 5508 const Register obja = c_rarg3; //r9 5509 __ xchgq(obja, scale); //now obja and scale contains the correct contents 5510 5511 const Register tmp1 = r10; 5512 const Register tmp2 = r11; 5513 #endif 5514 #ifndef _WIN64 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 
5515 const Register obja = c_rarg0; //U:rdi 5516 const Register objb = c_rarg1; //U:rsi 5517 const Register length = c_rarg2; //U:rdx 5518 const Register scale = c_rarg3; //U:rcx 5519 const Register tmp1 = r8; 5520 const Register tmp2 = r9; 5521 #endif 5522 const Register result = rax; //return value 5523 const XMMRegister vec0 = xmm0; 5524 const XMMRegister vec1 = xmm1; 5525 const XMMRegister vec2 = xmm2; 5526 5527 __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2); 5528 5529 __ vzeroupper(); 5530 __ leave(); 5531 __ ret(0); 5532 5533 return start; 5534 } 5535 5536 /** 5537 * Arguments: 5538 * 5539 // Input: 5540 // c_rarg0 - x address 5541 // c_rarg1 - x length 5542 // c_rarg2 - z address 5543 // c_rarg3 - z lenth 5544 * 5545 */ 5546 address generate_squareToLen() { 5547 5548 __ align(CodeEntryAlignment); 5549 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 5550 5551 address start = __ pc(); 5552 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5553 // Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...) 5554 const Register x = rdi; 5555 const Register len = rsi; 5556 const Register z = r8; 5557 const Register zlen = rcx; 5558 5559 const Register tmp1 = r12; 5560 const Register tmp2 = r13; 5561 const Register tmp3 = r14; 5562 const Register tmp4 = r15; 5563 const Register tmp5 = rbx; 5564 5565 BLOCK_COMMENT("Entry:"); 5566 __ enter(); // required for proper stackwalking of RuntimeStub frame 5567 5568 setup_arg_regs(4); // x => rdi, len => rsi, z => rdx 5569 // zlen => rcx 5570 // r9 and r10 may be used to save non-volatile registers 5571 __ movptr(r8, rdx); 5572 __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); 5573 5574 restore_arg_regs(); 5575 5576 __ leave(); // required for proper stackwalking of RuntimeStub frame 5577 __ ret(0); 5578 5579 return start; 5580 } 5581 5582 address generate_method_entry_barrier() { 5583 __ align(CodeEntryAlignment); 5584 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 5585 5586 Label deoptimize_label; 5587 5588 address start = __ pc(); 5589 5590 __ push(-1); // cookie, this is used for writing the new rsp when deoptimizing 5591 5592 BLOCK_COMMENT("Entry:"); 5593 __ enter(); // save rbp 5594 5595 // save c_rarg0, because we want to use that value. 5596 // We could do without it but then we depend on the number of slots used by pusha 5597 __ push(c_rarg0); 5598 5599 __ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address 5600 5601 __ pusha(); 5602 5603 // The method may have floats as arguments, and we must spill them before calling 5604 // the VM runtime. 
5605 assert(Argument::n_float_register_parameters_j == 8, "Assumption"); 5606 const int xmm_size = wordSize * 2; 5607 const int xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j; 5608 __ subptr(rsp, xmm_spill_size); 5609 __ movdqu(Address(rsp, xmm_size * 7), xmm7); 5610 __ movdqu(Address(rsp, xmm_size * 6), xmm6); 5611 __ movdqu(Address(rsp, xmm_size * 5), xmm5); 5612 __ movdqu(Address(rsp, xmm_size * 4), xmm4); 5613 __ movdqu(Address(rsp, xmm_size * 3), xmm3); 5614 __ movdqu(Address(rsp, xmm_size * 2), xmm2); 5615 __ movdqu(Address(rsp, xmm_size * 1), xmm1); 5616 __ movdqu(Address(rsp, xmm_size * 0), xmm0); 5617 5618 __ call_VM_leaf(CAST_FROM_FN_PTR(address, static_cast<int (*)(address*)>(BarrierSetNMethod::nmethod_stub_entry_barrier)), 1); 5619 5620 __ movdqu(xmm0, Address(rsp, xmm_size * 0)); 5621 __ movdqu(xmm1, Address(rsp, xmm_size * 1)); 5622 __ movdqu(xmm2, Address(rsp, xmm_size * 2)); 5623 __ movdqu(xmm3, Address(rsp, xmm_size * 3)); 5624 __ movdqu(xmm4, Address(rsp, xmm_size * 4)); 5625 __ movdqu(xmm5, Address(rsp, xmm_size * 5)); 5626 __ movdqu(xmm6, Address(rsp, xmm_size * 6)); 5627 __ movdqu(xmm7, Address(rsp, xmm_size * 7)); 5628 __ addptr(rsp, xmm_spill_size); 5629 5630 __ cmpl(rax, 1); // 1 means deoptimize 5631 __ jcc(Assembler::equal, deoptimize_label); 5632 5633 __ popa(); 5634 __ pop(c_rarg0); 5635 5636 __ leave(); 5637 5638 __ addptr(rsp, 1 * wordSize); // cookie 5639 __ ret(0); 5640 5641 5642 __ BIND(deoptimize_label); 5643 5644 __ popa(); 5645 __ pop(c_rarg0); 5646 5647 __ leave(); 5648 5649 // this can be taken out, but is good for verification purposes. getting a SIGSEGV 5650 // here while still having a correct stack is valuable 5651 __ testptr(rsp, Address(rsp, 0)); 5652 5653 __ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier 5654 __ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point 5655 5656 return start; 5657 } 5658 5659 /** 5660 * Arguments: 5661 * 5662 * Input: 5663 * c_rarg0 - out address 5664 * c_rarg1 - in address 5665 * c_rarg2 - offset 5666 * c_rarg3 - len 5667 * not Win64 5668 * c_rarg4 - k 5669 * Win64 5670 * rsp+40 - k 5671 */ 5672 address generate_mulAdd() { 5673 __ align(CodeEntryAlignment); 5674 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 5675 5676 address start = __ pc(); 5677 // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) 5678 // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) 5679 const Register out = rdi; 5680 const Register in = rsi; 5681 const Register offset = r11; 5682 const Register len = rcx; 5683 const Register k = r8; 5684 5685 // Next registers will be saved on stack in mul_add(). 
5686 const Register tmp1 = r12; 5687 const Register tmp2 = r13; 5688 const Register tmp3 = r14; 5689 const Register tmp4 = r15; 5690 const Register tmp5 = rbx; 5691 5692 BLOCK_COMMENT("Entry:"); 5693 __ enter(); // required for proper stackwalking of RuntimeStub frame 5694 5695 setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx 5696 // len => rcx, k => r8 5697 // r9 and r10 may be used to save non-volatile registers 5698 #ifdef _WIN64 5699 // last argument is on stack on Win64 5700 __ movl(k, Address(rsp, 6 * wordSize)); 5701 #endif 5702 __ movptr(r11, rdx); // move offset in rdx to offset(r11) 5703 __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax); 5704 5705 restore_arg_regs(); 5706 5707 __ leave(); // required for proper stackwalking of RuntimeStub frame 5708 __ ret(0); 5709 5710 return start; 5711 } 5712 5713 address generate_libmExp() { 5714 StubCodeMark mark(this, "StubRoutines", "libmExp"); 5715 5716 address start = __ pc(); 5717 5718 const XMMRegister x0 = xmm0; 5719 const XMMRegister x1 = xmm1; 5720 const XMMRegister x2 = xmm2; 5721 const XMMRegister x3 = xmm3; 5722 5723 const XMMRegister x4 = xmm4; 5724 const XMMRegister x5 = xmm5; 5725 const XMMRegister x6 = xmm6; 5726 const XMMRegister x7 = xmm7; 5727 5728 const Register tmp = r11; 5729 5730 BLOCK_COMMENT("Entry:"); 5731 __ enter(); // required for proper stackwalking of RuntimeStub frame 5732 5733 __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); 5734 5735 __ leave(); // required for proper stackwalking of RuntimeStub frame 5736 __ ret(0); 5737 5738 return start; 5739 5740 } 5741 5742 address generate_libmLog() { 5743 StubCodeMark mark(this, "StubRoutines", "libmLog"); 5744 5745 address start = __ pc(); 5746 5747 const XMMRegister x0 = xmm0; 5748 const XMMRegister x1 = xmm1; 5749 const XMMRegister x2 = xmm2; 5750 const XMMRegister x3 = xmm3; 5751 5752 const XMMRegister x4 = xmm4; 5753 const XMMRegister x5 = xmm5; 5754 const XMMRegister x6 = xmm6; 5755 const XMMRegister x7 = xmm7; 5756 5757 const Register tmp1 = r11; 5758 const Register tmp2 = r8; 5759 5760 BLOCK_COMMENT("Entry:"); 5761 __ enter(); // required for proper stackwalking of RuntimeStub frame 5762 5763 __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2); 5764 5765 __ leave(); // required for proper stackwalking of RuntimeStub frame 5766 __ ret(0); 5767 5768 return start; 5769 5770 } 5771 5772 address generate_libmLog10() { 5773 StubCodeMark mark(this, "StubRoutines", "libmLog10"); 5774 5775 address start = __ pc(); 5776 5777 const XMMRegister x0 = xmm0; 5778 const XMMRegister x1 = xmm1; 5779 const XMMRegister x2 = xmm2; 5780 const XMMRegister x3 = xmm3; 5781 5782 const XMMRegister x4 = xmm4; 5783 const XMMRegister x5 = xmm5; 5784 const XMMRegister x6 = xmm6; 5785 const XMMRegister x7 = xmm7; 5786 5787 const Register tmp = r11; 5788 5789 BLOCK_COMMENT("Entry:"); 5790 __ enter(); // required for proper stackwalking of RuntimeStub frame 5791 5792 __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp); 5793 5794 __ leave(); // required for proper stackwalking of RuntimeStub frame 5795 __ ret(0); 5796 5797 return start; 5798 5799 } 5800 5801 address generate_libmPow() { 5802 StubCodeMark mark(this, "StubRoutines", "libmPow"); 5803 5804 address start = __ pc(); 5805 5806 const XMMRegister x0 = xmm0; 5807 const XMMRegister x1 = xmm1; 5808 const XMMRegister x2 = xmm2; 5809 const XMMRegister x3 = xmm3; 5810 5811 const XMMRegister x4 = xmm4; 5812 const XMMRegister x5 = xmm5; 5813 const XMMRegister 
x6 = xmm6; 5814 const XMMRegister x7 = xmm7; 5815 5816 const Register tmp1 = r8; 5817 const Register tmp2 = r9; 5818 const Register tmp3 = r10; 5819 const Register tmp4 = r11; 5820 5821 BLOCK_COMMENT("Entry:"); 5822 __ enter(); // required for proper stackwalking of RuntimeStub frame 5823 5824 __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 5825 5826 __ leave(); // required for proper stackwalking of RuntimeStub frame 5827 __ ret(0); 5828 5829 return start; 5830 5831 } 5832 5833 address generate_libmSin() { 5834 StubCodeMark mark(this, "StubRoutines", "libmSin"); 5835 5836 address start = __ pc(); 5837 5838 const XMMRegister x0 = xmm0; 5839 const XMMRegister x1 = xmm1; 5840 const XMMRegister x2 = xmm2; 5841 const XMMRegister x3 = xmm3; 5842 5843 const XMMRegister x4 = xmm4; 5844 const XMMRegister x5 = xmm5; 5845 const XMMRegister x6 = xmm6; 5846 const XMMRegister x7 = xmm7; 5847 5848 const Register tmp1 = r8; 5849 const Register tmp2 = r9; 5850 const Register tmp3 = r10; 5851 const Register tmp4 = r11; 5852 5853 BLOCK_COMMENT("Entry:"); 5854 __ enter(); // required for proper stackwalking of RuntimeStub frame 5855 5856 #ifdef _WIN64 5857 __ push(rsi); 5858 __ push(rdi); 5859 #endif 5860 __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 5861 5862 #ifdef _WIN64 5863 __ pop(rdi); 5864 __ pop(rsi); 5865 #endif 5866 5867 __ leave(); // required for proper stackwalking of RuntimeStub frame 5868 __ ret(0); 5869 5870 return start; 5871 5872 } 5873 5874 address generate_libmCos() { 5875 StubCodeMark mark(this, "StubRoutines", "libmCos"); 5876 5877 address start = __ pc(); 5878 5879 const XMMRegister x0 = xmm0; 5880 const XMMRegister x1 = xmm1; 5881 const XMMRegister x2 = xmm2; 5882 const XMMRegister x3 = xmm3; 5883 5884 const XMMRegister x4 = xmm4; 5885 const XMMRegister x5 = xmm5; 5886 const XMMRegister x6 = xmm6; 5887 const XMMRegister x7 = xmm7; 5888 5889 const Register tmp1 = r8; 5890 const Register tmp2 = r9; 5891 const Register tmp3 = r10; 5892 const Register tmp4 = r11; 5893 5894 BLOCK_COMMENT("Entry:"); 5895 __ enter(); // required for proper stackwalking of RuntimeStub frame 5896 5897 #ifdef _WIN64 5898 __ push(rsi); 5899 __ push(rdi); 5900 #endif 5901 __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 5902 5903 #ifdef _WIN64 5904 __ pop(rdi); 5905 __ pop(rsi); 5906 #endif 5907 5908 __ leave(); // required for proper stackwalking of RuntimeStub frame 5909 __ ret(0); 5910 5911 return start; 5912 5913 } 5914 5915 address generate_libmTan() { 5916 StubCodeMark mark(this, "StubRoutines", "libmTan"); 5917 5918 address start = __ pc(); 5919 5920 const XMMRegister x0 = xmm0; 5921 const XMMRegister x1 = xmm1; 5922 const XMMRegister x2 = xmm2; 5923 const XMMRegister x3 = xmm3; 5924 5925 const XMMRegister x4 = xmm4; 5926 const XMMRegister x5 = xmm5; 5927 const XMMRegister x6 = xmm6; 5928 const XMMRegister x7 = xmm7; 5929 5930 const Register tmp1 = r8; 5931 const Register tmp2 = r9; 5932 const Register tmp3 = r10; 5933 const Register tmp4 = r11; 5934 5935 BLOCK_COMMENT("Entry:"); 5936 __ enter(); // required for proper stackwalking of RuntimeStub frame 5937 5938 #ifdef _WIN64 5939 __ push(rsi); 5940 __ push(rdi); 5941 #endif 5942 __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4); 5943 5944 #ifdef _WIN64 5945 __ pop(rdi); 5946 __ pop(rsi); 5947 #endif 5948 5949 __ leave(); // required for proper stackwalking of RuntimeStub frame 5950 __ ret(0); 5951 5952 return 
start; 5953 5954 } 5955 5956 #undef __ 5957 #define __ masm-> 5958 5959 // Continuation point for throwing of implicit exceptions that are 5960 // not handled in the current activation. Fabricates an exception 5961 // oop and initiates normal exception dispatching in this 5962 // frame. Since we need to preserve callee-saved values (currently 5963 // only for C2, but done for C1 as well) we need a callee-saved oop 5964 // map and therefore have to make these stubs into RuntimeStubs 5965 // rather than BufferBlobs. If the compiler needs all registers to 5966 // be preserved between the fault point and the exception handler 5967 // then it must assume responsibility for that in 5968 // AbstractCompiler::continuation_for_implicit_null_exception or 5969 // continuation_for_implicit_division_by_zero_exception. All other 5970 // implicit exceptions (e.g., NullPointerException or 5971 // AbstractMethodError on entry) are either at call sites or 5972 // otherwise assume that stack unwinding will be initiated, so 5973 // caller saved registers were assumed volatile in the compiler. 5974 address generate_throw_exception(const char* name, 5975 address runtime_entry, 5976 Register arg1 = noreg, 5977 Register arg2 = noreg) { 5978 // Information about frame layout at time of blocking runtime call. 5979 // Note that we only have to preserve callee-saved registers since 5980 // the compilers are responsible for supplying a continuation point 5981 // if they expect all registers to be preserved. 5982 enum layout { 5983 rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt, 5984 rbp_off2, 5985 return_off, 5986 return_off2, 5987 framesize // inclusive of return address 5988 }; 5989 5990 int insts_size = 512; 5991 int locs_size = 64; 5992 5993 CodeBuffer code(name, insts_size, locs_size); 5994 OopMapSet* oop_maps = new OopMapSet(); 5995 MacroAssembler* masm = new MacroAssembler(&code); 5996 5997 address start = __ pc(); 5998 5999 // This is an inlined and slightly modified version of call_VM 6000 // which has the ability to fetch the return PC out of 6001 // thread-local storage and also sets up last_Java_sp slightly 6002 // differently than the real call_VM 6003 6004 __ enter(); // required for proper stackwalking of RuntimeStub frame 6005 6006 assert(is_even(framesize/2), "sp not 16-byte aligned"); 6007 6008 // return address and rbp are already in place 6009 __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog 6010 6011 int frame_complete = __ pc() - start; 6012 6013 // Set up last_Java_sp and last_Java_fp 6014 address the_pc = __ pc(); 6015 __ set_last_Java_frame(rsp, rbp, the_pc); 6016 __ andptr(rsp, -(StackAlignmentInBytes)); // Align stack 6017 6018 // Call runtime 6019 if (arg1 != noreg) { 6020 assert(arg2 != c_rarg1, "clobbered"); 6021 __ movptr(c_rarg1, arg1); 6022 } 6023 if (arg2 != noreg) { 6024 __ movptr(c_rarg2, arg2); 6025 } 6026 __ movptr(c_rarg0, r15_thread); 6027 BLOCK_COMMENT("call runtime_entry"); 6028 __ call(RuntimeAddress(runtime_entry)); 6029 6030 // Generate oop map 6031 OopMap* map = new OopMap(framesize, 0); 6032 6033 oop_maps->add_gc_map(the_pc - start, map); 6034 6035 __ reset_last_Java_frame(true); 6036 6037 __ leave(); // required for proper stackwalking of RuntimeStub frame 6038 6039 // check for pending exceptions 6040 #ifdef ASSERT 6041 Label L; 6042 __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), 6043 (int32_t) NULL_WORD); 6044 __ jcc(Assembler::notEqual, L); 6045 __ should_not_reach_here(); 6046 __ bind(L); 6047 #endif // ASSERT 6048 __ 
jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 6049 6050 6051 // codeBlob framesize is in words (not VMRegImpl::slot_size) 6052 RuntimeStub* stub = 6053 RuntimeStub::new_runtime_stub(name, 6054 &code, 6055 frame_complete, 6056 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 6057 oop_maps, false); 6058 return stub->entry_point(); 6059 } 6060 6061 void create_control_words() { 6062 // Round to nearest, 53-bit mode, exceptions masked 6063 StubRoutines::_fpu_cntrl_wrd_std = 0x027F; 6064 // Round to zero, 53-bit mode, exception mased 6065 StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F; 6066 // Round to nearest, 24-bit mode, exceptions masked 6067 StubRoutines::_fpu_cntrl_wrd_24 = 0x007F; 6068 // Round to nearest, 64-bit mode, exceptions masked 6069 StubRoutines::_mxcsr_std = 0x1F80; 6070 // Note: the following two constants are 80-bit values 6071 // layout is critical for correct loading by FPU. 6072 // Bias for strict fp multiply/divide 6073 StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000 6074 StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000; 6075 StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff; 6076 // Un-Bias for strict fp multiply/divide 6077 StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000 6078 StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000; 6079 StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff; 6080 } 6081 6082 // Initialization 6083 void generate_initial() { 6084 // Generates all stubs and initializes the entry points 6085 6086 // This platform-specific settings are needed by generate_call_stub() 6087 create_control_words(); 6088 6089 // entry points that exist in all platforms Note: This is code 6090 // that could be shared among different platforms - however the 6091 // benefit seems to be smaller than the disadvantage of having a 6092 // much more complicated generator structure. See also comment in 6093 // stubRoutines.hpp. 6094 6095 StubRoutines::_forward_exception_entry = generate_forward_exception(); 6096 6097 StubRoutines::_call_stub_entry = 6098 generate_call_stub(StubRoutines::_call_stub_return_address); 6099 6100 // is referenced by megamorphic call 6101 StubRoutines::_catch_exception_entry = generate_catch_exception(); 6102 6103 // atomic calls 6104 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); 6105 StubRoutines::_atomic_xchg_long_entry = generate_atomic_xchg_long(); 6106 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); 6107 StubRoutines::_atomic_cmpxchg_byte_entry = generate_atomic_cmpxchg_byte(); 6108 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); 6109 StubRoutines::_atomic_add_entry = generate_atomic_add(); 6110 StubRoutines::_atomic_add_long_entry = generate_atomic_add_long(); 6111 StubRoutines::_fence_entry = generate_orderaccess_fence(); 6112 6113 // platform dependent 6114 StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp(); 6115 StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp(); 6116 6117 StubRoutines::x86::_verify_mxcsr_entry = generate_verify_mxcsr(); 6118 6119 // Build this early so it's available for the interpreter. 
6120 StubRoutines::_throw_StackOverflowError_entry = 6121 generate_throw_exception("StackOverflowError throw_exception", 6122 CAST_FROM_FN_PTR(address, 6123 SharedRuntime:: 6124 throw_StackOverflowError)); 6125 StubRoutines::_throw_delayed_StackOverflowError_entry = 6126 generate_throw_exception("delayed StackOverflowError throw_exception", 6127 CAST_FROM_FN_PTR(address, 6128 SharedRuntime:: 6129 throw_delayed_StackOverflowError)); 6130 if (UseCRC32Intrinsics) { 6131 // set table address before stub generation which use it 6132 StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table; 6133 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 6134 } 6135 6136 if (UseCRC32CIntrinsics) { 6137 bool supports_clmul = VM_Version::supports_clmul(); 6138 StubRoutines::x86::generate_CRC32C_table(supports_clmul); 6139 StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table; 6140 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul); 6141 } 6142 if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) { 6143 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) || 6144 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) || 6145 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) { 6146 StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF; 6147 StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2; 6148 StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4; 6149 StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable; 6150 StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2; 6151 StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3; 6152 StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1; 6153 StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE; 6154 StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4; 6155 StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV; 6156 StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK; 6157 StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1; 6158 StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3; 6159 StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO; 6160 } 6161 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) { 6162 StubRoutines::_dexp = generate_libmExp(); 6163 } 6164 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 6165 StubRoutines::_dlog = generate_libmLog(); 6166 } 6167 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) { 6168 StubRoutines::_dlog10 = generate_libmLog10(); 6169 } 6170 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) { 6171 StubRoutines::_dpow = generate_libmPow(); 6172 } 6173 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 6174 StubRoutines::_dsin = generate_libmSin(); 6175 } 6176 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 6177 StubRoutines::_dcos = generate_libmCos(); 6178 } 6179 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) { 6180 StubRoutines::_dtan = generate_libmTan(); 6181 } 6182 } 6183 } 6184 6185 void generate_all() { 6186 // Generates all stubs and initializes the entry points 6187 6188 // These entry points require SharedInfo::stack0 to be set up in 6189 // non-core builds and need to be relocatable, so they each 6190 // fabricate a RuntimeStub internally. 
6191 StubRoutines::_throw_AbstractMethodError_entry = 6192 generate_throw_exception("AbstractMethodError throw_exception", 6193 CAST_FROM_FN_PTR(address, 6194 SharedRuntime:: 6195 throw_AbstractMethodError)); 6196 6197 StubRoutines::_throw_IncompatibleClassChangeError_entry = 6198 generate_throw_exception("IncompatibleClassChangeError throw_exception", 6199 CAST_FROM_FN_PTR(address, 6200 SharedRuntime:: 6201 throw_IncompatibleClassChangeError)); 6202 6203 StubRoutines::_throw_NullPointerException_at_call_entry = 6204 generate_throw_exception("NullPointerException at call throw_exception", 6205 CAST_FROM_FN_PTR(address, 6206 SharedRuntime:: 6207 throw_NullPointerException_at_call)); 6208 6209 // entry points that are platform specific 6210 StubRoutines::x86::_f2i_fixup = generate_f2i_fixup(); 6211 StubRoutines::x86::_f2l_fixup = generate_f2l_fixup(); 6212 StubRoutines::x86::_d2i_fixup = generate_d2i_fixup(); 6213 StubRoutines::x86::_d2l_fixup = generate_d2l_fixup(); 6214 6215 StubRoutines::x86::_float_sign_mask = generate_fp_mask("float_sign_mask", 0x7FFFFFFF7FFFFFFF); 6216 StubRoutines::x86::_float_sign_flip = generate_fp_mask("float_sign_flip", 0x8000000080000000); 6217 StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF); 6218 StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000); 6219 StubRoutines::x86::_vector_float_sign_mask = generate_vector_fp_mask("vector_float_sign_mask", 0x7FFFFFFF7FFFFFFF); 6220 StubRoutines::x86::_vector_float_sign_flip = generate_vector_fp_mask("vector_float_sign_flip", 0x8000000080000000); 6221 StubRoutines::x86::_vector_double_sign_mask = generate_vector_fp_mask("vector_double_sign_mask", 0x7FFFFFFFFFFFFFFF); 6222 StubRoutines::x86::_vector_double_sign_flip = generate_vector_fp_mask("vector_double_sign_flip", 0x8000000000000000); 6223 StubRoutines::x86::_vector_all_bits_set = generate_vector_fp_mask("vector_all_bits_set", 0xFFFFFFFFFFFFFFFF); 6224 StubRoutines::x86::_vector_byte_bitset = generate_vector_fp_mask("vector_byte_bitset", 0x0101010101010101); 6225 StubRoutines::x86::_vector_long_perm_mask = generate_vector_custom_i32("vector_long_perm_mask", Assembler::AVX_512bit, 6226 0, 2, 4, 6, 8, 10, 12, 14); 6227 StubRoutines::x86::_vector_short_to_byte_mask = generate_vector_fp_mask("vector_short_to_byte_mask", 0x00ff00ff00ff00ff); 6228 StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask"); 6229 StubRoutines::x86::_vector_int_to_byte_mask = generate_vector_fp_mask("vector_int_to_byte_mask", 0x000000ff000000ff); 6230 StubRoutines::x86::_vector_int_to_short_mask = generate_vector_fp_mask("vector_int_to_short_mask", 0x0000ffff0000ffff); 6231 StubRoutines::x86::_vector_32_bit_mask = generate_vector_custom_i32("vector_32_bit_mask", Assembler::AVX_512bit, 6232 0xFFFFFFFF, 0, 0, 0); 6233 StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit, 6234 0xFFFFFFFF, 0xFFFFFFFF, 0, 0); 6235 StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_fp_mask("vector_int_shuffle_mask", 0x0302010003020100); 6236 StubRoutines::x86::_vector_int_size_mask = generate_vector_fp_mask("vector_int_size_mask", 0x0000000400000004); 6237 StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_fp_mask("vector_short_shuffle_mask", 0x0100010001000100); 6238 StubRoutines::x86::_vector_short_size_mask = generate_vector_fp_mask("vector_short_size_mask", 0x0002000200020002); 6239 
    StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_fp_mask("vector_long_shuffle_mask", 0x0000000100000000);
    StubRoutines::x86::_vector_long_size_mask = generate_vector_fp_mask("vector_long_size_mask", 0x0000000200000002);

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // don't bother generating these AES intrinsic stubs unless global flag is set
    if (UseAESIntrinsics) {
      StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      if (VM_Version::supports_vaes() && VM_Version::supports_avx512vl() && VM_Version::supports_avx512dq()) {
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptVectorAESCrypt();
      } else {
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
      }
    }
    if (UseAESCTRIntrinsics) {
      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
      StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
      // _k256_W holds a doubled-up copy of the _k256 round constants: each 16-byte
      // group is stored twice so that it can be loaded as a full 32-byte row.
      char* dst = (char*)StubRoutines::x86::_k256_W;
      char* src = (char*)StubRoutines::x86::_k256;
      for (int ii = 0; ii < 16; ++ii) {
        memcpy(dst + 32 * ii,      src + 16 * ii, 16);
        memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
      }
      StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
      StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
    }

    // Generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
      StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
      if (VM_Version::supports_avx()) {
        StubRoutines::x86::_ghash_shuffmask_addr = ghash_shufflemask_addr();
        StubRoutines::x86::_ghash_poly_addr = ghash_polynomial_addr();
        StubRoutines::_ghash_processBlocks = generate_avx_ghash_processBlocks();
      } else {
        StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      }
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::x86::_and_mask = base64_and_mask_addr();
      StubRoutines::x86::_bswap_mask = base64_bswap_mask_addr();
      StubRoutines::x86::_base64_charset = base64_charset_addr();
      StubRoutines::x86::_url_charset = base64url_charset_addr();
      StubRoutines::x86::_gather_mask = base64_gather_mask_addr();
      StubRoutines::x86::_left_shift_mask = base64_left_shift_mask_addr();
      StubRoutines::x86::_right_shift_mask = base64_right_shift_mask_addr();
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
    }
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }
    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }
#ifndef _WINDOWS
    if (UseMontgomeryMultiplyIntrinsic) {
      StubRoutines::_montgomeryMultiply
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
    }
    if (UseMontgomerySquareIntrinsic) {
      StubRoutines::_montgomerySquare
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    }
#endif // !_WINDOWS
#endif // COMPILER2

    if (UseVectorizedMismatchIntrinsic) {
      StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
    }

#ifdef __VECTOR_API_MATH_INTRINSICS_COMMON
    // Wire the Vector API math stubs to the SVML assembly routines declared at the
    // top of this file.  The suffix of each routine name encodes the ISA level the
    // variant targets (_z0: AVX-512, _l9: AVX2, _e9: AVX, _ex: SSE2; this mapping
    // is assumed from the SVML naming convention and is not stated elsewhere here).
    if (UseVectorApiIntrinsics) {
      if (UseAVX >= 1) {
#if defined(__VECTOR_API_MATH_INTRINSICS_LINUX)
        if (UseAVX > 2) { // AVX-512: _z0 variants
          StubRoutines::_vector_float512_exp = CAST_FROM_FN_PTR(address, __svml_expf16_ha_z0);
          StubRoutines::_vector_double512_exp = CAST_FROM_FN_PTR(address, __svml_exp8_ha_z0);
          StubRoutines::_vector_float512_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f16_ha_z0);
          StubRoutines::_vector_double512_expm1 = CAST_FROM_FN_PTR(address, __svml_expm18_ha_z0);
          StubRoutines::_vector_float512_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf16_ha_z0);
          StubRoutines::_vector_double512_log1p = CAST_FROM_FN_PTR(address, __svml_log1p8_ha_z0);
          StubRoutines::_vector_float512_log = CAST_FROM_FN_PTR(address, __svml_logf16_ha_z0);
          StubRoutines::_vector_double512_log = CAST_FROM_FN_PTR(address, __svml_log8_ha_z0);
          StubRoutines::_vector_float512_log10 = CAST_FROM_FN_PTR(address, __svml_log10f16_ha_z0);
          StubRoutines::_vector_double512_log10 = CAST_FROM_FN_PTR(address, __svml_log108_ha_z0);
          StubRoutines::_vector_float512_sin = CAST_FROM_FN_PTR(address, __svml_sinf16_ha_z0);
          StubRoutines::_vector_double512_sin = CAST_FROM_FN_PTR(address, __svml_sin8_ha_z0);
          StubRoutines::_vector_float512_cos = CAST_FROM_FN_PTR(address, __svml_cosf16_ha_z0);
          StubRoutines::_vector_double512_cos = CAST_FROM_FN_PTR(address, __svml_cos8_ha_z0);
          StubRoutines::_vector_float512_tan = CAST_FROM_FN_PTR(address, __svml_tanf16_ha_z0);
          StubRoutines::_vector_double512_tan = CAST_FROM_FN_PTR(address, __svml_tan8_ha_z0);
          StubRoutines::_vector_float512_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf16_ha_z0);
          StubRoutines::_vector_double512_sinh = CAST_FROM_FN_PTR(address, __svml_sinh8_ha_z0);
          StubRoutines::_vector_float512_cosh = CAST_FROM_FN_PTR(address, __svml_coshf16_ha_z0);
          StubRoutines::_vector_double512_cosh = CAST_FROM_FN_PTR(address, __svml_cosh8_ha_z0);
          StubRoutines::_vector_float512_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf16_ha_z0);
          StubRoutines::_vector_double512_tanh = CAST_FROM_FN_PTR(address, __svml_tanh8_ha_z0);
          StubRoutines::_vector_float512_acos = CAST_FROM_FN_PTR(address, __svml_acosf16_ha_z0);
          StubRoutines::_vector_double512_acos = CAST_FROM_FN_PTR(address, __svml_acos8_ha_z0);
          StubRoutines::_vector_float512_asin = CAST_FROM_FN_PTR(address, __svml_asinf16_ha_z0);
          StubRoutines::_vector_double512_asin = CAST_FROM_FN_PTR(address, __svml_asin8_ha_z0);
          StubRoutines::_vector_float512_atan = CAST_FROM_FN_PTR(address, __svml_atanf16_ha_z0);
          StubRoutines::_vector_double512_atan = CAST_FROM_FN_PTR(address, __svml_atan8_ha_z0);
          StubRoutines::_vector_float512_pow = CAST_FROM_FN_PTR(address, __svml_powf16_ha_z0);
          StubRoutines::_vector_double512_pow = CAST_FROM_FN_PTR(address, __svml_pow8_ha_z0);
          StubRoutines::_vector_float512_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf16_ha_z0);
          StubRoutines::_vector_double512_hypot = CAST_FROM_FN_PTR(address, __svml_hypot8_ha_z0);
          StubRoutines::_vector_float512_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf16_ha_z0);
          StubRoutines::_vector_double512_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt8_ha_z0);
          StubRoutines::_vector_float512_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f16_ha_z0);
          StubRoutines::_vector_double512_atan2 = CAST_FROM_FN_PTR(address, __svml_atan28_ha_z0);
        }
#endif
        if (UseAVX == 1) { // AVX: _e9 variants
          StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_e9);
          StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_e9);
          StubRoutines::_vector_float256_exp = CAST_FROM_FN_PTR(address, __svml_expf8_ha_e9);
          StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_e9);
          StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_e9);
          StubRoutines::_vector_double256_exp = CAST_FROM_FN_PTR(address, __svml_exp4_ha_e9);
          StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_e9);
          StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_e9);
          StubRoutines::_vector_float256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f8_ha_e9);
          StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_e9);
          StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_e9);
          StubRoutines::_vector_double256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm14_ha_e9);
          StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_e9);
          StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_e9);
          StubRoutines::_vector_float256_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf8_ha_e9);
          StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_e9);
          StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_e9);
          StubRoutines::_vector_double256_log1p = CAST_FROM_FN_PTR(address, __svml_log1p4_ha_e9);
          StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_e9);
          StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_e9);
          StubRoutines::_vector_float256_log = CAST_FROM_FN_PTR(address, __svml_logf8_ha_e9);
          StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_e9);
          StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_e9);
          StubRoutines::_vector_double256_log = CAST_FROM_FN_PTR(address, __svml_log4_ha_e9);
          StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_e9);
          StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_e9);
          StubRoutines::_vector_float256_log10 = CAST_FROM_FN_PTR(address, __svml_log10f8_ha_e9);
          StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_e9);
          StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_e9);
          StubRoutines::_vector_double256_log10 = CAST_FROM_FN_PTR(address, __svml_log104_ha_e9);
          StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_e9);
          StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_e9);
          StubRoutines::_vector_float256_sin = CAST_FROM_FN_PTR(address, __svml_sinf8_ha_e9);
          StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_e9);
          StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_e9);
          StubRoutines::_vector_double256_sin = CAST_FROM_FN_PTR(address, __svml_sin4_ha_e9);
          StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_e9);
          StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_e9);
          StubRoutines::_vector_float256_cos = CAST_FROM_FN_PTR(address, __svml_cosf8_ha_e9);
          StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_e9);
          StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_e9);
          StubRoutines::_vector_double256_cos = CAST_FROM_FN_PTR(address, __svml_cos4_ha_e9);
          StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_e9);
          StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_e9);
          StubRoutines::_vector_float256_tan = CAST_FROM_FN_PTR(address, __svml_tanf8_ha_e9);
          StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_e9);
          StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_e9);
          StubRoutines::_vector_double256_tan = CAST_FROM_FN_PTR(address, __svml_tan4_ha_e9);
          StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_e9);
          StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_e9);
          StubRoutines::_vector_float256_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf8_ha_e9);
          StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_e9);
          StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_e9);
          StubRoutines::_vector_double256_sinh = CAST_FROM_FN_PTR(address, __svml_sinh4_ha_e9);
          StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_e9);
          StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_e9);
          StubRoutines::_vector_float256_cosh = CAST_FROM_FN_PTR(address, __svml_coshf8_ha_e9);
          StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_e9);
          StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_e9);
          StubRoutines::_vector_double256_cosh = CAST_FROM_FN_PTR(address, __svml_cosh4_ha_e9);
          StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_e9);
          StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_e9);
          StubRoutines::_vector_float256_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf8_ha_e9);
          StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_e9);
          StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_e9);
          StubRoutines::_vector_double256_tanh = CAST_FROM_FN_PTR(address, __svml_tanh4_ha_e9);
          StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_e9);
          StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_e9);
          StubRoutines::_vector_float256_acos = CAST_FROM_FN_PTR(address, __svml_acosf8_ha_e9);
          StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_e9);
          StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_e9);
          StubRoutines::_vector_double256_acos = CAST_FROM_FN_PTR(address, __svml_acos4_ha_e9);
          StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_e9);
          StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_e9);
          StubRoutines::_vector_float256_asin = CAST_FROM_FN_PTR(address, __svml_asinf8_ha_e9);
          StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_e9);
          StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_e9);
          StubRoutines::_vector_double256_asin = CAST_FROM_FN_PTR(address, __svml_asin4_ha_e9);
          StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_e9);
          StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_e9);
          StubRoutines::_vector_float256_atan = CAST_FROM_FN_PTR(address, __svml_atanf8_ha_e9);
          StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_e9);
          StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_e9);
          StubRoutines::_vector_double256_atan = CAST_FROM_FN_PTR(address, __svml_atan4_ha_e9);
          StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
          StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_e9);
          StubRoutines::_vector_float256_pow = CAST_FROM_FN_PTR(address, __svml_powf8_ha_e9);
          StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_e9);
          StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_e9);
          StubRoutines::_vector_double256_pow = CAST_FROM_FN_PTR(address, __svml_pow4_ha_e9);
          StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
          StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_e9);
          StubRoutines::_vector_float256_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_e9);
          StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_e9);
          StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_e9);
          StubRoutines::_vector_double256_hypot = CAST_FROM_FN_PTR(address, __svml_hypot4_ha_e9);
          StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_e9);
          StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_e9);
          StubRoutines::_vector_float256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf8_ha_e9);
          StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_e9);
          StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_e9);
          StubRoutines::_vector_double256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt4_ha_e9);
          StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_e9);
          StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_e9);
          StubRoutines::_vector_float256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f8_ha_e9);
          StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_e9);
          StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_e9);
          StubRoutines::_vector_double256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan24_ha_e9);
        } else { // AVX2 and above: _l9 variants
          StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_l9);
          StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_l9);
          StubRoutines::_vector_float256_exp = CAST_FROM_FN_PTR(address, __svml_expf8_ha_l9);
          StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_l9);
          StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_l9);
          StubRoutines::_vector_double256_exp = CAST_FROM_FN_PTR(address, __svml_exp4_ha_l9);
          StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_l9);
          StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_l9);
          StubRoutines::_vector_float256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f8_ha_l9);
          StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_l9);
          StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_l9);
          StubRoutines::_vector_double256_expm1 = CAST_FROM_FN_PTR(address, __svml_expm14_ha_l9);
          StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_l9);
          StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_l9);
          StubRoutines::_vector_float256_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf8_ha_l9);
          StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_l9);
          StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_l9);
          StubRoutines::_vector_double256_log1p = CAST_FROM_FN_PTR(address, __svml_log1p4_ha_l9);
          StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_l9);
          StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_l9);
          StubRoutines::_vector_float256_log = CAST_FROM_FN_PTR(address, __svml_logf8_ha_l9);
          StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_l9);
          StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_l9);
          StubRoutines::_vector_double256_log = CAST_FROM_FN_PTR(address, __svml_log4_ha_l9);
          StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_l9);
          StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_l9);
          StubRoutines::_vector_float256_log10 = CAST_FROM_FN_PTR(address, __svml_log10f8_ha_l9);
          StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_l9);
          StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_l9);
          StubRoutines::_vector_double256_log10 = CAST_FROM_FN_PTR(address, __svml_log104_ha_l9);
          StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_l9);
          StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_l9);
          StubRoutines::_vector_float256_sin = CAST_FROM_FN_PTR(address, __svml_sinf8_ha_l9);
          StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_l9);
          StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_l9);
          StubRoutines::_vector_double256_sin = CAST_FROM_FN_PTR(address, __svml_sin4_ha_l9);
          StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_l9);
          StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_l9);
          StubRoutines::_vector_float256_cos = CAST_FROM_FN_PTR(address, __svml_cosf8_ha_l9);
          StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_l9);
          StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_l9);
          StubRoutines::_vector_double256_cos = CAST_FROM_FN_PTR(address, __svml_cos4_ha_l9);
          StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_l9);
          StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_l9);
          StubRoutines::_vector_float256_tan = CAST_FROM_FN_PTR(address, __svml_tanf8_ha_l9);
          StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_l9);
          StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_l9);
          StubRoutines::_vector_double256_tan = CAST_FROM_FN_PTR(address, __svml_tan4_ha_l9);
          StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_l9);
          StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_l9);
          StubRoutines::_vector_float256_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf8_ha_l9);
          StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_l9);
          StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_l9);
          StubRoutines::_vector_double256_sinh = CAST_FROM_FN_PTR(address, __svml_sinh4_ha_l9);
          StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_l9);
          StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_l9);
          StubRoutines::_vector_float256_cosh = CAST_FROM_FN_PTR(address, __svml_coshf8_ha_l9);
          StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_l9);
          StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_l9);
          StubRoutines::_vector_double256_cosh = CAST_FROM_FN_PTR(address, __svml_cosh4_ha_l9);
          StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_l9);
          StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_l9);
          StubRoutines::_vector_float256_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf8_ha_l9);
          StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_l9);
          StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_l9);
          StubRoutines::_vector_double256_tanh = CAST_FROM_FN_PTR(address, __svml_tanh4_ha_l9);
          StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_l9);
          StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_l9);
          StubRoutines::_vector_float256_acos = CAST_FROM_FN_PTR(address, __svml_acosf8_ha_l9);
          StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_l9);
          StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_l9);
          StubRoutines::_vector_double256_acos = CAST_FROM_FN_PTR(address, __svml_acos4_ha_l9);
          StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_l9);
          StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_l9);
          StubRoutines::_vector_float256_asin = CAST_FROM_FN_PTR(address, __svml_asinf8_ha_l9);
          StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_l9);
          StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_l9);
          StubRoutines::_vector_double256_asin = CAST_FROM_FN_PTR(address, __svml_asin4_ha_l9);
          StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_l9);
          StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_l9);
          StubRoutines::_vector_float256_atan = CAST_FROM_FN_PTR(address, __svml_atanf8_ha_l9);
          StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_l9);
          StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_l9);
          StubRoutines::_vector_double256_atan = CAST_FROM_FN_PTR(address, __svml_atan4_ha_l9);
          StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
          StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_l9);
          StubRoutines::_vector_float256_pow = CAST_FROM_FN_PTR(address, __svml_powf8_ha_l9);
          StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_l9);
          StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_l9);
          StubRoutines::_vector_double256_pow = CAST_FROM_FN_PTR(address, __svml_pow4_ha_l9);
          StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
          StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_l9);
          StubRoutines::_vector_float256_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf8_ha_l9);
          StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_l9);
          StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_l9);
          StubRoutines::_vector_double256_hypot = CAST_FROM_FN_PTR(address, __svml_hypot4_ha_l9);
          StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_l9);
          StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_l9);
          StubRoutines::_vector_float256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf8_ha_l9);
          StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_l9);
          StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_l9);
          StubRoutines::_vector_double256_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt4_ha_l9);
          StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_l9);
          StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_l9);
          StubRoutines::_vector_float256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f8_ha_l9);
          StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_l9);
          StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_l9);
          StubRoutines::_vector_double256_atan2 = CAST_FROM_FN_PTR(address, __svml_atan24_ha_l9);
        }

      } else if (UseSSE >= 2) { // SSE2: _ex variants (64/128-bit vectors only)
        StubRoutines::_vector_float64_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_ex);
        StubRoutines::_vector_float128_exp = CAST_FROM_FN_PTR(address, __svml_expf4_ha_ex);
        StubRoutines::_vector_double64_exp = CAST_FROM_FN_PTR(address, __svml_exp1_ha_ex);
        StubRoutines::_vector_double128_exp = CAST_FROM_FN_PTR(address, __svml_exp2_ha_ex);
        StubRoutines::_vector_float64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_ex);
        StubRoutines::_vector_float128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm1f4_ha_ex);
        StubRoutines::_vector_double64_expm1 = CAST_FROM_FN_PTR(address, __svml_expm11_ha_ex);
        StubRoutines::_vector_double128_expm1 = CAST_FROM_FN_PTR(address, __svml_expm12_ha_ex);
        StubRoutines::_vector_float64_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_ex);
        StubRoutines::_vector_float128_acos = CAST_FROM_FN_PTR(address, __svml_acosf4_ha_ex);
        StubRoutines::_vector_double64_acos = CAST_FROM_FN_PTR(address, __svml_acos1_ha_ex);
        StubRoutines::_vector_double128_acos = CAST_FROM_FN_PTR(address, __svml_acos2_ha_ex);
        StubRoutines::_vector_float64_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_ex);
        StubRoutines::_vector_float128_asin = CAST_FROM_FN_PTR(address, __svml_asinf4_ha_ex);
        StubRoutines::_vector_double64_asin = CAST_FROM_FN_PTR(address, __svml_asin1_ha_ex);
        StubRoutines::_vector_double128_asin = CAST_FROM_FN_PTR(address, __svml_asin2_ha_ex);
        StubRoutines::_vector_float64_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_ex);
        StubRoutines::_vector_float128_atan = CAST_FROM_FN_PTR(address, __svml_atanf4_ha_ex);
        StubRoutines::_vector_double64_atan = CAST_FROM_FN_PTR(address, __svml_atan1_ha_ex);
        StubRoutines::_vector_double128_atan = CAST_FROM_FN_PTR(address, __svml_atan2_ha_ex);
        StubRoutines::_vector_float64_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_ex);
        StubRoutines::_vector_float128_sin = CAST_FROM_FN_PTR(address, __svml_sinf4_ha_ex);
        StubRoutines::_vector_double64_sin = CAST_FROM_FN_PTR(address, __svml_sin1_ha_ex);
        StubRoutines::_vector_double128_sin = CAST_FROM_FN_PTR(address, __svml_sin2_ha_ex);
        StubRoutines::_vector_float64_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_ex);
        StubRoutines::_vector_float128_cos = CAST_FROM_FN_PTR(address, __svml_cosf4_ha_ex);
        StubRoutines::_vector_double64_cos = CAST_FROM_FN_PTR(address, __svml_cos1_ha_ex);
        StubRoutines::_vector_double128_cos = CAST_FROM_FN_PTR(address, __svml_cos2_ha_ex);
        StubRoutines::_vector_float64_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_ex);
        StubRoutines::_vector_float128_tan = CAST_FROM_FN_PTR(address, __svml_tanf4_ha_ex);
        StubRoutines::_vector_double64_tan = CAST_FROM_FN_PTR(address, __svml_tan1_ha_ex);
        StubRoutines::_vector_double128_tan = CAST_FROM_FN_PTR(address, __svml_tan2_ha_ex);
        StubRoutines::_vector_float64_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_ex);
        StubRoutines::_vector_float128_sinh = CAST_FROM_FN_PTR(address, __svml_sinhf4_ha_ex);
        StubRoutines::_vector_double64_sinh = CAST_FROM_FN_PTR(address, __svml_sinh1_ha_ex);
        StubRoutines::_vector_double128_sinh = CAST_FROM_FN_PTR(address, __svml_sinh2_ha_ex);
        StubRoutines::_vector_float64_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_ex);
        StubRoutines::_vector_float128_cosh = CAST_FROM_FN_PTR(address, __svml_coshf4_ha_ex);
        StubRoutines::_vector_double64_cosh = CAST_FROM_FN_PTR(address, __svml_cosh1_ha_ex);
        StubRoutines::_vector_double128_cosh = CAST_FROM_FN_PTR(address, __svml_cosh2_ha_ex);
        StubRoutines::_vector_float64_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_ex);
        StubRoutines::_vector_float128_tanh = CAST_FROM_FN_PTR(address, __svml_tanhf4_ha_ex);
        StubRoutines::_vector_double64_tanh = CAST_FROM_FN_PTR(address, __svml_tanh1_ha_ex);
        StubRoutines::_vector_double128_tanh = CAST_FROM_FN_PTR(address, __svml_tanh2_ha_ex);
        StubRoutines::_vector_float64_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_ex);
        StubRoutines::_vector_float128_log = CAST_FROM_FN_PTR(address, __svml_logf4_ha_ex);
        StubRoutines::_vector_double64_log = CAST_FROM_FN_PTR(address, __svml_log1_ha_ex);
        StubRoutines::_vector_double128_log = CAST_FROM_FN_PTR(address, __svml_log2_ha_ex);
        StubRoutines::_vector_float64_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_ex);
        StubRoutines::_vector_float128_log10 = CAST_FROM_FN_PTR(address, __svml_log10f4_ha_ex);
        StubRoutines::_vector_double64_log10 = CAST_FROM_FN_PTR(address, __svml_log101_ha_ex);
        StubRoutines::_vector_double128_log10 = CAST_FROM_FN_PTR(address, __svml_log102_ha_ex);
        StubRoutines::_vector_float64_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_ex);
        StubRoutines::_vector_float128_log1p = CAST_FROM_FN_PTR(address, __svml_log1pf4_ha_ex);
        StubRoutines::_vector_double64_log1p = CAST_FROM_FN_PTR(address, __svml_log1p1_ha_ex);
        StubRoutines::_vector_double128_log1p = CAST_FROM_FN_PTR(address, __svml_log1p2_ha_ex);
        StubRoutines::_vector_float64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_ex);
        StubRoutines::_vector_float128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan2f4_ha_ex);
        StubRoutines::_vector_double64_atan2 = CAST_FROM_FN_PTR(address, __svml_atan21_ha_ex);
        StubRoutines::_vector_double128_atan2 = CAST_FROM_FN_PTR(address, __svml_atan22_ha_ex);
        StubRoutines::_vector_float64_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_ex);
        StubRoutines::_vector_float128_hypot = CAST_FROM_FN_PTR(address, __svml_hypotf4_ha_ex);
        StubRoutines::_vector_double64_hypot = CAST_FROM_FN_PTR(address, __svml_hypot1_ha_ex);
        StubRoutines::_vector_double128_hypot = CAST_FROM_FN_PTR(address, __svml_hypot2_ha_ex);
        StubRoutines::_vector_float64_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);
        StubRoutines::_vector_float128_pow = CAST_FROM_FN_PTR(address, __svml_powf4_ha_ex);
        StubRoutines::_vector_double64_pow = CAST_FROM_FN_PTR(address, __svml_pow1_ha_ex);
        StubRoutines::_vector_double128_pow = CAST_FROM_FN_PTR(address, __svml_pow2_ha_ex);
        StubRoutines::_vector_float64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);
        StubRoutines::_vector_float128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrtf4_ha_ex);
        StubRoutines::_vector_double64_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt1_ha_ex);
        StubRoutines::_vector_double128_cbrt = CAST_FROM_FN_PTR(address, __svml_cbrt2_ha_ex);
      }
    }
#endif
  }

 public:
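  // Emit either the initial subset of stubs (all == false) or the complete set
  // (all == true) into the supplied CodeBuffer.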
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
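
// Note (assumption, not verified in this file): StubGenerator_generate is invoked
// twice during VM startup, first with all == false for the initial stubs and later
// with all == true for the remaining ones; see StubRoutines::initialize1() and
// StubRoutines::initialize2() in runtime/stubRoutines.cpp.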