src/cpu/x86/vm/stubGenerator_x86_64.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File 8004835 Sdiff src/cpu/x86/vm

src/cpu/x86/vm/stubGenerator_x86_64.cpp

Print this page




2936   address generate_key_shuffle_mask() {
2937     __ align(16);
2938     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2939     address start = __ pc();
2940     __ emit_data64( 0x0405060700010203, relocInfo::none );
2941     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
2942     return start;
2943   }
2944 
2945   // Utility routine for loading a 128-bit key word in little endian format
2946   // can optionally specify that the shuffle mask is already in an xmmregister
2947   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2948     __ movdqu(xmmdst, Address(key, offset));
2949     if (xmm_shuf_mask != NULL) {
2950       __ pshufb(xmmdst, xmm_shuf_mask);
2951     } else {
2952       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2953     }
2954   }
2955 
  // aesenc using specified key+offset
  // can optionally specify that the shuffle mask is already in an xmmregister
  void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    // load the round key at key+offset into xmmtmp (clobbered), then run one AES encrypt round
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesenc(xmmdst, xmmtmp);
  }
2962 
  // aesdec using specified key+offset
  // can optionally specify that the shuffle mask is already in an xmmregister
  void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    // load the round key at key+offset into xmmtmp (clobbered), then run one AES decrypt round
    load_key(xmmtmp, key, offset, xmm_shuf_mask);
    __ aesdec(xmmdst, xmmtmp);
  }
2969 
2970 
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rax;

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
    // keylen is now the number of extra round keys: 0 (128-bit), 2 (192-bit) or 4 (256-bit)

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input

    // For encryption, the java expanded key ordering is just what we need
    // we don't know if the key is aligned, hence not using load-execute form

    // initial whitening: xor the plaintext block with round key 0
    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp);
    // rounds 1..9 are common to all key sizes
    for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
      aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
    load_key  (xmm_temp, key, 0xa0, xmm_key_shuf_mask);
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);   // 128-bit key: 0xa0 is the final round key
    __ aesenc(xmm_result, xmm_temp);                   // only in 192 and 256 bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);   // 192-bit key: 0xc0 is the final round key
    __ aesenc(xmm_result, xmm_temp);                   // only in 256 bit keys
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);  // 256-bit key: 0xe0 is the final round key

    __ BIND(L_doLast);
    // final round: xmm_temp holds the size-appropriate last round key
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, 0), xmm_result);        // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
3033 
3034 
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rax;

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    const XMMRegister xmm_key_shuf_mask = xmm2;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    // keylen = # of 32-bit words, convert to 128-bit words
    __ shrl(keylen, 2);
    __ subl(keylen, 11);   // every key has at least 11 128-bit words, some have more
    // keylen is now the number of extra round keys: 0 (128-bit), 2 (192-bit) or 4 (256-bit)

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of cipher input

    // for decryption java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    // we don't know if the key is aligned, hence not using load-execute form
    load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
    __ pxor  (xmm_result, xmm_temp);       // initial whitening
    // rounds common to all key sizes
    for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
      aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
    }
    __ cmpl(keylen, 0);
    __ jcc(Assembler::equal, L_doLast);
    // only in 192 and 256 bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
    __ subl(keylen, 2);
    __ jcc(Assembler::equal, L_doLast);
    // only in 256 bit keys
    aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
    aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    // for decryption the aesdeclast operation is always on key+0x00
    load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
    __ aesdeclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, 0), xmm_result);  // store the result

    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
3100 
3101 
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r10;      // pick the first volatile windows register
#endif
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // keys 0-10 preloaded into xmm2-xmm12
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 12;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved 6-12
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 2 thru 12 with key 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    // NOTE: xmm_temp (== xmm_key_shuf_mask) is clobbered per loop iteration below,
    // so the later out-of-register load_key/aes_enc_key calls pass no mask and
    // fall back to the shuffle-mask constant in memory.

    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);          // final round uses preloaded key 10

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_128);    // loop until len_reg hits 0 (flags from subptr)

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0); // return 0 (why?)
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    // keys 11 and 12 are not preloaded: fetch them from memory each iteration
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    load_key(xmm_temp, key, 0xc0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector

    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    // keys 11-14 are not preloaded: fetch them from memory each iteration
    aes_enc_key(xmm_result, xmm_temp, key, 0xb0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xc0);
    aes_enc_key(xmm_result, xmm_temp, key, 0xd0);
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);

    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
3253 
3254 
3255 
3256   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3257   // to hide instruction latency
3258   //
3259   // Arguments:
3260   //
3261   // Inputs:
3262   //   c_rarg0   - source byte array address
3263   //   c_rarg1   - destination byte array address
3264   //   c_rarg2   - K (key) in little endian int array
3265   //   c_rarg3   - r vector byte array address
3266   //   c_rarg4   - input length
3267   //
3268 
3269   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3270     assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
3271     __ align(CodeEntryAlignment);
3272     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3273     address start = __ pc();
3274 
3275     Label L_exit, L_key_192_256, L_key_256;
3276     Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
3277     Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
3278     const Register from        = c_rarg0;  // source array address
3279     const Register to          = c_rarg1;  // destination array address
3280     const Register key         = c_rarg2;  // key array address
3281     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3282                                            // and left with the results of the last encryption block
3283 #ifndef _WIN64
3284     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3285 #else
3286     const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
3287     const Register len_reg     = r10;      // pick the first volatile windows register
3288 #endif
3289     const Register pos         = rax;
3290 
3291     // xmm register assignments for the loops below
3292     const XMMRegister xmm_result = xmm0;
3293     // keys 0-10 preloaded into xmm5-xmm15
3294     const int XMM_REG_NUM_KEY_FIRST = 5;
3295     const int XMM_REG_NUM_KEY_LAST  = 15;
3296     const XMMRegister xmm_key_first   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3297     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3298 
3299     __ enter(); // required for proper stackwalking of RuntimeStub frame
3300 
3301 #ifdef _WIN64
3302     // on win64, fill len_reg from stack position
3303     __ movl(len_reg, len_mem);
3304     // save the xmm registers which must be preserved 6-15
3305     __ subptr(rsp, -rsp_after_call_off * wordSize);
3306     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3307       __ movdqu(xmm_save(i), as_XMMRegister(i));
3308     }
3309 #endif
3310     // the java expanded key ordering is rotated one position from what we want
3311     // so we start from 0x10 here and hit 0x00 last
3312     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3313     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3314     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3315     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) {
3316       if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00;
3317       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3318       offset += 0x10;
3319     }

3320 
3321     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block

3322     // registers holding the four results in the parallelized loop
3323     const XMMRegister xmm_result0 = xmm0;
3324     const XMMRegister xmm_result1 = xmm2;
3325     const XMMRegister xmm_result2 = xmm3;
3326     const XMMRegister xmm_result3 = xmm4;
3327 
3328     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3329 
3330     // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
3331     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3332     __ cmpl(rax, 44);
3333     __ jcc(Assembler::notEqual, L_key_192_256);
3334 
3335 
3336     // 128-bit code follows here, parallelized
3337     __ movptr(pos, 0);
3338     __ align(OptoLoopAlignment);
3339     __ BIND(L_multiBlock_loopTop_128);
3340     __ cmpptr(len_reg, 4*AESBlockSize);           // see if at least 4 blocks left
3341     __ jcc(Assembler::less, L_singleBlock_loopTop_128);


3359     // for each result, xor with the r vector of previous cipher block
3360     __ pxor(xmm_result0, xmm_prev_block_cipher);
3361     __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
3362     __ pxor(xmm_result1, xmm_prev_block_cipher);
3363     __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
3364     __ pxor(xmm_result2, xmm_prev_block_cipher);
3365     __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
3366     __ pxor(xmm_result3, xmm_prev_block_cipher);
3367     __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));   // this will carry over to next set of blocks
3368 
3369     __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3370     __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
3371     __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
3372     __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);
3373 
3374     __ addptr(pos, 4*AESBlockSize);
3375     __ subptr(len_reg, 4*AESBlockSize);
3376     __ jmp(L_multiBlock_loopTop_128);
3377 
3378     // registers used in the non-parallelized loops


3379     const XMMRegister xmm_prev_block_cipher_save = xmm2;
3380     const XMMRegister xmm_temp   = xmm3;


3381 
3382     __ align(OptoLoopAlignment);
3383     __ BIND(L_singleBlock_loopTop_128);
3384     __ cmpptr(len_reg, 0);           // any blocks left??
3385     __ jcc(Assembler::equal, L_exit);
3386     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
3387     __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
3388     __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
3389     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
3390       __ aesdec(xmm_result, as_XMMRegister(rnum));
3391     }
3392     __ aesdeclast(xmm_result, xmm_key_last);
3393     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
3394     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3395     // no need to store r to memory until we exit
3396     __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
3397 
3398     __ addptr(pos, AESBlockSize);
3399     __ subptr(len_reg, AESBlockSize);
3400     __ jmp(L_singleBlock_loopTop_128);
3401 
3402 
3403     __ BIND(L_exit);
3404     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3405 #ifdef _WIN64
3406     // restore regs belonging to calling function
3407     for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
3408       __ movdqu(as_XMMRegister(i), xmm_save(i));
3409     }
3410 #endif
3411     __ movl(rax, 0); // return 0 (why?)
3412     __ leave(); // required for proper stackwalking of RuntimeStub frame
3413     __ ret(0);
3414 
3415 
3416     __ BIND(L_key_192_256);
3417     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)

3418     __ cmpl(rax, 52);
3419     __ jcc(Assembler::notEqual, L_key_256);
3420 
3421     // 192-bit code follows here (could be optimized to use parallelism)

3422     __ movptr(pos, 0);
3423     __ align(OptoLoopAlignment);

3424     __ BIND(L_singleBlock_loopTop_192);
3425     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
3426     __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
3427     __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
3428     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
3429       __ aesdec(xmm_result, as_XMMRegister(rnum));
3430     }
3431     aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 192-bit key goes up to c0
3432     aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
3433     __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
3434     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
3435     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3436     // no need to store r to memory until we exit
3437     __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
3438 
3439     __ addptr(pos, AESBlockSize);
3440     __ subptr(len_reg, AESBlockSize);
3441     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
3442     __ jmp(L_exit);
3443 
3444     __ BIND(L_key_256);
3445     // 256-bit code follows here (could be optimized to use parallelism)
3446     __ movptr(pos, 0);
3447     __ align(OptoLoopAlignment);

3448     __ BIND(L_singleBlock_loopTop_256);
3449     __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
3450     __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
3451     __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
3452     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
3453       __ aesdec(xmm_result, as_XMMRegister(rnum));
3454     }
3455     aes_dec_key(xmm_result, xmm_temp, key, 0xb0);     // 256-bit key goes up to e0
3456     aes_dec_key(xmm_result, xmm_temp, key, 0xc0);
3457     aes_dec_key(xmm_result, xmm_temp, key, 0xd0);
3458     aes_dec_key(xmm_result, xmm_temp, key, 0xe0);



3459     __ aesdeclast(xmm_result, xmm_key_last);             // xmm15 came from key+0
3460     __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
3461     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3462     // no need to store r to memory until we exit
3463     __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block
3464 
3465     __ addptr(pos, AESBlockSize);
3466     __ subptr(len_reg, AESBlockSize);
3467     __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
3468     __ jmp(L_exit);
3469 
3470     return start;
3471   }
3472 
3473 
3474 
3475 #undef __
3476 #define __ masm->
3477 
3478   // Continuation point for throwing of implicit exceptions that are
3479   // not handled in the current activation. Fabricates an exception
3480   // oop and initiates normal exception dispatching in this
3481   // frame. Since we need to preserve callee-saved values (currently
3482   // only for C2, but done for C1 as well) we need a callee-saved oop
3483   // map and therefore have to make these stubs into RuntimeStubs
3484   // rather than BufferBlobs.  If the compiler needs all registers to




  // Emits the 16-byte pshufb mask constant that reverses the byte order
  // within each 32-bit word of a 128-bit value; returns its address.
  address generate_key_shuffle_mask() {
    __ align(16);
    StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
    address start = __ pc();
    __ emit_data64( 0x0405060700010203, relocInfo::none );  // low 8 bytes: 03 02 01 00 07 06 05 04
    __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );  // high 8 bytes: 0b 0a 09 08 0f 0e 0d 0c
    return start;
  }
2944 
  // Utility routine for loading a 128-bit key word in little endian format
  // can optionally specify that the shuffle mask is already in an xmmregister
  void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
    __ movdqu(xmmdst, Address(key, offset));  // unaligned load of the 16-byte round key
    if (xmm_shuf_mask != NULL) {
      __ pshufb(xmmdst, xmm_shuf_mask);       // byte-swap using the preloaded mask register
    } else {
      // no mask register supplied: shuffle against the mask constant in memory
      __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    }
  }
2955 















  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_doLast;
    address start = __ pc();

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rax;

    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_key_shuf_mask = xmm1;
    // On win64 xmm6-xmm15 must be preserved so don't use them.
    const XMMRegister xmm_temp1  = xmm2;
    const XMMRegister xmm_temp2  = xmm3;
    const XMMRegister xmm_temp3  = xmm4;
    const XMMRegister xmm_temp4  = xmm5;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
    __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input

    // For encryption, the java expanded key ordering is just what we need
    // we don't know if the key is aligned, hence not using load-execute form

    // initial whitening: xor the plaintext block with round key 0
    load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
    __ pxor(xmm_result, xmm_temp1);

    // rounds 1-4: load four round keys up front, then issue the four aesenc
    // back-to-back (presumably to overlap the key loads with the aesenc chain)
    load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    // rounds 5-8, same pattern
    load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
    load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
    load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);

    __ aesenc(xmm_result, xmm_temp1);
    __ aesenc(xmm_result, xmm_temp2);
    __ aesenc(xmm_result, xmm_temp3);
    __ aesenc(xmm_result, xmm_temp4);

    load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);

    // 128-bit key: round 9 plus aesenclast with key 0xa0, handled at L_doLast
    __ cmpl(keylen, 44);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);    // rounds 9-10 (192/256-bit keys only)
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);

    // 192-bit key: round 11 plus aesenclast with key 0xc0, handled at L_doLast
    __ cmpl(keylen, 52);
    __ jccb(Assembler::equal, L_doLast);

    __ aesenc(xmm_result, xmm_temp1);    // rounds 11-12 (256-bit keys only)
    __ aesenc(xmm_result, xmm_temp2);

    load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
    load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);

    __ BIND(L_doLast);
    // penultimate round key is in xmm_temp1, final round key in xmm_temp2
    __ aesenc(xmm_result, xmm_temp1);
    __ aesenclast(xmm_result, xmm_temp2);
    __ movdqu(Address(to, 0), xmm_result);        // store the result
    __ xorptr(rax, rax); // return 0
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    return start;
  }
3048 
3049 
3050   // Arguments:
3051   //
3052   // Inputs:
3053   //   c_rarg0   - source byte array address
3054   //   c_rarg1   - destination byte array address
3055   //   c_rarg2   - K (key) in little endian int array
3056   //
3057   address generate_aescrypt_decryptBlock() {
3058     assert(UseAES, "need AES instructions and misaligned SSE support");
3059     __ align(CodeEntryAlignment);
3060     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3061     Label L_doLast;
3062     address start = __ pc();
3063 
3064     const Register from        = c_rarg0;  // source array address
3065     const Register to          = c_rarg1;  // destination array address
3066     const Register key         = c_rarg2;  // key array address
3067     const Register keylen      = rax;
3068 
3069     const XMMRegister xmm_result = xmm0;
3070     const XMMRegister xmm_key_shuf_mask = xmm1;
3071     // On win64 xmm6-xmm15 must be preserved so don't use them.
3072     const XMMRegister xmm_temp1  = xmm2;
3073     const XMMRegister xmm_temp2  = xmm3;
3074     const XMMRegister xmm_temp3  = xmm4;
3075     const XMMRegister xmm_temp4  = xmm5;
3076 
3077     __ enter(); // required for proper stackwalking of RuntimeStub frame
3078 
3079     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3080     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));



3081 
3082     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3083     __ movdqu(xmm_result, Address(from, 0));
3084 
3085     // for decryption java expanded key ordering is rotated one position from what we want
3086     // so we start from 0x10 here and hit 0x00 last
3087     // we don't know if the key is aligned, hence not using load-execute form
3088     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3089     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3090     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3091     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3092 
3093     __ pxor  (xmm_result, xmm_temp1);
3094     __ aesdec(xmm_result, xmm_temp2);
3095     __ aesdec(xmm_result, xmm_temp3);
3096     __ aesdec(xmm_result, xmm_temp4);
3097 
3098     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3099     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3100     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3101     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3102 
3103     __ aesdec(xmm_result, xmm_temp1);
3104     __ aesdec(xmm_result, xmm_temp2);
3105     __ aesdec(xmm_result, xmm_temp3);
3106     __ aesdec(xmm_result, xmm_temp4);
3107 
3108     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3109     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3110     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3111     
3112     __ cmpl(keylen, 44);
3113     __ jccb(Assembler::equal, L_doLast);
3114 
3115     __ aesdec(xmm_result, xmm_temp1);
3116     __ aesdec(xmm_result, xmm_temp2);
3117 
3118     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3119     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3120 
3121     __ cmpl(keylen, 52);
3122     __ jccb(Assembler::equal, L_doLast);
3123 
3124     __ aesdec(xmm_result, xmm_temp1);
3125     __ aesdec(xmm_result, xmm_temp2);
3126     
3127     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3128     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3129 
3130     __ BIND(L_doLast);
3131     __ aesdec(xmm_result, xmm_temp1);
3132     __ aesdec(xmm_result, xmm_temp2);

3133 
3134     // for decryption the aesdeclast operation is always on key+0x00
3135     __ aesdeclast(xmm_result, xmm_temp3);
3136     __ movdqu(Address(to, 0), xmm_result);  // store the result

3137     __ xorptr(rax, rax); // return 0
3138     __ leave(); // required for proper stackwalking of RuntimeStub frame
3139     __ ret(0);
3140 
3141     return start;
3142   }
3143 
3144 
3145   // Arguments:
3146   //
3147   // Inputs:
3148   //   c_rarg0   - source byte array address
3149   //   c_rarg1   - destination byte array address
3150   //   c_rarg2   - K (key) in little endian int array
3151   //   c_rarg3   - r vector byte array address
3152   //   c_rarg4   - input length
3153   //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r10;      // pick the first volatile windows register
#endif
    const Register pos         = rax;

    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_temp   = xmm1;
    // keys 0-10 preloaded into xmm2-xmm12
    const int XMM_REG_NUM_KEY_FIRST = 2;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
    const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
    const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
    const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved 6-15
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif

    const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);

    // 128 bit code follows here
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_128);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key10);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    // loop while len != 0 -- relies on the ZF set by subptr above
    __ jcc(Assembler::notEqual, L_loopTop_128);

    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object

#ifdef _WIN64
    // restore xmm regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0); // return 0 (why?)
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);

    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    // xmm1 (xmm_key_shuf_mask) is still intact on this path, so keys 11/12
    // can be byte-swapped from the register copy of the mask.
    load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
    load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be changed to use more xmm registers)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_192);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    __ aesenclast(xmm_result, xmm_key12);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be changed to use more xmm registers)
    load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_loopTop_256);
    __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
    __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
    __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
      __ aesenc(xmm_result, as_XMMRegister(rnum));
    }
    // key 0xe0 cannot stay preloaded (only xmm2-xmm15 available); xmm_temp
    // (xmm1) was clobbered by the input load, so load_key must fall back to
    // the memory-resident shuffle mask here (no mask register argument).
    load_key(xmm_temp, key, 0xe0);
    __ aesenclast(xmm_result, xmm_temp);
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual, L_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
3293 
3294 
3295 
3296   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3297   // to hide instruction latency
3298   //
3299   // Arguments:
3300   //
3301   // Inputs:
3302   //   c_rarg0   - source byte array address
3303   //   c_rarg1   - destination byte array address
3304   //   c_rarg2   - K (key) in little endian int array
3305   //   c_rarg3   - r vector byte array address
3306   //   c_rarg4   - input length
3307   //
3308 
  address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
    address start = __ pc();

    Label L_exit, L_key_192_256, L_key_256;
    Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128;
    Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256;
    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
#ifndef _WIN64
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
#else
    const Address  len_mem(rsp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r10;      // pick the first volatile windows register
#endif
    const Register pos         = rax;

    // round keys preloaded into xmm5-xmm15: xmm5 = key+0x10 ... xmm14 = key+0xa0,
    // xmm15 = key+0x00 (the java expanded key ordering is rotated, see below)
    const int XMM_REG_NUM_KEY_FIRST = 5;
    const int XMM_REG_NUM_KEY_LAST  = 15;
    const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
    const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef _WIN64
    // on win64, fill len_reg from stack position
    __ movl(len_reg, len_mem);
    // save the xmm registers which must be preserved 6-15
    __ subptr(rsp, -rsp_after_call_off * wordSize);
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(xmm_save(i), as_XMMRegister(i));
    }
#endif
    // the java expanded key ordering is rotated one position from what we want
    // so we start from 0x10 here and hit 0x00 last
    const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
    // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
    for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
      load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
      offset += 0x10;
    }
    load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);

    // xmm1 is reused: shuffle mask above, previous-block cipher from here on
    const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block

    // registers holding the four results in the parallelized loop
    const XMMRegister xmm_result0 = xmm0;
    const XMMRegister xmm_result1 = xmm2;
    const XMMRegister xmm_result2 = xmm3;
    const XMMRegister xmm_result3 = xmm4;

    __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec

    // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256))
    __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
    __ cmpl(rax, 44);
    __ jcc(Assembler::notEqual, L_key_192_256);


    // 128-bit code follows here, parallelized
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);
    __ BIND(L_multiBlock_loopTop_128);
    __ cmpptr(len_reg, 4*AESBlockSize);           // see if at least 4 blocks left
    __ jcc(Assembler::less, L_singleBlock_loopTop_128);

    // NOTE(review): the webrev view elides original lines 3381-3397 here --
    // presumably the loads of the four cipher blocks into xmm_result0..3 and
    // the parallelized aesdec rounds; confirm against the full source file.

    // for each result, xor with the r vector of previous cipher block
    __ pxor(xmm_result0, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize));
    __ pxor(xmm_result1, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize));
    __ pxor(xmm_result2, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize));
    __ pxor(xmm_result3, xmm_prev_block_cipher);
    __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize));   // this will carry over to next set of blocks

    __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
    __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1);
    __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2);
    __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3);

    __ addptr(pos, 4*AESBlockSize);
    __ subptr(len_reg, 4*AESBlockSize);
    __ jmp(L_multiBlock_loopTop_128);

    // registers used in the non-parallelized loops
    // xmm register assignments for the loops below
    const XMMRegister xmm_result = xmm0;
    const XMMRegister xmm_prev_block_cipher_save = xmm2;
    // xmm_key11 (192/256 paths) and xmm_key12/xmm_temp deliberately alias:
    // xmm_key12 is only live in the 192 loop, xmm_temp only in the 256 loop
    const XMMRegister xmm_key11 = xmm3;
    const XMMRegister xmm_key12 = xmm4;
    const XMMRegister xmm_temp  = xmm4;

    __ align(OptoLoopAlignment);
    __ BIND(L_singleBlock_loopTop_128);
    __ cmpptr(len_reg, 0);           // any blocks left??
    __ jcc(Assembler::equal, L_exit);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    __ aesdeclast(xmm_result, xmm_key_last);
    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);              // set up next r vector with cipher input from this block

    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jmp(L_singleBlock_loopTop_128);


    __ BIND(L_exit);
    __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
#ifdef _WIN64
    // restore regs belonging to calling function
    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
      __ movdqu(as_XMMRegister(i), xmm_save(i));
    }
#endif
    __ movl(rax, 0); // return 0 (why?)
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(0);


    __ BIND(L_key_192_256);
    // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
    // xmm1 no longer holds the shuffle mask, so load_key uses the memory copy
    load_key(xmm_key11, key, 0xb0);
    __ cmpl(rax, 52);
    __ jcc(Assembler::notEqual, L_key_256);

    // 192-bit code follows here (could be optimized to use parallelism)
    load_key(xmm_key12, key, 0xc0);     // 192-bit key goes up to c0
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_singleBlock_loopTop_192);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    __ aesdec(xmm_result, xmm_key11);
    __ aesdec(xmm_result, xmm_key12);
    __ aesdeclast(xmm_result, xmm_key_last);                    // xmm15 always came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    // loop while len != 0 -- relies on the ZF set by subptr above
    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192);
    __ jmp(L_exit);

    __ BIND(L_key_256);
    // 256-bit code follows here (could be optimized to use parallelism)
    __ movptr(pos, 0);
    __ align(OptoLoopAlignment);

    __ BIND(L_singleBlock_loopTop_256);
    __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
    __ movdqa(xmm_prev_block_cipher_save, xmm_result);              // save for next r vector
    __ pxor  (xmm_result, xmm_key_first);               // do the aes dec rounds
    for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) {
      __ aesdec(xmm_result, as_XMMRegister(rnum));
    }
    __ aesdec(xmm_result, xmm_key11);
    // keys 0xc0-0xe0 don't fit in registers; reload per round (per block)
    load_key(xmm_temp, key, 0xc0);
    __ aesdec(xmm_result, xmm_temp);
    load_key(xmm_temp, key, 0xd0);
    __ aesdec(xmm_result, xmm_temp);
    load_key(xmm_temp, key, 0xe0);     // 256-bit key goes up to e0
    __ aesdec(xmm_result, xmm_temp);
    __ aesdeclast(xmm_result, xmm_key_last);          // xmm15 came from key+0
    __ pxor  (xmm_result, xmm_prev_block_cipher);               // xor with the current r vector
    __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);  // store into the next 16 bytes of output
    // no need to store r to memory until we exit
    __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save);  // set up next r vector with cipher input from this block
    __ addptr(pos, AESBlockSize);
    __ subptr(len_reg, AESBlockSize);
    __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256);
    __ jmp(L_exit);

    return start;
  }
3520 
3521 
3522 
3523 #undef __
3524 #define __ masm->
3525 
3526   // Continuation point for throwing of implicit exceptions that are
3527   // not handled in the current activation. Fabricates an exception
3528   // oop and initiates normal exception dispatching in this
3529   // frame. Since we need to preserve callee-saved values (currently
3530   // only for C2, but done for C1 as well) we need a callee-saved oop
3531   // map and therefore have to make these stubs into RuntimeStubs
3532   // rather than BufferBlobs.  If the compiler needs all registers to


src/cpu/x86/vm/stubGenerator_x86_64.cpp
Index Unified diffs Context diffs Sdiffs Wdiffs Patch New Old Previous File Next File