address generate_key_shuffle_mask() {
  __ align(16);
  StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
  address start = __ pc();
  __ emit_data64( 0x0405060700010203, relocInfo::none );
  __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
  return start;
}

// Utility routine for loading a 128-bit key word in little endian format
// can optionally specify that the shuffle mask is already in an xmmregister
void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
  __ movdqu(xmmdst, Address(key, offset));
  if (xmm_shuf_mask != NULL) {
    __ pshufb(xmmdst, xmm_shuf_mask);
  } else {
    __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
  }
}

// aesenc using specified key+offset
// can optionally specify that the shuffle mask is already in an xmmregister
void aes_enc_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
  load_key(xmmtmp, key, offset, xmm_shuf_mask);
  __ aesenc(xmmdst, xmmtmp);
}

// aesdec using specified key+offset
// can optionally specify that the shuffle mask is already in an xmmregister
void aes_dec_key(XMMRegister xmmdst, XMMRegister xmmtmp, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
  load_key(xmmtmp, key, offset, xmm_shuf_mask);
  __ aesdec(xmmdst, xmmtmp);
}


// Arguments:
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//
address generate_aescrypt_encryptBlock() {
  assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
  Label L_doLast;
  address start = __ pc();

  const Register from   = c_rarg0;  // source array address
  const Register to     = c_rarg1;  // destination array address
  const Register key    = c_rarg2;  // key array address
  const Register keylen = rax;

  const XMMRegister xmm_result        = xmm0;
  const XMMRegister xmm_temp          = xmm1;
  const XMMRegister xmm_key_shuf_mask = xmm2;

  __ enter();  // required for proper stackwalking of RuntimeStub frame

  __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
  // keylen = # of 32-bit words, convert to 128-bit words
  __ shrl(keylen, 2);
  __ subl(keylen, 11);  // every key has at least 11 128-bit words, some have more

  __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
  __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input

  // For encryption, the java expanded key ordering is just what we need
  // we don't know if the key is aligned, hence not using load-execute form

  load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
  __ pxor(xmm_result, xmm_temp);
  for (int offset = 0x10; offset <= 0x90; offset += 0x10) {
    aes_enc_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
  }
  load_key(xmm_temp, key, 0xa0, xmm_key_shuf_mask);
  __ cmpl(keylen, 0);
  __ jcc(Assembler::equal, L_doLast);
  __ aesenc(xmm_result, xmm_temp);  // only in 192 and 256 bit keys
  aes_enc_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
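  // Note: keylen was reduced above to (#32-bit words)/4 - 11, i.e. 0, 2 or 4
  // for AES-128/192/256, so the rounds from here on run only for the longer
  // keys; every path reaches L_doLast with the final round key in xmm_temp.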
  load_key(xmm_temp, key, 0xc0, xmm_key_shuf_mask);
  __ subl(keylen, 2);
  __ jcc(Assembler::equal, L_doLast);
  __ aesenc(xmm_result, xmm_temp);  // only in 256 bit keys
  aes_enc_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
  load_key(xmm_temp, key, 0xe0, xmm_key_shuf_mask);

  __ BIND(L_doLast);
  __ aesenclast(xmm_result, xmm_temp);
  __ movdqu(Address(to, 0), xmm_result);  // store the result
  __ xorptr(rax, rax);  // return 0
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//
address generate_aescrypt_decryptBlock() {
  assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support");
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
  Label L_doLast;
  address start = __ pc();

  const Register from   = c_rarg0;  // source array address
  const Register to     = c_rarg1;  // destination array address
  const Register key    = c_rarg2;  // key array address
  const Register keylen = rax;

  const XMMRegister xmm_result        = xmm0;
  const XMMRegister xmm_temp          = xmm1;
  const XMMRegister xmm_key_shuf_mask = xmm2;

  __ enter();  // required for proper stackwalking of RuntimeStub frame

  __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
  // keylen = # of 32-bit words, convert to 128-bit words
  __ shrl(keylen, 2);
  __ subl(keylen, 11);  // every key has at least 11 128-bit words, some have more

  __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
  __ movdqu(xmm_result, Address(from, 0));

  // for decryption java expanded key ordering is rotated one position from what we want
  // so we start from 0x10 here and hit 0x00 last
  // we don't know if the key is aligned, hence not using load-execute form
  load_key(xmm_temp, key, 0x10, xmm_key_shuf_mask);
  __ pxor(xmm_result, xmm_temp);
  for (int offset = 0x20; offset <= 0xa0; offset += 0x10) {
    aes_dec_key(xmm_result, xmm_temp, key, offset, xmm_key_shuf_mask);
  }
  __ cmpl(keylen, 0);
  __ jcc(Assembler::equal, L_doLast);
  // only in 192 and 256 bit keys
  aes_dec_key(xmm_result, xmm_temp, key, 0xb0, xmm_key_shuf_mask);
  aes_dec_key(xmm_result, xmm_temp, key, 0xc0, xmm_key_shuf_mask);
  __ subl(keylen, 2);
  __ jcc(Assembler::equal, L_doLast);
  // only in 256 bit keys
  aes_dec_key(xmm_result, xmm_temp, key, 0xd0, xmm_key_shuf_mask);
  aes_dec_key(xmm_result, xmm_temp, key, 0xe0, xmm_key_shuf_mask);

  __ BIND(L_doLast);
  // for decryption the aesdeclast operation is always on key+0x00
  load_key(xmm_temp, key, 0x00, xmm_key_shuf_mask);
  __ aesdeclast(xmm_result, xmm_temp);

  __ movdqu(Address(to, 0), xmm_result);  // store the result

  __ xorptr(rax, rax);  // return 0
  __ leave();  // required for proper stackwalking of RuntimeStub frame
  __ ret(0);

  return start;
}


// Arguments:
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//
c_rarg3 - r vector byte array address 3109 // c_rarg4 - input length 3110 // 3111 address generate_cipherBlockChaining_encryptAESCrypt() { 3112 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 3113 __ align(CodeEntryAlignment); 3114 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 3115 address start = __ pc(); 3116 3117 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; 3118 const Register from = c_rarg0; // source array address 3119 const Register to = c_rarg1; // destination array address 3120 const Register key = c_rarg2; // key array address 3121 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3122 // and left with the results of the last encryption block 3123 #ifndef _WIN64 3124 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3125 #else 3126 const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 3127 const Register len_reg = r10; // pick the first volatile windows register 3128 #endif 3129 const Register pos = rax; 3130 3131 // xmm register assignments for the loops below 3132 const XMMRegister xmm_result = xmm0; 3133 const XMMRegister xmm_temp = xmm1; 3134 // keys 0-10 preloaded into xmm2-xmm12 3135 const int XMM_REG_NUM_KEY_FIRST = 2; 3136 const int XMM_REG_NUM_KEY_LAST = 12; 3137 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3138 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_LAST); 3139 3140 __ enter(); // required for proper stackwalking of RuntimeStub frame 3141 3142 #ifdef _WIN64 3143 // on win64, fill len_reg from stack position 3144 __ movl(len_reg, len_mem); 3145 // save the xmm registers which must be preserved 6-12 3146 __ subptr(rsp, -rsp_after_call_off * wordSize); 3147 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 3148 __ movdqu(xmm_save(i), as_XMMRegister(i)); 3149 } 3150 #endif 3151 3152 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front 3153 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3154 // load up xmm regs 2 thru 12 with key 0x00 - 0xa0 3155 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 3156 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3157 offset += 0x10; 3158 } 3159 3160 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec 3161 3162 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3163 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3164 __ cmpl(rax, 44); 3165 __ jcc(Assembler::notEqual, L_key_192_256); 3166 3167 // 128 bit code follows here 3168 __ movptr(pos, 0); 3169 __ align(OptoLoopAlignment); 3170 __ BIND(L_loopTop_128); 3171 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3172 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3173 3174 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3175 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 3176 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3177 } 3178 __ aesenclast(xmm_result, xmm_key10); 3179 3180 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3181 // no need to store r to memory until we exit 3182 __ addptr(pos, 
AESBlockSize); 3183 __ subptr(len_reg, AESBlockSize); 3184 __ jcc(Assembler::notEqual, L_loopTop_128); 3185 3186 __ BIND(L_exit); 3187 __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object 3188 3189 #ifdef _WIN64 3190 // restore xmm regs belonging to calling function 3191 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 3192 __ movdqu(as_XMMRegister(i), xmm_save(i)); 3193 } 3194 #endif 3195 __ movl(rax, 0); // return 0 (why?) 3196 __ leave(); // required for proper stackwalking of RuntimeStub frame 3197 __ ret(0); 3198 3199 __ BIND(L_key_192_256); 3200 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 3201 __ cmpl(rax, 52); 3202 __ jcc(Assembler::notEqual, L_key_256); 3203 3204 // 192-bit code follows here (could be changed to use more xmm registers) 3205 __ movptr(pos, 0); 3206 __ align(OptoLoopAlignment); 3207 __ BIND(L_loopTop_192); 3208 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3209 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3210 3211 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3212 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 3213 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3214 } 3215 aes_enc_key(xmm_result, xmm_temp, key, 0xb0); 3216 load_key(xmm_temp, key, 0xc0); 3217 __ aesenclast(xmm_result, xmm_temp); 3218 3219 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3220 // no need to store r to memory until we exit 3221 __ addptr(pos, AESBlockSize); 3222 __ subptr(len_reg, AESBlockSize); 3223 __ jcc(Assembler::notEqual, L_loopTop_192); 3224 __ jmp(L_exit); 3225 3226 __ BIND(L_key_256); 3227 // 256-bit code follows here (could be changed to use more xmm registers) 3228 __ movptr(pos, 0); 3229 __ align(OptoLoopAlignment); 3230 __ BIND(L_loopTop_256); 3231 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3232 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3233 3234 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3235 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 3236 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3237 } 3238 aes_enc_key(xmm_result, xmm_temp, key, 0xb0); 3239 aes_enc_key(xmm_result, xmm_temp, key, 0xc0); 3240 aes_enc_key(xmm_result, xmm_temp, key, 0xd0); 3241 load_key(xmm_temp, key, 0xe0); 3242 __ aesenclast(xmm_result, xmm_temp); 3243 3244 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3245 // no need to store r to memory until we exit 3246 __ addptr(pos, AESBlockSize); 3247 __ subptr(len_reg, AESBlockSize); 3248 __ jcc(Assembler::notEqual, L_loopTop_256); 3249 __ jmp(L_exit); 3250 3251 return start; 3252 } 3253 3254 3255 3256 // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time 3257 // to hide instruction latency 3258 // 3259 // Arguments: 3260 // 3261 // Inputs: 3262 // c_rarg0 - source byte array address 3263 // c_rarg1 - destination byte array address 3264 // c_rarg2 - K (key) in little endian int array 3265 // c_rarg3 - r vector byte array address 3266 // c_rarg4 - input length 3267 // 3268 3269 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 3270 assert(UseAES && (UseAVX > 0), "need AES instructions and misaligned SSE support"); 3271 __ align(CodeEntryAlignment); 3272 StubCodeMark mark(this, "StubRoutines", 
"cipherBlockChaining_decryptAESCrypt"); 3273 address start = __ pc(); 3274 3275 Label L_exit, L_key_192_256, L_key_256; 3276 Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128; 3277 Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; 3278 const Register from = c_rarg0; // source array address 3279 const Register to = c_rarg1; // destination array address 3280 const Register key = c_rarg2; // key array address 3281 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3282 // and left with the results of the last encryption block 3283 #ifndef _WIN64 3284 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3285 #else 3286 const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 3287 const Register len_reg = r10; // pick the first volatile windows register 3288 #endif 3289 const Register pos = rax; 3290 3291 // xmm register assignments for the loops below 3292 const XMMRegister xmm_result = xmm0; 3293 // keys 0-10 preloaded into xmm2-xmm12 3294 const int XMM_REG_NUM_KEY_FIRST = 5; 3295 const int XMM_REG_NUM_KEY_LAST = 15; 3296 const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3297 const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); 3298 3299 __ enter(); // required for proper stackwalking of RuntimeStub frame 3300 3301 #ifdef _WIN64 3302 // on win64, fill len_reg from stack position 3303 __ movl(len_reg, len_mem); 3304 // save the xmm registers which must be preserved 6-15 3305 __ subptr(rsp, -rsp_after_call_off * wordSize); 3306 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 3307 __ movdqu(xmm_save(i), as_XMMRegister(i)); 3308 } 3309 #endif 3310 // the java expanded key ordering is rotated one position from what we want 3311 // so we start from 0x10 here and hit 0x00 last 3312 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front 3313 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3314 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 3315 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum <= XMM_REG_NUM_KEY_LAST; rnum++) { 3316 if (rnum == XMM_REG_NUM_KEY_LAST) offset = 0x00; 3317 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3318 offset += 0x10; 3319 } 3320 3321 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block 3322 // registers holding the four results in the parallelized loop 3323 const XMMRegister xmm_result0 = xmm0; 3324 const XMMRegister xmm_result1 = xmm2; 3325 const XMMRegister xmm_result2 = xmm3; 3326 const XMMRegister xmm_result3 = xmm4; 3327 3328 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec 3329 3330 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3331 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3332 __ cmpl(rax, 44); 3333 __ jcc(Assembler::notEqual, L_key_192_256); 3334 3335 3336 // 128-bit code follows here, parallelized 3337 __ movptr(pos, 0); 3338 __ align(OptoLoopAlignment); 3339 __ BIND(L_multiBlock_loopTop_128); 3340 __ cmpptr(len_reg, 4*AESBlockSize); // see if at least 4 blocks left 3341 __ jcc(Assembler::less, L_singleBlock_loopTop_128); 3359 // for each result, xor with the r vector of previous cipher block 3360 __ pxor(xmm_result0, xmm_prev_block_cipher); 3361 __ movdqu(xmm_prev_block_cipher, Address(from, pos, 
Address::times_1, 0*AESBlockSize)); 3362 __ pxor(xmm_result1, xmm_prev_block_cipher); 3363 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize)); 3364 __ pxor(xmm_result2, xmm_prev_block_cipher); 3365 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize)); 3366 __ pxor(xmm_result3, xmm_prev_block_cipher); 3367 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize)); // this will carry over to next set of blocks 3368 3369 __ movdqu(Address(to, pos, Address::times_1, 0*AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output 3370 __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1); 3371 __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2); 3372 __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3); 3373 3374 __ addptr(pos, 4*AESBlockSize); 3375 __ subptr(len_reg, 4*AESBlockSize); 3376 __ jmp(L_multiBlock_loopTop_128); 3377 3378 // registers used in the non-parallelized loops 3379 const XMMRegister xmm_prev_block_cipher_save = xmm2; 3380 const XMMRegister xmm_temp = xmm3; 3381 3382 __ align(OptoLoopAlignment); 3383 __ BIND(L_singleBlock_loopTop_128); 3384 __ cmpptr(len_reg, 0); // any blocks left?? 3385 __ jcc(Assembler::equal, L_exit); 3386 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 3387 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 3388 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 3389 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 3390 __ aesdec(xmm_result, as_XMMRegister(rnum)); 3391 } 3392 __ aesdeclast(xmm_result, xmm_key_last); 3393 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3394 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3395 // no need to store r to memory until we exit 3396 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3397 3398 __ addptr(pos, AESBlockSize); 3399 __ subptr(len_reg, AESBlockSize); 3400 __ jmp(L_singleBlock_loopTop_128); 3401 3402 3403 __ BIND(L_exit); 3404 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object 3405 #ifdef _WIN64 3406 // restore regs belonging to calling function 3407 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 3408 __ movdqu(as_XMMRegister(i), xmm_save(i)); 3409 } 3410 #endif 3411 __ movl(rax, 0); // return 0 (why?) 
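  // (rax doubled as 'pos' in the loops above, so it is cleared here to
  // provide the stub's integer return value)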
3412 __ leave(); // required for proper stackwalking of RuntimeStub frame 3413 __ ret(0); 3414 3415 3416 __ BIND(L_key_192_256); 3417 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 3418 __ cmpl(rax, 52); 3419 __ jcc(Assembler::notEqual, L_key_256); 3420 3421 // 192-bit code follows here (could be optimized to use parallelism) 3422 __ movptr(pos, 0); 3423 __ align(OptoLoopAlignment); 3424 __ BIND(L_singleBlock_loopTop_192); 3425 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 3426 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 3427 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 3428 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 3429 __ aesdec(xmm_result, as_XMMRegister(rnum)); 3430 } 3431 aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 192-bit key goes up to c0 3432 aes_dec_key(xmm_result, xmm_temp, key, 0xc0); 3433 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 3434 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3435 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3436 // no need to store r to memory until we exit 3437 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3438 3439 __ addptr(pos, AESBlockSize); 3440 __ subptr(len_reg, AESBlockSize); 3441 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); 3442 __ jmp(L_exit); 3443 3444 __ BIND(L_key_256); 3445 // 256-bit code follows here (could be optimized to use parallelism) 3446 __ movptr(pos, 0); 3447 __ align(OptoLoopAlignment); 3448 __ BIND(L_singleBlock_loopTop_256); 3449 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 3450 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 3451 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 3452 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 3453 __ aesdec(xmm_result, as_XMMRegister(rnum)); 3454 } 3455 aes_dec_key(xmm_result, xmm_temp, key, 0xb0); // 256-bit key goes up to e0 3456 aes_dec_key(xmm_result, xmm_temp, key, 0xc0); 3457 aes_dec_key(xmm_result, xmm_temp, key, 0xd0); 3458 aes_dec_key(xmm_result, xmm_temp, key, 0xe0); 3459 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 3460 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3461 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3462 // no need to store r to memory until we exit 3463 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3464 3465 __ addptr(pos, AESBlockSize); 3466 __ subptr(len_reg, AESBlockSize); 3467 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); 3468 __ jmp(L_exit); 3469 3470 return start; 3471 } 3472 3473 3474 3475 #undef __ 3476 #define __ masm-> 3477 3478 // Continuation point for throwing of implicit exceptions that are 3479 // not handled in the current activation. Fabricates an exception 3480 // oop and initiates normal exception dispatching in this 3481 // frame. 
// Since we need to preserve callee-saved values (currently
// only for C2, but done for C1 as well) we need a callee-saved oop
// map and therefore have to make these stubs into RuntimeStubs
// rather than BufferBlobs.  If the compiler needs all registers to

address generate_key_shuffle_mask() {
  __ align(16);
  StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
  address start = __ pc();
  __ emit_data64( 0x0405060700010203, relocInfo::none );
  __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
  return start;
}

// Utility routine for loading a 128-bit key word in little endian format
// can optionally specify that the shuffle mask is already in an xmmregister
void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
  __ movdqu(xmmdst, Address(key, offset));
  if (xmm_shuf_mask != NULL) {
    __ pshufb(xmmdst, xmm_shuf_mask);
  } else {
    __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
  }
}

// Arguments:
//
// Inputs:
//   c_rarg0   - source byte array address
//   c_rarg1   - destination byte array address
//   c_rarg2   - K (key) in little endian int array
//
address generate_aescrypt_encryptBlock() {
  assert(UseAES, "need AES instructions and misaligned SSE support");
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
  Label L_doLast;
  address start = __ pc();

  const Register from   = c_rarg0;  // source array address
  const Register to     = c_rarg1;  // destination array address
  const Register key    = c_rarg2;  // key array address
  const Register keylen = rax;

  const XMMRegister xmm_result        = xmm0;
  const XMMRegister xmm_key_shuf_mask = xmm1;
  // On win64 xmm6-xmm15 must be preserved so don't use them.
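  // (the Win64 ABI treats xmm6-xmm15 as callee-saved, while the SysV AMD64
  // ABI has no callee-saved xmm registers, so xmm0-xmm5 are safe scratch
  // registers on both platforms)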
2978 const XMMRegister xmm_temp1 = xmm2; 2979 const XMMRegister xmm_temp2 = xmm3; 2980 const XMMRegister xmm_temp3 = xmm4; 2981 const XMMRegister xmm_temp4 = xmm5; 2982 2983 __ enter(); // required for proper stackwalking of RuntimeStub frame 2984 2985 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} 2986 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2987 2988 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 2989 __ movdqu(xmm_result, Address(from, 0)); // get 16 bytes of input 2990 2991 // For encryption, the java expanded key ordering is just what we need 2992 // we don't know if the key is aligned, hence not using load-execute form 2993 2994 load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask); 2995 __ pxor(xmm_result, xmm_temp1); 2996 2997 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); 2998 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); 2999 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); 3000 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); 3001 3002 __ aesenc(xmm_result, xmm_temp1); 3003 __ aesenc(xmm_result, xmm_temp2); 3004 __ aesenc(xmm_result, xmm_temp3); 3005 __ aesenc(xmm_result, xmm_temp4); 3006 3007 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); 3008 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); 3009 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); 3010 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); 3011 3012 __ aesenc(xmm_result, xmm_temp1); 3013 __ aesenc(xmm_result, xmm_temp2); 3014 __ aesenc(xmm_result, xmm_temp3); 3015 __ aesenc(xmm_result, xmm_temp4); 3016 3017 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); 3018 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); 3019 3020 __ cmpl(keylen, 44); 3021 __ jccb(Assembler::equal, L_doLast); 3022 3023 __ aesenc(xmm_result, xmm_temp1); 3024 __ aesenc(xmm_result, xmm_temp2); 3025 3026 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); 3027 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); 3028 3029 __ cmpl(keylen, 52); 3030 __ jccb(Assembler::equal, L_doLast); 3031 3032 __ aesenc(xmm_result, xmm_temp1); 3033 __ aesenc(xmm_result, xmm_temp2); 3034 3035 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); 3036 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); 3037 3038 __ BIND(L_doLast); 3039 __ aesenc(xmm_result, xmm_temp1); 3040 __ aesenclast(xmm_result, xmm_temp2); 3041 __ movdqu(Address(to, 0), xmm_result); // store the result 3042 __ xorptr(rax, rax); // return 0 3043 __ leave(); // required for proper stackwalking of RuntimeStub frame 3044 __ ret(0); 3045 3046 return start; 3047 } 3048 3049 3050 // Arguments: 3051 // 3052 // Inputs: 3053 // c_rarg0 - source byte array address 3054 // c_rarg1 - destination byte array address 3055 // c_rarg2 - K (key) in little endian int array 3056 // 3057 address generate_aescrypt_decryptBlock() { 3058 assert(UseAES, "need AES instructions and misaligned SSE support"); 3059 __ align(CodeEntryAlignment); 3060 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 3061 Label L_doLast; 3062 address start = __ pc(); 3063 3064 const Register from = c_rarg0; // source array address 3065 const Register to = c_rarg1; // destination array address 3066 const Register key = c_rarg2; // key array address 3067 const Register keylen = rax; 3068 3069 const XMMRegister xmm_result = xmm0; 3070 const XMMRegister xmm_key_shuf_mask = xmm1; 3071 // On win64 xmm6-xmm15 must be preserved so don't use them. 
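  // (the four temps below let each group of round keys be loaded up front
  // and then fed to aesdec back-to-back)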
3072 const XMMRegister xmm_temp1 = xmm2; 3073 const XMMRegister xmm_temp2 = xmm3; 3074 const XMMRegister xmm_temp3 = xmm4; 3075 const XMMRegister xmm_temp4 = xmm5; 3076 3077 __ enter(); // required for proper stackwalking of RuntimeStub frame 3078 3079 // keylen could be only {11, 13, 15} * 4 = {44, 52, 60} 3080 __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3081 3082 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3083 __ movdqu(xmm_result, Address(from, 0)); 3084 3085 // for decryption java expanded key ordering is rotated one position from what we want 3086 // so we start from 0x10 here and hit 0x00 last 3087 // we don't know if the key is aligned, hence not using load-execute form 3088 load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask); 3089 load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask); 3090 load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask); 3091 load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask); 3092 3093 __ pxor (xmm_result, xmm_temp1); 3094 __ aesdec(xmm_result, xmm_temp2); 3095 __ aesdec(xmm_result, xmm_temp3); 3096 __ aesdec(xmm_result, xmm_temp4); 3097 3098 load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask); 3099 load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask); 3100 load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask); 3101 load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask); 3102 3103 __ aesdec(xmm_result, xmm_temp1); 3104 __ aesdec(xmm_result, xmm_temp2); 3105 __ aesdec(xmm_result, xmm_temp3); 3106 __ aesdec(xmm_result, xmm_temp4); 3107 3108 load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask); 3109 load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask); 3110 load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask); 3111 3112 __ cmpl(keylen, 44); 3113 __ jccb(Assembler::equal, L_doLast); 3114 3115 __ aesdec(xmm_result, xmm_temp1); 3116 __ aesdec(xmm_result, xmm_temp2); 3117 3118 load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask); 3119 load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask); 3120 3121 __ cmpl(keylen, 52); 3122 __ jccb(Assembler::equal, L_doLast); 3123 3124 __ aesdec(xmm_result, xmm_temp1); 3125 __ aesdec(xmm_result, xmm_temp2); 3126 3127 load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask); 3128 load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask); 3129 3130 __ BIND(L_doLast); 3131 __ aesdec(xmm_result, xmm_temp1); 3132 __ aesdec(xmm_result, xmm_temp2); 3133 3134 // for decryption the aesdeclast operation is always on key+0x00 3135 __ aesdeclast(xmm_result, xmm_temp3); 3136 __ movdqu(Address(to, 0), xmm_result); // store the result 3137 __ xorptr(rax, rax); // return 0 3138 __ leave(); // required for proper stackwalking of RuntimeStub frame 3139 __ ret(0); 3140 3141 return start; 3142 } 3143 3144 3145 // Arguments: 3146 // 3147 // Inputs: 3148 // c_rarg0 - source byte array address 3149 // c_rarg1 - destination byte array address 3150 // c_rarg2 - K (key) in little endian int array 3151 // c_rarg3 - r vector byte array address 3152 // c_rarg4 - input length 3153 // 3154 address generate_cipherBlockChaining_encryptAESCrypt() { 3155 assert(UseAES, "need AES instructions and misaligned SSE support"); 3156 __ align(CodeEntryAlignment); 3157 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 3158 address start = __ pc(); 3159 3160 Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256; 3161 const Register from = c_rarg0; // source array address 3162 const Register to = c_rarg1; // destination array address 3163 const Register 
key = c_rarg2; // key array address 3164 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 3165 // and left with the results of the last encryption block 3166 #ifndef _WIN64 3167 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3168 #else 3169 const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 3170 const Register len_reg = r10; // pick the first volatile windows register 3171 #endif 3172 const Register pos = rax; 3173 3174 // xmm register assignments for the loops below 3175 const XMMRegister xmm_result = xmm0; 3176 const XMMRegister xmm_temp = xmm1; 3177 // keys 0-10 preloaded into xmm2-xmm12 3178 const int XMM_REG_NUM_KEY_FIRST = 2; 3179 const int XMM_REG_NUM_KEY_LAST = 15; 3180 const XMMRegister xmm_key0 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3181 const XMMRegister xmm_key10 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10); 3182 const XMMRegister xmm_key11 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11); 3183 const XMMRegister xmm_key12 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12); 3184 const XMMRegister xmm_key13 = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13); 3185 3186 __ enter(); // required for proper stackwalking of RuntimeStub frame 3187 3188 #ifdef _WIN64 3189 // on win64, fill len_reg from stack position 3190 __ movl(len_reg, len_mem); 3191 // save the xmm registers which must be preserved 6-15 3192 __ subptr(rsp, -rsp_after_call_off * wordSize); 3193 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 3194 __ movdqu(xmm_save(i), as_XMMRegister(i)); 3195 } 3196 #endif 3197 3198 const XMMRegister xmm_key_shuf_mask = xmm_temp; // used temporarily to swap key bytes up front 3199 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3200 // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0 3201 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) { 3202 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3203 offset += 0x10; 3204 } 3205 __ movdqu(xmm_result, Address(rvec, 0x00)); // initialize xmm_result with r vec 3206 3207 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3208 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3209 __ cmpl(rax, 44); 3210 __ jcc(Assembler::notEqual, L_key_192_256); 3211 3212 // 128 bit code follows here 3213 __ movptr(pos, 0); 3214 __ align(OptoLoopAlignment); 3215 3216 __ BIND(L_loopTop_128); 3217 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3218 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3219 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3220 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) { 3221 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3222 } 3223 __ aesenclast(xmm_result, xmm_key10); 3224 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3225 // no need to store r to memory until we exit 3226 __ addptr(pos, AESBlockSize); 3227 __ subptr(len_reg, AESBlockSize); 3228 __ jcc(Assembler::notEqual, L_loopTop_128); 3229 3230 __ BIND(L_exit); 3231 __ movdqu(Address(rvec, 0), xmm_result); // final value of r stored in rvec of CipherBlockChaining object 3232 3233 #ifdef _WIN64 3234 // restore xmm regs belonging to calling function 3235 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 3236 __ 
movdqu(as_XMMRegister(i), xmm_save(i)); 3237 } 3238 #endif 3239 __ movl(rax, 0); // return 0 (why?) 3240 __ leave(); // required for proper stackwalking of RuntimeStub frame 3241 __ ret(0); 3242 3243 __ BIND(L_key_192_256); 3244 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 3245 load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask); 3246 load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask); 3247 __ cmpl(rax, 52); 3248 __ jcc(Assembler::notEqual, L_key_256); 3249 3250 // 192-bit code follows here (could be changed to use more xmm registers) 3251 __ movptr(pos, 0); 3252 __ align(OptoLoopAlignment); 3253 3254 __ BIND(L_loopTop_192); 3255 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3256 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3257 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3258 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) { 3259 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3260 } 3261 __ aesenclast(xmm_result, xmm_key12); 3262 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3263 // no need to store r to memory until we exit 3264 __ addptr(pos, AESBlockSize); 3265 __ subptr(len_reg, AESBlockSize); 3266 __ jcc(Assembler::notEqual, L_loopTop_192); 3267 __ jmp(L_exit); 3268 3269 __ BIND(L_key_256); 3270 // 256-bit code follows here (could be changed to use more xmm registers) 3271 load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask); 3272 __ movptr(pos, 0); 3273 __ align(OptoLoopAlignment); 3274 3275 __ BIND(L_loopTop_256); 3276 __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of input 3277 __ pxor (xmm_result, xmm_temp); // xor with the current r vector 3278 __ pxor (xmm_result, xmm_key0); // do the aes rounds 3279 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) { 3280 __ aesenc(xmm_result, as_XMMRegister(rnum)); 3281 } 3282 load_key(xmm_temp, key, 0xe0); 3283 __ aesenclast(xmm_result, xmm_temp); 3284 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3285 // no need to store r to memory until we exit 3286 __ addptr(pos, AESBlockSize); 3287 __ subptr(len_reg, AESBlockSize); 3288 __ jcc(Assembler::notEqual, L_loopTop_256); 3289 __ jmp(L_exit); 3290 3291 return start; 3292 } 3293 3294 3295 3296 // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time 3297 // to hide instruction latency 3298 // 3299 // Arguments: 3300 // 3301 // Inputs: 3302 // c_rarg0 - source byte array address 3303 // c_rarg1 - destination byte array address 3304 // c_rarg2 - K (key) in little endian int array 3305 // c_rarg3 - r vector byte array address 3306 // c_rarg4 - input length 3307 // 3308 3309 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() { 3310 assert(UseAES, "need AES instructions and misaligned SSE support"); 3311 __ align(CodeEntryAlignment); 3312 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 3313 address start = __ pc(); 3314 3315 Label L_exit, L_key_192_256, L_key_256; 3316 Label L_singleBlock_loopTop_128, L_multiBlock_loopTop_128; 3317 Label L_singleBlock_loopTop_192, L_singleBlock_loopTop_256; 3318 const Register from = c_rarg0; // source array address 3319 const Register to = c_rarg1; // destination array address 3320 const Register key = c_rarg2; // key array address 3321 const Register rvec = c_rarg3; // r byte array 
initialized from initvector array address 3322 // and left with the results of the last encryption block 3323 #ifndef _WIN64 3324 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 3325 #else 3326 const Address len_mem(rsp, 6 * wordSize); // length is on stack on Win64 3327 const Register len_reg = r10; // pick the first volatile windows register 3328 #endif 3329 const Register pos = rax; 3330 3331 // keys 0-10 preloaded into xmm2-xmm12 3332 const int XMM_REG_NUM_KEY_FIRST = 5; 3333 const int XMM_REG_NUM_KEY_LAST = 15; 3334 const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST); 3335 const XMMRegister xmm_key_last = as_XMMRegister(XMM_REG_NUM_KEY_LAST); 3336 3337 __ enter(); // required for proper stackwalking of RuntimeStub frame 3338 3339 #ifdef _WIN64 3340 // on win64, fill len_reg from stack position 3341 __ movl(len_reg, len_mem); 3342 // save the xmm registers which must be preserved 6-15 3343 __ subptr(rsp, -rsp_after_call_off * wordSize); 3344 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 3345 __ movdqu(xmm_save(i), as_XMMRegister(i)); 3346 } 3347 #endif 3348 // the java expanded key ordering is rotated one position from what we want 3349 // so we start from 0x10 here and hit 0x00 last 3350 const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front 3351 __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr())); 3352 // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00 3353 for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) { 3354 load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask); 3355 offset += 0x10; 3356 } 3357 load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask); 3358 3359 const XMMRegister xmm_prev_block_cipher = xmm1; // holds cipher of previous block 3360 3361 // registers holding the four results in the parallelized loop 3362 const XMMRegister xmm_result0 = xmm0; 3363 const XMMRegister xmm_result1 = xmm2; 3364 const XMMRegister xmm_result2 = xmm3; 3365 const XMMRegister xmm_result3 = xmm4; 3366 3367 __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00)); // initialize with initial rvec 3368 3369 // now split to different paths depending on the keylen (len in ints of AESCrypt.KLE array (52=192, or 60=256)) 3370 __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3371 __ cmpl(rax, 44); 3372 __ jcc(Assembler::notEqual, L_key_192_256); 3373 3374 3375 // 128-bit code follows here, parallelized 3376 __ movptr(pos, 0); 3377 __ align(OptoLoopAlignment); 3378 __ BIND(L_multiBlock_loopTop_128); 3379 __ cmpptr(len_reg, 4*AESBlockSize); // see if at least 4 blocks left 3380 __ jcc(Assembler::less, L_singleBlock_loopTop_128); 3398 // for each result, xor with the r vector of previous cipher block 3399 __ pxor(xmm_result0, xmm_prev_block_cipher); 3400 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0*AESBlockSize)); 3401 __ pxor(xmm_result1, xmm_prev_block_cipher); 3402 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1*AESBlockSize)); 3403 __ pxor(xmm_result2, xmm_prev_block_cipher); 3404 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2*AESBlockSize)); 3405 __ pxor(xmm_result3, xmm_prev_block_cipher); 3406 __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3*AESBlockSize)); // this will carry over to next set of blocks 3407 3408 __ movdqu(Address(to, pos, Address::times_1, 
0*AESBlockSize), xmm_result0); // store 4 results into the next 64 bytes of output 3409 __ movdqu(Address(to, pos, Address::times_1, 1*AESBlockSize), xmm_result1); 3410 __ movdqu(Address(to, pos, Address::times_1, 2*AESBlockSize), xmm_result2); 3411 __ movdqu(Address(to, pos, Address::times_1, 3*AESBlockSize), xmm_result3); 3412 3413 __ addptr(pos, 4*AESBlockSize); 3414 __ subptr(len_reg, 4*AESBlockSize); 3415 __ jmp(L_multiBlock_loopTop_128); 3416 3417 // registers used in the non-parallelized loops 3418 // xmm register assignments for the loops below 3419 const XMMRegister xmm_result = xmm0; 3420 const XMMRegister xmm_prev_block_cipher_save = xmm2; 3421 const XMMRegister xmm_key11 = xmm3; 3422 const XMMRegister xmm_key12 = xmm4; 3423 const XMMRegister xmm_temp = xmm4; 3424 3425 __ align(OptoLoopAlignment); 3426 __ BIND(L_singleBlock_loopTop_128); 3427 __ cmpptr(len_reg, 0); // any blocks left?? 3428 __ jcc(Assembler::equal, L_exit); 3429 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 3430 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 3431 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 3432 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 3433 __ aesdec(xmm_result, as_XMMRegister(rnum)); 3434 } 3435 __ aesdeclast(xmm_result, xmm_key_last); 3436 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3437 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3438 // no need to store r to memory until we exit 3439 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3440 3441 __ addptr(pos, AESBlockSize); 3442 __ subptr(len_reg, AESBlockSize); 3443 __ jmp(L_singleBlock_loopTop_128); 3444 3445 3446 __ BIND(L_exit); 3447 __ movdqu(Address(rvec, 0), xmm_prev_block_cipher); // final value of r stored in rvec of CipherBlockChaining object 3448 #ifdef _WIN64 3449 // restore regs belonging to calling function 3450 for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) { 3451 __ movdqu(as_XMMRegister(i), xmm_save(i)); 3452 } 3453 #endif 3454 __ movl(rax, 0); // return 0 (why?) 
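  // (likewise, rax served as 'pos' in the loops above, hence the explicit
  // clear before returning)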
3455 __ leave(); // required for proper stackwalking of RuntimeStub frame 3456 __ ret(0); 3457 3458 3459 __ BIND(L_key_192_256); 3460 // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256) 3461 load_key(xmm_key11, key, 0xb0); 3462 __ cmpl(rax, 52); 3463 __ jcc(Assembler::notEqual, L_key_256); 3464 3465 // 192-bit code follows here (could be optimized to use parallelism) 3466 load_key(xmm_key12, key, 0xc0); // 192-bit key goes up to c0 3467 __ movptr(pos, 0); 3468 __ align(OptoLoopAlignment); 3469 3470 __ BIND(L_singleBlock_loopTop_192); 3471 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 3472 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 3473 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 3474 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 3475 __ aesdec(xmm_result, as_XMMRegister(rnum)); 3476 } 3477 __ aesdec(xmm_result, xmm_key11); 3478 __ aesdec(xmm_result, xmm_key12); 3479 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0 3480 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3481 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3482 // no need to store r to memory until we exit 3483 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3484 __ addptr(pos, AESBlockSize); 3485 __ subptr(len_reg, AESBlockSize); 3486 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_192); 3487 __ jmp(L_exit); 3488 3489 __ BIND(L_key_256); 3490 // 256-bit code follows here (could be optimized to use parallelism) 3491 __ movptr(pos, 0); 3492 __ align(OptoLoopAlignment); 3493 3494 __ BIND(L_singleBlock_loopTop_256); 3495 __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input 3496 __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector 3497 __ pxor (xmm_result, xmm_key_first); // do the aes dec rounds 3498 for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_LAST - 1; rnum++) { 3499 __ aesdec(xmm_result, as_XMMRegister(rnum)); 3500 } 3501 __ aesdec(xmm_result, xmm_key11); 3502 load_key(xmm_temp, key, 0xc0); 3503 __ aesdec(xmm_result, xmm_temp); 3504 load_key(xmm_temp, key, 0xd0); 3505 __ aesdec(xmm_result, xmm_temp); 3506 load_key(xmm_temp, key, 0xe0); // 256-bit key goes up to e0 3507 __ aesdec(xmm_result, xmm_temp); 3508 __ aesdeclast(xmm_result, xmm_key_last); // xmm15 came from key+0 3509 __ pxor (xmm_result, xmm_prev_block_cipher); // xor with the current r vector 3510 __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output 3511 // no need to store r to memory until we exit 3512 __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block 3513 __ addptr(pos, AESBlockSize); 3514 __ subptr(len_reg, AESBlockSize); 3515 __ jcc(Assembler::notEqual,L_singleBlock_loopTop_256); 3516 __ jmp(L_exit); 3517 3518 return start; 3519 } 3520 3521 3522 3523 #undef __ 3524 #define __ masm-> 3525 3526 // Continuation point for throwing of implicit exceptions that are 3527 // not handled in the current activation. Fabricates an exception 3528 // oop and initiates normal exception dispatching in this 3529 // frame. 
Since we need to preserve callee-saved values (currently 3530 // only for C2, but done for C1 as well) we need a callee-saved oop 3531 // map and therefore have to make these stubs into RuntimeStubs 3532 // rather than BufferBlobs. If the compiler needs all registers to |