1 /* 2 * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #ifdef COMPILE_CRYPTO 26 27 // The Rijndael S-box and inverted S-box are embedded here for a faster access. 28 // 29 // Note about lookup tables (T1...T4 and T5..T8): 30 // The tables (boxes) combine ahead-of-time precalculated transposition and mixing steps as 31 // an alternative to a runtime calculation. 32 // The tables are statically generated in com/sun/crypto/provider/AESCrypt class. 33 // Only the first table reference is passed to AES methods below. The other 3 tables 34 // in ecryption and decryption are calculated in runtime by rotating the T1 result accordingly. 35 // It is a free operation on ARM with embedded register-shifted-register EOR capability. 36 // The table reference is passed in a form of a last argument on the parametes list. 
// The table-lookup method proves to perform better than a runtime Galois Field calculation,
// due to a lack of HW acceleration for the latter.

// Global references to the forward (SBox) and inverse (SInvBox) Rijndael S-boxes.
// They are consumed by the scalar "last round" lookups in the stubs below via mov_slow.
unsigned char * SBox;
unsigned char * SInvBox;

// One-time initialization: publish the statically defined forward and inverse
// Rijndael S-box tables through the global SBox / SInvBox pointers.
// NOTE(review): the (unsigned char*) casts strip const from the static tables;
// the generated stubs only ever read through these pointers — confirm no writer exists.
void aes_init() {

  // Inverse S-box (Si) — used by the decryption stubs' final round.
  const static unsigned char Si[256] =
  {
    0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38,
    0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB,
    0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
    0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
    0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D,
    0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
    0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2,
    0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25,
    0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16,
    0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92,
    0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA,
    0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
    0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A,
    0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06,
    0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02,
    0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B,
    0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA,
    0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
    0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85,
    0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E,
    0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89,
    0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B,
    0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20,
    0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
    0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31,
    0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F,
    0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D,
    0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF,
    0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0,
    0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
    0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26,
    0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D
  };

  // Forward S-box (S) — used by the encryption stub's final round.
  static const unsigned char S[256]={
    0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
    0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
    0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
    0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
    0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC,
    0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
    0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A,
    0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
    0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
    0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
    0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B,
    0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
    0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85,
    0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
    0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
    0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
    0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17,
    0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
    0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88,
    0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
    0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
    0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
    0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9,
    0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
    0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
    0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
    0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
    0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
    0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94,
    0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
    0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68,
    0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
  };

  SBox = (unsigned char*)S;
  SInvBox = (unsigned char*)Si;
}

// Generates the stub that encrypts one 16-byte AES block using the
// precalculated transposition-box (T1) lookup scheme described above.
address generate_aescrypt_encryptBlock() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "aesencryptBlock");

  address start = __ pc();

  // Register from = R0; // source byte array
  // Register to = R1; // destination byte array
  // Register key = R2;
//                     expanded key array
  // Register tbox = R3; // transposition box reference

  __ push (RegisterSet(R4, R12) | LR);
  __ fstmdbd(SP, FloatRegisterSet(D0, 4), writeback);
  __ sub(SP, SP, 32);

  // preserve TBox references
  __ add(R3, R3, arrayOopDesc::base_offset_in_bytes(T_INT));
  __ str(R3, Address(SP, 16));

  // retrieve key length. The length is used to determine the number of subsequent rounds (10, 12 or 14)
  __ ldr(R9, Address(R2, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  // Initial round: load the four 32-bit state words (big-endian via rev)
  // and EOR each with the first four words of the expanded key (R2 advances).
  __ ldr(R5, Address(R0));
  __ ldr(R10, Address(R2, 4, post_indexed));
  __ rev(R5, R5);
  __ eor(R5, R5, R10);
  __ ldr(R6, Address(R0, 4));
  __ ldr(R10, Address(R2, 4, post_indexed));
  __ rev(R6, R6);
  __ eor(R6, R6, R10);
  __ ldr(R7, Address(R0, 8));
  __ ldr(R10, Address(R2, 4, post_indexed));
  __ rev(R7, R7);
  __ eor(R7, R7, R10);
  __ ldr(R8, Address(R0, 12));
  __ ldr(R10, Address(R2, 4, post_indexed));
  __ rev(R8, R8);
  __ eor(R8, R8, R10);

  // Store the key size; However before doing that adjust the key to compensate for the Initial and Last rounds
  __ sub(R9, R9, 8);
  __ fmsr(S7, R1);   // park the output pointer in a VFP register to free R1

  // load first transposition box (T1)
  __ ldr(R0, Address(SP, 16));

  __ mov(LR, R2);    // LR walks the remaining round keys

  Label round;

  __ bind(round);

  // Utilize a Transposition Box lookup along with subsequent shift and EOR with a round key.
  // Instruction ordering is rearranged to minimize Read-After-Write dependency. Not that important on A15 target
  // with register renaming but performs ~10% better on A9.
  // Each quarter below computes one output column from bytes
  // (sN >> 24, sN+1[23:16], sN+2[15:8], sN+3[7:0]) via T1 lookups rotated by 0/8/16/24.
  __ mov(R12, AsmOperand(R5, lsr, 24));
  __ ubfx(R4, R6, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R7, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R8);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R10, R1, R12);   // new state word 0 (held in R10 until loop tail)

  __ mov(R12, AsmOperand(R6, lsr, 24));
  __ ubfx(R4, R7, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R8, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R5);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R11, R1, R12);   // new state word 1

  __ mov(R12, AsmOperand(R7, lsr, 24));
  __ ubfx(R4, R8, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R5, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R6);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R3, R1, R12);
  __ str(R3, Address(SP, 0));   // new state word 2 spilled (no spare register)

  __ mov(R12, AsmOperand(R8, lsr, 24));
  __ ubfx(R4, R5, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R6, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R7);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R8, R1, R12);   // new state word 3, written in place

  // update round count
  __ subs(R9, R9, 4);

  // commit the new state (word 2 comes back from the spill slot)
  __ mov(R5, R10);
  __ mov(R6, R11);
  __ ldr(R7, Address(SP, 0));

  __ b(round, gt);

  // last round - a special case, no MixColumn: plain S-box byte lookups only
  __ mov_slow(R10, (int)SBox);

  // output buffer pointer
  __ fmrs(R9, S7);

  // Assemble each output word from four S-box bytes, add the round key,
  // then rev back to little-endian before storing.
  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R5, lsr, 24));
  __ ubfx(R12, R6, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R7, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R8);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);
  __ str(R0, Address(R9, 4, post_indexed));

  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R6, lsr, 24));
  __ ubfx(R12, R7, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R8, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R5);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);

  __ str(R0, Address(R9, 4, post_indexed));
  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R7, lsr, 24));
  __ ubfx(R12, R8, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R5, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R6);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);

  __ str(R0, Address(R9, 4, post_indexed));
  __ ldr(R11, Address(LR));   // final key word: no post-increment needed
  __ ldrb(R0, Address(R10, R8, lsr, 24));
  __ ubfx(R12, R5, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R6, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R7);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);

  __ str(R0, Address(R9));

  // epilogue: release local frame and restore callee-saved registers
  __ add(SP, SP, 32);
  __ fldmiad(SP, FloatRegisterSet(D0, 4), writeback);;

  __ pop(RegisterSet(R4, R12) | PC);
  return start;
}

// Generates the stub that decrypts one 16-byte AES block via the inverse
// transposition-box (T5) lookup scheme; mirrors the encrypt stub's structure.
address generate_aescrypt_decryptBlock() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock");

  address start = __ pc();

  // Register from = R0; // source byte array
  // Register to = R1; // destination byte array
  // Register key = R2; // expanded key array
  // Register tbox = R3; // transposition box reference

  __ push (RegisterSet(R4, R12) | LR);
  __ fstmdbd(SP, FloatRegisterSet(D0, 4), writeback);
  __ sub(SP, SP, 32);

  // retrieve key length
  __ ldr(R9, Address(R2, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  // preserve TBox references
  __ add(R3, R3, arrayOopDesc::base_offset_in_bytes(T_INT));
  __ str(R3, Address(SP, 16));


  // Preserve the expanded key pointer
  __ fmsr(S8, R2);

  // The first key round is applied to the last round
  __ add(LR, R2, 16);


  // Initial round: load state words (big-endian) and EOR with key words 4..7.
  __ ldr(R5, Address(R0));
  __ ldr(R10, Address(LR, 4, post_indexed));
  __ rev(R5, R5);
  __ eor(R5, R5, R10);
  __ ldr(R6, Address(R0, 4));
  __ ldr(R10, Address(LR, 4, post_indexed));
  __ rev(R6, R6);
  __ eor(R6, R6, R10);
  __ ldr(R7, Address(R0, 8));
  __ ldr(R10, Address(LR, 4, post_indexed));
  __ rev(R7, R7);
  __ eor(R7, R7, R10);
  __ ldr(R8, Address(R0, 12));
  __ ldr(R10, Address(LR, 4, post_indexed));
  __ rev(R8, R8);
  __ eor(R8, R8, R10);


  // Store the key size; However before doing that adjust the key to
// ... compensate for the Initial and Last rounds
  __ sub(R9, R9, 8);
  __ fmsr(S7, R1);   // park the output pointer in a VFP register to free R1

  // load transposition box (T5)
  __ ldr(R0, Address(SP, 16));

  Label round;

  __ bind(round);
  // each sub-block is treated similarly:

  // combine SubBytes|ShiftRows|MixColumn through a precalculated set of tables
  // Utilize a Transposition Box lookup along with subsequent shift and EOR with a round key.
  // Instruction ordering is rearranged to minimize Read-After-Write dependency. Not that important on A15 target
  // with register renaming but performs ~10% better on A9.
  // Note the inverse-ShiftRows byte selection: columns pull from (sN, sN+3, sN+2, sN+1).
  __ mov(R12, AsmOperand(R5, lsr, 24));
  __ ubfx(R4, R8, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R7, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R6);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R10, R1, R12);   // new state word 0

  __ mov(R12, AsmOperand(R6, lsr, 24));
  __ ubfx(R4, R5, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R8, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R7);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R11, R1, R12);   // new state word 1

  __ mov(R12, AsmOperand(R7, lsr, 24));
  __ ubfx(R4, R6, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R5, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R8);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R3, R1, R12);
  __ str(R3, Address(SP, 0));   // new state word 2 spilled (no spare register)

  __ mov(R12, AsmOperand(R8, lsr, 24));
  __ ubfx(R4, R7, 16, 8);
  __ ldr (R1, Address(R0, R12, lsl, 2));
  __ ldr(R2, Address(R0, R4, lsl, 2));
  __ ubfx(R3, R6, 8, 8);
  __ eor(R1, R1, AsmOperand(R2, ror, 8));
  __ uxtb(R4, R5);
  __ ldr(R3, Address(R0, R3, lsl, 2));
  __ ldr(R4, Address(R0, R4, lsl, 2));
  __ ldr(R12, Address(LR, 4, post_indexed));
  __ eor(R1, R1, AsmOperand(R3, ror, 16));
  __ eor(R12, R12, AsmOperand(R4, ror, 24));
  __ eor(R8, R1, R12);   // new state word 3, written in place

  // update round count
  __ subs(R9, R9, 4);

  // commit the new state (word 2 comes back from the spill slot)
  __ mov(R5, R10);
  __ mov(R6, R11);
  __ ldr(R7, Address(SP, 0));

  __ b(round, gt);

  // last round - a special case, no MixColumn:

  // Retrieve expanded key pointer
  __ fmrs(LR, S8);

  __ mov_slow(R10, (int)SInvBox);

  // output buffer pointer
  __ fmrs(R9, S7);

  // process each sub-block in a similar manner:
  // 1. load a corresponding round key
  __ ldr(R11, Address(LR, 4, post_indexed));
  // 2. combine SubBytes and ShiftRows stages
  __ ldrb(R0, Address(R10, R5, lsr, 24));
  __ ubfx(R12, R8, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R7, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R6);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R3, R3, AsmOperand(R0, lsl, 8));
  // 3. AddRoundKey stage
  __ eor(R0, R3, R11);
  // 4. convert the result to LE representation
  __ rev(R0, R0);
  // 5. store in the output buffer
  __ str(R0, Address(R9, 4, post_indexed));

  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R6, lsr, 24));
  __ ubfx(R12, R5, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R8, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R7);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);
  __ str(R0, Address(R9, 4, post_indexed));

  __ ldr(R11, Address(LR, 4, post_indexed));
  __ ldrb(R0, Address(R10, R7, lsr, 24));
  __ ubfx(R12, R6, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R5, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R8);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);
  __ str(R0, Address(R9, 4, post_indexed));

  __ ldr(R11, Address(LR));   // final key word: no post-increment needed
  __ ldrb(R0, Address(R10, R8, lsr, 24));
  __ ubfx(R12, R7, 16, 8);
  __ ldrb(R1, Address(R10, R12));
  __ orr(R0, R1, AsmOperand(R0, lsl, 8));
  __ ubfx(R12, R6, 8, 8);
  __ ldrb(R2, Address(R10, R12));
  __ orr(R0, R2, AsmOperand(R0, lsl, 8));
  __ uxtb (R12, R5);
  __ ldrb(R3, Address(R10, R12));
  __ orr(R0, R3, AsmOperand(R0, lsl, 8));
  __ eor(R0, R0, R11);
  __ rev(R0, R0);
  __ str(R0, Address(R9));

  // epilogue: release local frame and restore callee-saved registers
  __ add(SP, SP, 32);
  __ fldmiad(SP, FloatRegisterSet(D0, 4), writeback);;
  __ pop(RegisterSet(R4, R12) | PC);

  return start;
}

// Generates the CBC-mode encryption stub: XOR each plaintext block with the
// running IV / previous ciphertext, then delegate the block cipher to the
// aescrypt_encryptBlock stub generated above.
address generate_cipherBlockChaining_encryptAESCrypt() {
  // R0 - plain
  // R1 - cipher
  // R2 - expanded key
  // R3 - Initialization Vector (IV)
  // [sp+0] - cipher len
  // [sp+4] Transposition Box reference

  __ align(CodeEntryAlignment);
  StubCodeMark
mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

  address start = __ pc();

  __ push(RegisterSet(R4, R12) | LR);
  // load cipher length (which is first element on the original calling stack)
  // 40 = 10 pushed scalar registers above
  __ ldr(R4, Address(SP, 40));

  __ sub(SP, SP, 32);

  // preserve some arguments
  __ mov(R5, R1);
  __ mov(R6, R2);

  // load IV
  __ ldmia(R3, RegisterSet(R9, R12), writeback);

  // preserve original source buffer on stack
  __ str(R0, Address(SP, 16));

  Label loop;
  __ bind(loop);
  // load one 16-byte plaintext block
  __ ldmia(R0, RegisterSet(R0, R1) | RegisterSet(R7, R8));

  // CBC chaining step: XOR plaintext with IV / previous ciphertext (R9-R12)
  __ eor(R0, R0, R9);
  __ eor(R1, R1, R10);
  __ eor(R7, R7, R11);
  __ eor(R8, R8, R12);
  // stage the XORed block in the local frame as the block-cipher input
  __ stmia(SP, RegisterSet(R0, R1) | RegisterSet(R7, R8));

  // set up aescrypt_encryptBlock arguments: src=SP scratch, dst, key, tbox
  __ mov(R0, SP);
  __ mov(R1, R5);
  __ mov(R2, R6);
  __ ldr(R3, Address(SP, 40+32+4));   // tbox ref: past pushed regs + local frame

  // near call is sufficient since the target is also in the stubs
  __ bl(StubRoutines::_aescrypt_encryptBlock);

  __ subs(R4, R4, 16);
  __ ldr(R0, Address(SP, 16), gt);
  // refresh the chaining value from the ciphertext just written (R5 advances)
  __ ldmia(R5, RegisterSet(R9, R12), writeback);
  __ add(R0, R0, 16, gt);
  __ str(R0, Address(SP, 16), gt);
  __ b(loop, gt);

  __ add(SP, SP, 32);
  __ pop(RegisterSet(R4, R12) | LR);
  // return cipher len (copied from the original argument)
  __ ldr(R0, Address(SP));
  __ bx(LR);

  return start;
}


// The CBC decryption could benefit from parallel processing as the blocks could be
// decrypted separately from each other.
// NEON is utilized (if available) to perform parallel execution on 8 blocks at a time.
// Since Transposition Box (tbox) is used the parallel execution will only apply to an
// Initial Round and the last round. It's not practical to use NEON for a table lookup
// larger than 128 bytes. It also appears to be faster performing tbox lookup
// sequentially and then executing the Galois Field calculation in parallel.
601 602 address generate_cipherBlockChaining_decryptAESCrypt() { 603 __ align(CodeEntryAlignment); 604 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 605 606 address start = __ pc(); 607 608 Label single_block_done, single_block, cbc_done; 609 // R0 - cipher 610 // R1 - plain 611 // R2 - expanded key 612 // R3 - Initialization Vector (iv) 613 // [sp+0] - cipher len 614 // [sp+4] - Transpotition Box reference 615 616 __ push(RegisterSet(R4, R12) | LR); 617 618 // load cipher len: must be modulo 16 619 __ ldr(R4, Address(SP, 40)); 620 621 if (VM_Version::has_simd()) { 622 __ andrs(R4, R4, 0x7f); 623 } 624 625 // preserve registers based arguments 626 __ mov(R7, R2); 627 __ mov(R8, R3); 628 629 if (VM_Version::has_simd()) { 630 __ b(single_block_done, eq); 631 } 632 633 __ bind(single_block); 634 // preserve args 635 __ mov(R5, R0); 636 __ mov(R6, R1); 637 638 // reload arguments 639 __ mov(R2, R7); 640 __ ldr(R3, Address(SP, 40+4)); 641 642 // near call is sufficient as the method is part of the StubGenerator 643 __ bl((address)StubRoutines::_aescrypt_decryptBlock); 644 645 // check remainig cipher size (for individual block processing) 646 __ subs(R4, R4, 16); 647 if (VM_Version::has_simd()) { 648 __ tst(R4, 0x7f); 649 } 650 651 // load IV (changes based on a CBC schedule) 652 __ ldmia(R8, RegisterSet(R9, R12)); 653 654 // load plaintext from the previous block processing 655 __ ldmia(R6, RegisterSet(R0, R3)); 656 657 // perform IV addition and save the plaintext for good now 658 __ eor(R0, R0, R9); 659 __ eor(R1, R1, R10); 660 __ eor(R2, R2, R11); 661 __ eor(R3, R3, R12); 662 __ stmia(R6, RegisterSet(R0, R3)); 663 664 // adjust pointers for next block processing 665 __ mov(R8, R5); 666 __ add(R0, R5, 16); 667 __ add(R1, R6, 16); 668 __ b(single_block, ne); 669 670 __ bind(single_block_done); 671 if (!VM_Version::has_simd()) { 672 __ b(cbc_done); 673 } else { 674 // done with single blocks. 
675 // check if any 8 block chunks are available for parallel processing 676 __ ldr(R4, Address(SP, 40)); 677 __ bics(R4, R4, 0x7f); 678 __ b(cbc_done, eq); 679 680 Label decrypt_8_blocks; 681 int quad = 1; 682 // Process 8 blocks in parallel 683 __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback); 684 __ sub(SP, SP, 40); 685 686 // record output buffer end address (used as a block counter) 687 Address output_buffer_end(SP, 16); 688 __ add(R5, R1, R4); 689 __ str(R5, output_buffer_end); 690 691 // preserve key pointer 692 Address rounds_key(SP, 28); 693 __ str(R7, rounds_key); 694 // in decryption the first 16 bytes of expanded key are used in the last round 695 __ add(LR, R7, 16); 696 697 698 // Record the end of the key which is used to indicate a last round 699 __ ldr(R3, Address(R7, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 700 __ add(R9, R7, AsmOperand(R3, lsl, 2)); 701 702 // preserve IV 703 Address iv(SP, 36); 704 __ str(R8, iv); 705 706 __ bind(decrypt_8_blocks); 707 __ mov(R5, R1); 708 709 // preserve original source pointer 710 Address original_src(SP, 32); 711 __ str(R0, original_src); 712 713 // Apply ShiftRow for 8 block at once: 714 // use output buffer for a temp storage to preload it into cache 715 716 __ vld1(D18, LR, MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 717 __ vld1(D0, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 718 __ vrev(D0, D0, quad, 32, MacroAssembler::VELEM_SIZE_8); 719 __ veor(D20, D0, D18, quad); 720 __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 721 722 __ vld1(D2, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 723 __ vrev(D2, D2, quad, 32, MacroAssembler::VELEM_SIZE_8); 724 __ veor(D20, D2, D18, quad); 725 __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, 
MacroAssembler::VLD1_TYPE_2_REGS); 726 727 __ vld1(D4, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 728 __ vrev(D4, D4, quad, 32, MacroAssembler::VELEM_SIZE_8); 729 __ veor(D20, D4, D18, quad); 730 __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 731 732 __ vld1(D6, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 733 __ vrev(D6, D6, quad, 32, MacroAssembler::VELEM_SIZE_8); 734 __ veor(D20, D6, D18, quad); 735 __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 736 737 __ vld1(D8, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 738 __ vrev(D8, D8, quad, 32, MacroAssembler::VELEM_SIZE_8); 739 __ veor(D20, D8, D18, quad); 740 __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 741 742 __ vld1(D10, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 743 __ vrev(D10, D10, quad, 32, MacroAssembler::VELEM_SIZE_8); 744 __ veor(D20, D10, D18, quad); 745 __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 746 747 __ vld1(D12, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 748 __ vrev(D12, D12, quad, 32, MacroAssembler::VELEM_SIZE_8); 749 __ veor(D20, D12, D18, quad); 750 __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 751 752 __ vld1(D14, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 753 __ vrev(D14, D14, quad, 32, MacroAssembler::VELEM_SIZE_8); 754 __ veor(D20, D14, D18, quad); 755 __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 756 757 758 // Local frame 
map: 759 // sp+20 - ouput buffer pointer 760 // sp+28 - key pointer 761 // sp+32 - original source 762 // sp+36 - block counter 763 764 765 // preserve output buffer pointer 766 Address block_current_output_buffer(SP, 20); 767 __ str(R1, block_current_output_buffer); 768 769 // individual rounds in block processing are executed sequentially . 770 Label block_start; 771 772 // record end of the output buffer 773 __ add(R0, R1, 128); 774 __ str(R0, Address(SP, 12)); 775 776 __ bind(block_start); 777 778 // load transporistion box reference (T5) 779 // location of the reference (6th incoming argument, second slot on the stack): 780 // 10 scalar registers on stack 781 // 8 double-precision FP registers 782 // 40 bytes frame size for local storage 783 // 4 bytes offset to the original arguments list 784 __ ldr(R0, Address(SP, 40+64+40+4)); 785 __ add(R0, R0, arrayOopDesc::base_offset_in_bytes(T_INT)); 786 787 // load rounds key and compensate for the first and last rounds 788 __ ldr(LR, rounds_key); 789 __ add(LR, LR, 32); 790 791 // load block data out buffer 792 __ ldr(R2, block_current_output_buffer); 793 __ ldmia(R2, RegisterSet(R5, R8)); 794 795 Label round; 796 __ bind(round); 797 798 // Utilize a Transposition Box lookup along with subsequent shift and EOR with a round key. 799 // instructions ordering is rearranged to minimize ReadAferWrite dependency. Not that important on A15 target 800 // with register renaming but performs ~10% better on A9. 
801 __ mov(R12, AsmOperand(R5, lsr, 24)); 802 __ ubfx(R4, R8, 16, 8); 803 __ ldr (R1, Address(R0, R12, lsl, 2)); 804 __ ldr(R2, Address(R0, R4, lsl, 2)); 805 __ ubfx(R3, R7, 8, 8); 806 __ eor(R1, R1, AsmOperand(R2, ror, 8)); 807 __ uxtb(R4, R6); 808 __ ldr(R3, Address(R0, R3, lsl, 2)); 809 __ ldr(R4, Address(R0, R4, lsl, 2)); 810 __ ldr(R12, Address(LR, 4, post_indexed)); 811 __ eor(R1, R1, AsmOperand(R3, ror, 16)); 812 __ eor(R12, R12, AsmOperand(R4, ror, 24)); 813 __ eor(R10, R1, R12); 814 815 __ mov(R12, AsmOperand(R6, lsr, 24)); 816 __ ubfx(R4, R5, 16, 8); 817 __ ldr (R1, Address(R0, R12, lsl, 2)); 818 __ ldr(R2, Address(R0, R4, lsl, 2)); 819 __ ubfx(R3, R8, 8, 8); 820 __ eor(R1, R1, AsmOperand(R2, ror, 8)); 821 __ uxtb(R4, R7); 822 __ ldr(R3, Address(R0, R3, lsl, 2)); 823 __ ldr(R4, Address(R0, R4, lsl, 2)); 824 __ ldr(R12, Address(LR, 4, post_indexed)); 825 __ eor(R1, R1, AsmOperand(R3, ror, 16)); 826 __ eor(R12, R12, AsmOperand(R4, ror, 24)); 827 __ eor(R11, R1, R12); 828 829 __ mov(R12, AsmOperand(R7, lsr, 24)); 830 __ ubfx(R4, R6, 16, 8); 831 __ ldr (R1, Address(R0, R12, lsl, 2)); 832 __ ldr(R2, Address(R0, R4, lsl, 2)); 833 __ ubfx(R3, R5, 8, 8); 834 __ eor(R1, R1, AsmOperand(R2, ror, 8)); 835 __ uxtb(R4, R8); 836 __ ldr(R3, Address(R0, R3, lsl, 2)); 837 __ ldr(R4, Address(R0, R4, lsl, 2)); 838 __ ldr(R12, Address(LR, 4, post_indexed)); 839 __ eor(R1, R1, AsmOperand(R3, ror, 16)); 840 __ eor(R12, R12, AsmOperand(R4, ror, 24)); 841 __ eor(R3, R1, R12); 842 __ str(R3, Address(SP, 0)); 843 844 __ mov(R12, AsmOperand(R8, lsr, 24)); 845 __ ubfx(R4, R7, 16, 8); 846 __ ldr (R1, Address(R0, R12, lsl, 2)); 847 __ ldr(R2, Address(R0, R4, lsl, 2)); 848 __ ubfx(R3, R6, 8, 8); 849 __ eor(R1, R1, AsmOperand(R2, ror, 8)); 850 __ uxtb(R4, R5); 851 __ ldr(R3, Address(R0, R3, lsl, 2)); 852 __ ldr(R4, Address(R0, R4, lsl, 2)); 853 __ ldr(R12, Address(LR, 4, post_indexed)); 854 __ eor(R1, R1, AsmOperand(R3, ror, 16)); 855 __ eor(R12, R12, AsmOperand(R4, ror, 24)); 856 __ 
eor(R8, R1, R12); 857 858 // see if we reached the key array end 859 __ cmp(R9, LR); 860 861 // load processed data 862 __ mov(R5, R10); 863 __ mov(R6, R11); 864 __ ldr(R7, Address(SP, 0)); 865 866 __ b(round, gt); 867 868 869 // last round is special 870 // this round could be implemented through vtbl instruction in NEON. However vtbl is limited to a 32-byte wide table (4 vectors), 871 // thus it requires 8 lookup rounds to cover 256-byte wide Si table. On the other hand scalar lookup is independent of the 872 // lookup table size and thus proves to be faster. 873 __ ldr(LR, block_current_output_buffer); 874 875 // cipher counter 876 __ ldr(R11, Address(SP, 12)); 877 878 __ mov_slow(R10, (int)SInvBox); 879 __ ldrb(R0, Address(R10, R5, lsr, 24)); 880 __ ubfx(R12, R8, 16, 8); 881 __ ldrb (R1, Address(R10, R12)); 882 __ orr(R0, R1, AsmOperand(R0, lsl, 8)); 883 __ ubfx(R12, R7, 8, 8); 884 __ ldrb(R2, Address(R10, R12)); 885 __ orr(R0, R2, AsmOperand(R0, lsl, 8)); 886 __ uxtb(R12, R6); 887 __ ldrb(R3, Address(R10, R12)); 888 __ orr(R0, R3, AsmOperand(R0, lsl, 8)); 889 __ str(R0, Address(LR, 4, post_indexed)); 890 891 __ ldrb(R0, Address(R10, R6, lsr, 24)); 892 __ ubfx(R12, R5, 16, 8); 893 __ ldrb (R1, Address(R10, R12)); 894 __ orr(R0, R1, AsmOperand(R0, lsl, 8)); 895 __ ubfx(R12, R8, 8, 8); 896 __ ldrb(R2, Address(R10, R12)); 897 __ orr(R0, R2, AsmOperand(R0, lsl, 8)); 898 __ uxtb(R12, R7); 899 __ ldrb(R3, Address(R10, R12)); 900 __ orr(R0, R3, AsmOperand(R0, lsl, 8)); 901 __ str(R0, Address(LR, 4, post_indexed)); 902 903 904 __ ldrb(R0, Address(R10, R7, lsr, 24)); 905 __ ubfx(R12, R6, 16, 8); 906 __ ldrb (R1, Address(R10, R12)); 907 __ orr(R0, R1, AsmOperand(R0, lsl, 8)); 908 __ ubfx(R12, R5, 8, 8); 909 __ ldrb(R2, Address(R10, R12)); 910 __ orr(R0, R2, AsmOperand(R0, lsl, 8)); 911 __ uxtb(R12, R8); 912 __ ldrb(R3, Address(R10, R12)); 913 __ orr(R0, R3, AsmOperand(R0, lsl, 8)); 914 __ str(R0, Address(LR, 4, post_indexed)); 915 916 917 __ ldrb(R0, Address(R10, R8, 
lsr, 24)); 918 __ ubfx(R12, R7, 16, 8); 919 __ ldrb (R1, Address(R10, R12)); 920 __ orr(R0, R1, AsmOperand(R0, lsl, 8)); 921 __ ubfx(R12, R6, 8, 8); 922 __ ldrb(R2, Address(R10, R12)); 923 __ orr(R0, R2, AsmOperand(R0, lsl, 8)); 924 __ uxtb(R12, R5); 925 __ ldrb(R3, Address(R10, R12)); 926 __ orr(R0, R3, AsmOperand(R0, lsl, 8)); 927 __ str(R0, Address(LR, 4, post_indexed)); 928 929 930 // preserve current scratch buffer pointer 931 __ cmp(R11, LR); 932 __ str(LR, block_current_output_buffer); 933 934 // go to the next block processing 935 __ b(block_start, ne); 936 937 938 939 // Perform last round AddRoundKey state on all 8 blocks 940 941 // load key pointer (remember that [sp+24] points to a byte #32 at the key array) 942 // last round is processed with the key[0 ..3] 943 __ ldr(LR, rounds_key); 944 945 // retrieve original output buffer pointer 946 __ ldr(R1, block_current_output_buffer); 947 __ sub(R1, R1, 128); 948 __ mov(R5, R1); 949 950 951 // retrieve original cipher (source) pointer 952 __ ldr(R0, original_src); 953 954 // retrieve IV (second argument on stack) 955 __ ldr(R6, iv); 956 957 __ vld1(D20, R6, MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 958 __ vrev(D20, D20, quad, 32, MacroAssembler::VELEM_SIZE_8); 959 960 // perform last AddRoundKey and IV addition 961 __ vld1(D18, Address(LR, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 962 963 __ vld1(D22, Address(R1, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 964 __ veor(D22, D22, D18, quad); 965 __ veor(D22, D22, D20, quad); 966 __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8); 967 __ vst1(D22, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 968 969 970 __ vld1(D22, Address(R1, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 971 __ veor(D22, D22, D18, quad); 972 __ veor(D22, D22, D0, quad); 973 __ vrev(D22, D22, quad, 
32, MacroAssembler::VELEM_SIZE_8); 974 __ vst1(D22, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 975 976 __ vld1(D22, Address(R1, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 977 __ veor(D22, D22, D18, quad); 978 __ veor(D22, D22, D2, quad); 979 __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8); 980 __ vst1(D22, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 981 982 __ vld1(D22, Address(R1, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 983 __ veor(D22, D22, D18, quad); 984 __ veor(D22, D22, D4, quad); 985 __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8); 986 __ vst1(D22, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 987 988 __ vld1(D22, Address(R1, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 989 __ veor(D22, D22, D18, quad); 990 __ veor(D22, D22, D6, quad); 991 __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8); 992 __ vst1(D22, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 993 994 __ vld1(D22, Address(R1, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 995 __ veor(D22, D22, D18, quad); 996 __ veor(D22, D22, D8, quad); 997 __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8); 998 __ vst1(D22, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 999 1000 __ vld1(D22, Address(R1, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 1001 __ veor(D22, D22, D18, quad); 1002 __ veor(D22, D22, D10, quad); 1003 __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8); 1004 __ vst1(D22, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 1005 1006 __ vld1(D22, Address(R1, 0, post_indexed), 
MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 1007 __ veor(D22, D22, D18, quad); 1008 __ veor(D22, D22, D12, quad); 1009 __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8); 1010 __ vst1(D22, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS); 1011 1012 1013 // check if we're done 1014 __ ldr(R4, output_buffer_end); 1015 __ cmp(R4, R1); 1016 __ add(R0, R0, 128-16); 1017 __ str(R0, iv); 1018 __ add(R0, R0, 16); 1019 1020 __ b(decrypt_8_blocks, ne); 1021 1022 __ add(SP, SP, 40); 1023 __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback);; 1024 } 1025 1026 __ bind(cbc_done); 1027 __ pop(RegisterSet(R4, R12) | LR); 1028 __ ldr(R0, Address(SP)); 1029 __ bx(LR); 1030 1031 return start; 1032 } 1033 #endif // COMPILE_CRYPTO