1 /*
   2  * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifdef COMPILE_CRYPTO
  26 
// The Rijndael S-box and inverted S-box are embedded here for faster access.
//
// Note about lookup tables (T1...T4 and T5...T8):
// The tables (boxes) combine ahead-of-time precalculated transposition and mixing steps
// as an alternative to a runtime calculation.
// The tables are statically generated in the com/sun/crypto/provider/AESCrypt class.
// Only the first table reference is passed to the AES methods below. The other three tables
// used in encryption and decryption are calculated at runtime by rotating the T1 result
// accordingly. This is essentially a free operation on ARM thanks to the embedded
// register-shifted-register EOR capability.
// The table reference is passed as the last argument on the parameter list.
// The table lookup method proves to perform better than a runtime Galois Field calculation,
// due to the lack of HW acceleration for the latter.
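//
// For illustration only (this sketch is not part of the stub code; ror32 and the variable
// names are assumptions for exposition): one encryption round column computed from the
// single T1 table, matching the rotate-and-EOR sequence generated in the round loops below:
//
//   uint32_t t0 = T1[ s0 >> 24         ];        // T1 lookup
//   t0 ^= ror32(T1[(s1 >> 16) & 0xff],  8);      // T2[x] == ror(T1[x],  8)
//   t0 ^= ror32(T1[(s2 >>  8) & 0xff], 16);      // T3[x] == ror(T1[x], 16)
//   t0 ^= ror32(T1[ s3        & 0xff], 24);      // T4[x] == ror(T1[x], 24)
//   t0 ^= rk[i];                                 // AddRoundKey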
  39 
  40 unsigned char * SBox;
  41 unsigned char * SInvBox;
  42 
void aes_init() {
  44 
  45   const static unsigned char Si[256] =
  46     {
  47       0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38,
  48       0xBF, 0x40, 0xA3, 0x9E, 0x81, 0xF3, 0xD7, 0xFB,
  49       0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87,
  50       0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB,
  51       0x54, 0x7B, 0x94, 0x32, 0xA6, 0xC2, 0x23, 0x3D,
  52       0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E,
  53       0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2,
  54       0x76, 0x5B, 0xA2, 0x49, 0x6D, 0x8B, 0xD1, 0x25,
  55       0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16,
  56       0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92,
  57       0x6C, 0x70, 0x48, 0x50, 0xFD, 0xED, 0xB9, 0xDA,
  58       0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84,
  59       0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A,
  60       0xF7, 0xE4, 0x58, 0x05, 0xB8, 0xB3, 0x45, 0x06,
  61       0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02,
  62       0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B,
  63       0x3A, 0x91, 0x11, 0x41, 0x4F, 0x67, 0xDC, 0xEA,
  64       0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73,
  65       0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85,
  66       0xE2, 0xF9, 0x37, 0xE8, 0x1C, 0x75, 0xDF, 0x6E,
  67       0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89,
  68       0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B,
  69       0xFC, 0x56, 0x3E, 0x4B, 0xC6, 0xD2, 0x79, 0x20,
  70       0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4,
  71       0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31,
  72       0xB1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xEC, 0x5F,
  73       0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D,
  74       0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF,
  75       0xA0, 0xE0, 0x3B, 0x4D, 0xAE, 0x2A, 0xF5, 0xB0,
  76       0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61,
  77       0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26,
  78       0xE1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0C, 0x7D
  79     };
  80 
  81   static const unsigned char S[256]={
  82       0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
  83       0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
  84       0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
  85       0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
  86       0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC,
  87       0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
  88       0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A,
  89       0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
  90       0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
  91       0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
  92       0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B,
  93       0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
  94       0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85,
  95       0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
  96       0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
  97       0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
  98       0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17,
  99       0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
 100       0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88,
 101       0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
 102       0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
 103       0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
 104       0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9,
 105       0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
 106       0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
 107       0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
 108       0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
 109       0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
 110       0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94,
 111       0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
 112       0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68,
 113       0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16
 114   };
 115 
 116   SBox = (unsigned char*)S;
 117   SInvBox = (unsigned char*)Si;
 118 }
 119 
 120 address generate_aescrypt_encryptBlock() {
 121   __ align(CodeEntryAlignment);
 122   StubCodeMark mark(this, "StubRoutines", "aesencryptBlock");
 123 
 124   address start = __ pc();
 125 
 126   //    Register from = R0; // source byte array
 127   //    Register to = R1;   // destination byte array
 128   //    Register key = R2;  // expanded key array
 129   //    Register tbox = R3; // transposition box reference
 130 
 131   __ push (RegisterSet(R4, R12) | LR);
 132   __ fstmdbd(SP, FloatRegisterSet(D0, 4), writeback);
 133   __ sub(SP, SP, 32);
 134 
 135   // preserve TBox references
 136   __ add(R3, R3, arrayOopDesc::base_offset_in_bytes(T_INT));
 137   __ str(R3, Address(SP, 16));
 138 
 139   // retrieve key length. The length is used to determine the number of subsequent rounds (10, 12 or 14)
 140   __ ldr(R9, Address(R2, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 141 
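  // For reference (standard AES parameters, not derived from this code): the expanded key
  // holds 4 * (rounds + 1) words, so 44 words -> 10 rounds (AES-128), 52 -> 12 (AES-192),
  // 60 -> 14 (AES-256), i.e. rounds = key_length_in_words / 4 - 1.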
 142   __ ldr(R5, Address(R0));
 143   __ ldr(R10, Address(R2, 4, post_indexed));
 144   __ rev(R5, R5);
 145   __ eor(R5, R5, R10);
 146   __ ldr(R6, Address(R0, 4));
 147   __ ldr(R10, Address(R2, 4, post_indexed));
 148   __ rev(R6, R6);
 149   __ eor(R6, R6, R10);
 150   __ ldr(R7, Address(R0, 8));
 151   __ ldr(R10, Address(R2, 4, post_indexed));
 152   __ rev(R7, R7);
 153   __ eor(R7, R7, R10);
 154   __ ldr(R8, Address(R0, 12));
 155   __ ldr(R10, Address(R2, 4, post_indexed));
 156   __ rev(R8, R8);
 157   __ eor(R8, R8, R10);
 158 
  // Adjust the round counter to compensate for the initial and last rounds,
  // and preserve the output buffer pointer in S7
 160   __ sub(R9, R9, 8);
 161   __ fmsr(S7, R1);
 162 
  // load the first transposition box (T1)
 164   __ ldr(R0, Address(SP, 16));
 165 
 166   __ mov(LR, R2);
 167 
 168   Label round;
 169 
 170   __ bind(round);
 171 
  // Utilize a Transposition Box lookup along with a subsequent shift and EOR with a round key.
  // The instruction ordering is rearranged to minimize read-after-write dependencies. This matters
  // little on an A15 target with register renaming, but performs ~10% better on an A9.
 175   __ mov(R12, AsmOperand(R5, lsr, 24));
 176   __ ubfx(R4, R6, 16, 8);
 177   __ ldr (R1, Address(R0, R12, lsl, 2));
 178   __ ldr(R2, Address(R0, R4, lsl, 2));
 179   __ ubfx(R3, R7, 8, 8);
 180   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 181   __ uxtb(R4, R8);
 182   __ ldr(R3, Address(R0, R3, lsl, 2));
 183   __ ldr(R4, Address(R0, R4, lsl, 2));
 184   __ ldr(R12, Address(LR, 4, post_indexed));
 185   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 186   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 187   __ eor(R10, R1, R12);
 188 
 189   __ mov(R12, AsmOperand(R6, lsr, 24));
 190   __ ubfx(R4, R7, 16, 8);
 191   __ ldr (R1, Address(R0, R12, lsl, 2));
 192   __ ldr(R2, Address(R0, R4, lsl, 2));
 193   __ ubfx(R3, R8, 8, 8);
 194   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 195   __ uxtb(R4, R5);
 196   __ ldr(R3, Address(R0, R3, lsl, 2));
 197   __ ldr(R4, Address(R0, R4, lsl, 2));
 198   __ ldr(R12, Address(LR, 4, post_indexed));
 199   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 200   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 201   __ eor(R11, R1, R12);
 202 
 203   __ mov(R12, AsmOperand(R7, lsr, 24));
 204   __ ubfx(R4, R8, 16, 8);
 205   __ ldr (R1, Address(R0, R12, lsl, 2));
 206   __ ldr(R2, Address(R0, R4, lsl, 2));
 207   __ ubfx(R3, R5, 8, 8);
 208   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 209   __ uxtb(R4, R6);
 210   __ ldr(R3, Address(R0, R3, lsl, 2));
 211   __ ldr(R4, Address(R0, R4, lsl, 2));
 212   __ ldr(R12, Address(LR, 4, post_indexed));
 213   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 214   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 215   __ eor(R3, R1, R12);
 216   __ str(R3, Address(SP, 0));
 217 
 218   __ mov(R12, AsmOperand(R8, lsr, 24));
 219   __ ubfx(R4, R5, 16, 8);
 220   __ ldr (R1, Address(R0, R12, lsl, 2));
 221   __ ldr(R2, Address(R0, R4, lsl, 2));
 222   __ ubfx(R3, R6, 8, 8);
 223   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 224   __ uxtb(R4, R7);
 225   __ ldr(R3, Address(R0, R3, lsl, 2));
 226   __ ldr(R4, Address(R0, R4, lsl, 2));
 227   __ ldr(R12, Address(LR, 4, post_indexed));
 228   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 229   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 230   __ eor(R8, R1, R12);
 231 
 232   // update round count
 233   __ subs(R9, R9, 4);
 234 
 235   __ mov(R5, R10);
 236   __ mov(R6, R11);
 237   __ ldr(R7, Address(SP, 0));
 238 
 239   __ b(round, gt);
 240 
 241 
  // last round - a special case, no MixColumns
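  // Illustration only (C-like sketch, not part of the stub; names are for exposition):
  // each output word is assembled byte-by-byte from SBox lookups (SubBytes + ShiftRows),
  // then XORed with the round key and byte-swapped to little-endian:
  //   out0 = rev32( ((SBox[s0 >> 24] << 24) | (SBox[(s1 >> 16) & 0xff] << 16) |
  //                  (SBox[(s2 >>  8) & 0xff] << 8) | SBox[s3 & 0xff]) ^ rk[i] );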
 243   __ mov_slow(R10, (int)SBox);
 244 
 245 
 246   // output buffer pointer
 247   __ fmrs(R9, S7);
 248 
 249   __ ldr(R11, Address(LR, 4, post_indexed));
 250   __ ldrb(R0, Address(R10, R5, lsr, 24));
 251   __ ubfx(R12, R6, 16, 8);
 252   __ ldrb(R1, Address(R10, R12));
 253   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 254   __ ubfx(R12, R7, 8, 8);
 255   __ ldrb(R2, Address(R10, R12));
 256   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 257   __ uxtb (R12, R8);
 258   __ ldrb(R3, Address(R10, R12));
 259   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 260   __ eor(R0, R0, R11);
 261   __ rev(R0, R0);
 262   __ str(R0, Address(R9, 4, post_indexed));
 263 
 264   __ ldr(R11, Address(LR, 4, post_indexed));
 265   __ ldrb(R0, Address(R10, R6, lsr, 24));
 266   __ ubfx(R12, R7, 16, 8);
 267   __ ldrb(R1, Address(R10, R12));
 268   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 269   __ ubfx(R12, R8, 8, 8);
 270   __ ldrb(R2, Address(R10, R12));
 271   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 272   __ uxtb (R12, R5);
 273   __ ldrb(R3, Address(R10, R12));
 274   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 275   __ eor(R0, R0, R11);
 276   __ rev(R0, R0);
 277 
 278   __ str(R0, Address(R9, 4, post_indexed));
 279   __ ldr(R11, Address(LR, 4, post_indexed));
 280   __ ldrb(R0, Address(R10, R7, lsr, 24));
 281   __ ubfx(R12, R8, 16, 8);
 282   __ ldrb(R1, Address(R10, R12));
 283   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 284   __ ubfx(R12, R5, 8, 8);
 285   __ ldrb(R2, Address(R10, R12));
 286   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 287   __ uxtb (R12, R6);
 288   __ ldrb(R3, Address(R10, R12));
 289   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 290   __ eor(R0, R0, R11);
 291   __ rev(R0, R0);
 292 
 293   __ str(R0, Address(R9, 4, post_indexed));
 294   __ ldr(R11, Address(LR));
 295   __ ldrb(R0, Address(R10, R8, lsr, 24));
 296   __ ubfx(R12, R5, 16, 8);
 297   __ ldrb(R1, Address(R10, R12));
 298   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 299   __ ubfx(R12, R6, 8, 8);
 300   __ ldrb(R2, Address(R10, R12));
 301   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 302   __ uxtb (R12, R7);
 303   __ ldrb(R3, Address(R10, R12));
 304   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 305   __ eor(R0, R0, R11);
 306   __ rev(R0, R0);
 307 
 308   __ str(R0, Address(R9));
 309 
 310   __ add(SP, SP, 32);
  __ fldmiad(SP, FloatRegisterSet(D0, 4), writeback);
 312 
 313   __ pop(RegisterSet(R4, R12) | PC);
 314   return start;
 315 }
 316 
 317 address generate_aescrypt_decryptBlock() {
 318   __ align(CodeEntryAlignment);
 319   StubCodeMark mark(this, "StubRoutines", "aesdecryptBlock");
 320 
 321   address start = __ pc();
 322 
 323   //    Register from = R0; // source byte array
 324   //    Register to = R1;   // destination byte array
 325   //    Register key = R2;  // expanded key array
 326   //    Register tbox = R3; // transposition box reference
 327 
 328   __ push (RegisterSet(R4, R12) | LR);
 329   __ fstmdbd(SP, FloatRegisterSet(D0, 4), writeback);
 330   __ sub(SP, SP, 32);
 331 
 332   // retrieve key length
 333   __ ldr(R9, Address(R2, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 334 
 335   // preserve TBox references
 336   __ add(R3, R3, arrayOopDesc::base_offset_in_bytes(T_INT));
 337   __ str(R3, Address(SP, 16));
 338 
 339 
 340   // Preserve the expanded key pointer
 341   __ fmsr(S8, R2);
 342 
  // The first 16 bytes of the expanded key are reserved for the last round;
  // the initial AddRoundKey therefore starts at key + 16
 344   __ add(LR, R2, 16);
 345 
 346 
 347   __ ldr(R5, Address(R0));
 348   __ ldr(R10, Address(LR, 4, post_indexed));
 349   __ rev(R5, R5);
 350   __ eor(R5, R5, R10);
 351   __ ldr(R6, Address(R0, 4));
 352   __ ldr(R10, Address(LR, 4, post_indexed));
 353   __ rev(R6, R6);
 354   __ eor(R6, R6, R10);
 355   __ ldr(R7, Address(R0, 8));
 356   __ ldr(R10, Address(LR, 4, post_indexed));
 357   __ rev(R7, R7);
 358   __ eor(R7, R7, R10);
 359   __ ldr(R8, Address(R0, 12));
 360   __ ldr(R10, Address(LR, 4, post_indexed));
 361   __ rev(R8, R8);
 362   __ eor(R8, R8, R10);
 363 
 364 
  // Adjust the round counter to compensate for the initial and last rounds,
  // and preserve the output buffer pointer in S7
 366   __ sub(R9, R9, 8);
 367   __ fmsr(S7, R1);
 368 
  // load the transposition box (T5)
 370   __ ldr(R0, Address(SP, 16));
 371 
 372   Label round;
 373 
 374   __ bind(round);
  // each sub-block is treated similarly:

  // combine SubBytes|ShiftRows|MixColumns through a precalculated set of tables.
  // Utilize a Transposition Box lookup along with a subsequent shift and EOR with a round key.
  // The instruction ordering is rearranged to minimize read-after-write dependencies. This matters
  // little on an A15 target with register renaming, but performs ~10% better on an A9.
 381   __ mov(R12, AsmOperand(R5, lsr, 24));
 382   __ ubfx(R4, R8, 16, 8);
 383   __ ldr (R1, Address(R0, R12, lsl, 2));
 384   __ ldr(R2, Address(R0, R4, lsl, 2));
 385   __ ubfx(R3, R7, 8, 8);
 386   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 387   __ uxtb(R4, R6);
 388   __ ldr(R3, Address(R0, R3, lsl, 2));
 389   __ ldr(R4, Address(R0, R4, lsl, 2));
 390   __ ldr(R12, Address(LR, 4, post_indexed));
 391   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 392   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 393   __ eor(R10, R1, R12);
 394 
 395   __ mov(R12, AsmOperand(R6, lsr, 24));
 396   __ ubfx(R4, R5, 16, 8);
 397   __ ldr (R1, Address(R0, R12, lsl, 2));
 398   __ ldr(R2, Address(R0, R4, lsl, 2));
 399   __ ubfx(R3, R8, 8, 8);
 400   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 401   __ uxtb(R4, R7);
 402   __ ldr(R3, Address(R0, R3, lsl, 2));
 403   __ ldr(R4, Address(R0, R4, lsl, 2));
 404   __ ldr(R12, Address(LR, 4, post_indexed));
 405   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 406   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 407   __ eor(R11, R1, R12);
 408 
 409   __ mov(R12, AsmOperand(R7, lsr, 24));
 410   __ ubfx(R4, R6, 16, 8);
 411   __ ldr (R1, Address(R0, R12, lsl, 2));
 412   __ ldr(R2, Address(R0, R4, lsl, 2));
 413   __ ubfx(R3, R5, 8, 8);
 414   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 415   __ uxtb(R4, R8);
 416   __ ldr(R3, Address(R0, R3, lsl, 2));
 417   __ ldr(R4, Address(R0, R4, lsl, 2));
 418   __ ldr(R12, Address(LR, 4, post_indexed));
 419   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 420   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 421   __ eor(R3, R1, R12);
 422   __ str(R3, Address(SP, 0));
 423 
 424   __ mov(R12, AsmOperand(R8, lsr, 24));
 425   __ ubfx(R4, R7, 16, 8);
 426   __ ldr (R1, Address(R0, R12, lsl, 2));
 427   __ ldr(R2, Address(R0, R4, lsl, 2));
 428   __ ubfx(R3, R6, 8, 8);
 429   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 430   __ uxtb(R4, R5);
 431   __ ldr(R3, Address(R0, R3, lsl, 2));
 432   __ ldr(R4, Address(R0, R4, lsl, 2));
 433   __ ldr(R12, Address(LR, 4, post_indexed));
 434   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 435   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 436   __ eor(R8, R1, R12);
 437 
 438   // update round count
 439   __ subs(R9, R9, 4);
 440 
 441   __ mov(R5, R10);
 442   __ mov(R6, R11);
 443   __ ldr(R7, Address(SP, 0));
 444 
 445   __ b(round, gt);
 446 
  // last round - a special case, no MixColumns:
 448 
 449   // Retrieve expanded key pointer
 450   __ fmrs(LR, S8);
 451 
 452   __ mov_slow(R10, (int)SInvBox);
 453 
 454   // output buffer pointer
 455   __ fmrs(R9, S7);
 456 
 457   // process each sub-block in a similar manner:
 458   // 1. load a corresponding round key
 459   __ ldr(R11, Address(LR, 4, post_indexed));
 460   // 2. combine SubBytes and ShiftRows stages
 461   __ ldrb(R0, Address(R10, R5, lsr, 24));
 462   __ ubfx(R12, R8, 16, 8);
 463   __ ldrb(R1, Address(R10, R12));
 464   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 465   __ ubfx(R12, R7, 8, 8);
 466   __ ldrb(R2, Address(R10, R12));
 467   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 468   __ uxtb (R12, R6);
 469   __ ldrb(R3, Address(R10, R12));
 470   __ orr(R3, R3, AsmOperand(R0, lsl, 8));
 471   // 3. AddRoundKey stage
 472   __ eor(R0, R3, R11);
 473   // 4. convert the result to LE representation
 474   __ rev(R0, R0);
 475   // 5. store in the output buffer
 476   __ str(R0, Address(R9, 4, post_indexed));
 477 
 478   __ ldr(R11, Address(LR, 4, post_indexed));
 479   __ ldrb(R0, Address(R10, R6, lsr, 24));
 480   __ ubfx(R12, R5, 16, 8);
 481   __ ldrb(R1, Address(R10, R12));
 482   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 483   __ ubfx(R12, R8, 8, 8);
 484   __ ldrb(R2, Address(R10, R12));
 485   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 486   __ uxtb (R12, R7);
 487   __ ldrb(R3, Address(R10, R12));
 488   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 489   __ eor(R0, R0, R11);
 490   __ rev(R0, R0);
 491   __ str(R0, Address(R9, 4, post_indexed));
 492 
 493   __ ldr(R11, Address(LR, 4, post_indexed));
 494   __ ldrb(R0, Address(R10, R7, lsr, 24));
 495   __ ubfx(R12, R6, 16, 8);
 496   __ ldrb(R1, Address(R10, R12));
 497   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 498   __ ubfx(R12, R5, 8, 8);
 499   __ ldrb(R2, Address(R10, R12));
 500   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 501   __ uxtb (R12, R8);
 502   __ ldrb(R3, Address(R10, R12));
 503   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 504   __ eor(R0, R0, R11);
 505   __ rev(R0, R0);
 506   __ str(R0, Address(R9, 4, post_indexed));
 507 
 508   __ ldr(R11, Address(LR));
 509   __ ldrb(R0, Address(R10, R8, lsr, 24));
 510   __ ubfx(R12, R7, 16, 8);
 511   __ ldrb(R1, Address(R10, R12));
 512   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 513   __ ubfx(R12, R6, 8, 8);
 514   __ ldrb(R2, Address(R10, R12));
 515   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 516   __ uxtb (R12, R5);
 517   __ ldrb(R3, Address(R10, R12));
 518   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 519   __ eor(R0, R0, R11);
 520   __ rev(R0, R0);
 521   __ str(R0, Address(R9));
 522 
 523   __ add(SP, SP, 32);
  __ fldmiad(SP, FloatRegisterSet(D0, 4), writeback);
 525   __ pop(RegisterSet(R4, R12) | PC);
 526 
 527   return start;
 528 }
 529 
 530 address generate_cipherBlockChaining_encryptAESCrypt() {
 531   // R0 - plain
 532   // R1 - cipher
 533   // R2 - expanded key
 534   // R3 - Initialization Vector (IV)
 535   // [sp+0] - cipher len
 536   // [sp+4] Transposition Box reference
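  //
  // For reference, standard CBC encryption (not part of the stub; E_K denotes the
  // single-block encryption implemented by the stub above):
  //   C[0] = E_K(P[0] ^ IV);   C[i] = E_K(P[i] ^ C[i-1])
  // so each block can only be encrypted after the previous one.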
 537 
 538   __ align(CodeEntryAlignment);
 539   StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
 540 
 541   address start = __ pc();
 542 
 543   __ push(RegisterSet(R4, R12) | LR);
  // load the cipher length (the first element on the original calling stack)
 545   __ ldr(R4, Address(SP, 40));
 546 
 547   __ sub(SP, SP, 32);
 548 
 549   // preserve some arguments
 550   __ mov(R5, R1);
 551   __ mov(R6, R2);
 552 
 553   // load IV
 554   __ ldmia(R3, RegisterSet(R9, R12), writeback);
 555 
 556   // preserve original source buffer on stack
 557   __ str(R0, Address(SP, 16));
 558 
 559   Label loop;
 560   __ bind(loop);
 561   __ ldmia(R0, RegisterSet(R0, R1) | RegisterSet(R7, R8));
 562 
 563   __ eor(R0, R0, R9);
 564   __ eor(R1, R1, R10);
 565   __ eor(R7, R7, R11);
 566   __ eor(R8, R8, R12);
 567   __ stmia(SP, RegisterSet(R0, R1) | RegisterSet(R7, R8));
 568 
 569   __ mov(R0, SP);
 570   __ mov(R1, R5);
 571   __ mov(R2, R6);
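  // load the Transposition Box reference (the second stack argument: 40 bytes of saved
  // registers + 32 bytes of local frame + 4 bytes to reach the second original stack slot)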
 572   __ ldr(R3, Address(SP, 40+32+4));
 573 
 574   // near call is sufficient since the target is also in the stubs
 575   __ bl(StubRoutines::_aescrypt_encryptBlock);
 576 
 577   __ subs(R4, R4, 16);
 578   __ ldr(R0, Address(SP, 16), gt);
 579   __ ldmia(R5, RegisterSet(R9, R12), writeback);
 580   __ add(R0, R0, 16, gt);
 581   __ str(R0, Address(SP, 16), gt);
 582   __ b(loop, gt);
 583 
 584   __ add(SP, SP, 32);
 585   __ pop(RegisterSet(R4, R12) | LR);
 586   // return cipher len (copied from the original argument)
 587   __ ldr(R0, Address(SP));
 588   __ bx(LR);
 589 
 590   return start;
 591 }
 592 
 593 
// CBC decryption can benefit from parallel processing as the blocks can be
// decrypted separately from each other.
// NEON is utilized (if available) to perform parallel execution on 8 blocks at a time.
// Since a Transposition Box (tbox) is used, the parallel execution only applies to the
// initial round and the last round. It is not practical to use NEON for a table lookup
// larger than 128 bytes. It also appears to be faster to perform the tbox lookup
// sequentially than to execute the Galois Field calculation in parallel.
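//
// For reference, standard CBC decryption (sketch only; D_K denotes the single-block
// decryption implemented by the stub above):
//   P[i] = D_K(C[i]) ^ C[i-1]   (with C[-1] = IV)
// Every plaintext block depends only on ciphertext blocks, which is what allows the
// 8-block parallel path below.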
 601 
 602 address generate_cipherBlockChaining_decryptAESCrypt() {
 603   __ align(CodeEntryAlignment);
 604   StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
 605 
 606   address start = __ pc();
 607 
 608   Label single_block_done, single_block, cbc_done;
 609   // R0 - cipher
 610   // R1 - plain
 611   // R2 - expanded key
 612   // R3 - Initialization Vector (iv)
 613   // [sp+0] - cipher len
  // [sp+4] - Transposition Box reference
 615 
 616   __ push(RegisterSet(R4, R12) | LR);
 617 
  // load cipher len: must be a multiple of 16
 619   __ ldr(R4, Address(SP, 40));
 620 
 621   if (VM_Version::has_simd()) {
 622     __ andrs(R4, R4, 0x7f);
 623   }
 624 
  // preserve register-based arguments
 626   __ mov(R7, R2);
 627   __ mov(R8, R3);
 628 
 629   if (VM_Version::has_simd()) {
 630     __ b(single_block_done, eq);
 631   }
 632 
 633   __ bind(single_block);
 634   // preserve args
 635   __ mov(R5, R0);
 636   __ mov(R6, R1);
 637 
 638   // reload arguments
 639   __ mov(R2, R7);
 640   __ ldr(R3, Address(SP, 40+4));
 641 
 642   // near call is sufficient as the method is part of the StubGenerator
 643   __ bl((address)StubRoutines::_aescrypt_decryptBlock);
 644 
  // check remaining cipher size (for individual block processing)
 646   __ subs(R4, R4, 16);
 647   if (VM_Version::has_simd()) {
 648     __ tst(R4, 0x7f);
 649   }
 650 
 651   // load IV (changes based on a CBC schedule)
 652   __ ldmia(R8, RegisterSet(R9, R12));
 653 
 654   // load plaintext from the previous block processing
 655   __ ldmia(R6, RegisterSet(R0, R3));
 656 
  // perform the IV addition and store the final plaintext
 658   __ eor(R0, R0, R9);
 659   __ eor(R1, R1, R10);
 660   __ eor(R2, R2, R11);
 661   __ eor(R3, R3, R12);
 662   __ stmia(R6, RegisterSet(R0, R3));
 663 
 664   // adjust pointers for next block processing
 665   __ mov(R8, R5);
 666   __ add(R0, R5, 16);
 667   __ add(R1, R6, 16);
 668   __ b(single_block, ne);
 669 
 670   __ bind(single_block_done);
 671   if (!VM_Version::has_simd()) {
 672     __ b(cbc_done);
 673   } else {
 674   // done with single blocks.
 675   // check if any 8 block chunks are available for parallel processing
 676   __ ldr(R4, Address(SP, 40));
 677   __ bics(R4, R4, 0x7f);
 678   __ b(cbc_done, eq);
 679 
 680   Label decrypt_8_blocks;
 681   int quad = 1;
 682   // Process 8 blocks in parallel
 683   __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback);
 684   __ sub(SP, SP, 40);
 685 
 686   // record output buffer end address (used as a block counter)
 687   Address output_buffer_end(SP, 16);
 688   __ add(R5, R1, R4);
 689   __ str(R5, output_buffer_end);
 690 
 691   // preserve key pointer
 692   Address rounds_key(SP, 28);
 693   __ str(R7, rounds_key);
 694   // in decryption the first 16 bytes of expanded key are used in the last round
 695   __ add(LR, R7, 16);
 696 
 697 
  // Record the end of the key, which is used to detect the last round
 699   __ ldr(R3, Address(R7, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
 700   __ add(R9, R7, AsmOperand(R3, lsl, 2));
 701 
 702   // preserve IV
 703   Address iv(SP, 36);
 704   __ str(R8, iv);
 705 
 706   __ bind(decrypt_8_blocks);
 707   __ mov(R5, R1);
 708 
 709   // preserve original source pointer
 710   Address original_src(SP, 32);
 711   __ str(R0, original_src);
 712 
  // Apply the initial AddRoundKey to 8 blocks at once (byte-swap each word, then XOR
  // with the first round key); use the output buffer as temporary storage to preload
  // it into the cache
 715 
 716   __ vld1(D18, LR, MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 717   __ vld1(D0, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 718   __ vrev(D0, D0, quad, 32, MacroAssembler::VELEM_SIZE_8);
 719   __ veor(D20, D0, D18, quad);
 720   __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 721 
 722   __ vld1(D2, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 723   __ vrev(D2, D2, quad, 32, MacroAssembler::VELEM_SIZE_8);
 724   __ veor(D20, D2, D18, quad);
 725   __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 726 
 727   __ vld1(D4, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 728   __ vrev(D4, D4, quad, 32, MacroAssembler::VELEM_SIZE_8);
 729   __ veor(D20, D4, D18, quad);
 730   __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 731 
 732   __ vld1(D6, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 733   __ vrev(D6, D6, quad, 32, MacroAssembler::VELEM_SIZE_8);
 734   __ veor(D20, D6, D18, quad);
 735   __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 736 
 737   __ vld1(D8, Address(R0, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 738   __ vrev(D8, D8, quad, 32, MacroAssembler::VELEM_SIZE_8);
 739   __ veor(D20, D8, D18, quad);
 740   __ vst1(D20, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 741 
 742   __ vld1(D10, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 743   __ vrev(D10, D10, quad, 32, MacroAssembler::VELEM_SIZE_8);
 744   __ veor(D20, D10, D18, quad);
 745   __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 746 
 747   __ vld1(D12, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 748   __ vrev(D12, D12, quad, 32, MacroAssembler::VELEM_SIZE_8);
 749   __ veor(D20, D12, D18, quad);
 750   __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 751 
 752   __ vld1(D14, Address(R0, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 753   __ vrev(D14, D14, quad, 32, MacroAssembler::VELEM_SIZE_8);
 754   __ veor(D20, D14, D18, quad);
 755   __ vst1(D20, Address(R5, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 756 
 757 
  // Local frame map:
  // sp+0  - scratch slot (temp for the fourth round column)
  // sp+12 - end of the current 8-block output chunk
  // sp+16 - output buffer end address (used as the block counter)
  // sp+20 - current output buffer pointer
  // sp+28 - key pointer
  // sp+32 - original source pointer
  // sp+36 - IV pointer
 763 
 764 
 765   // preserve output buffer pointer
 766   Address block_current_output_buffer(SP, 20);
 767   __ str(R1, block_current_output_buffer);
 768 
  // individual rounds in block processing are executed sequentially.
 770   Label block_start;
 771 
 772   // record end of the output buffer
 773   __ add(R0, R1, 128);
 774   __ str(R0, Address(SP, 12));
 775 
 776   __ bind(block_start);
 777 
  // load the transposition box reference (T5)
  // location of the reference (6th incoming argument, second slot on the stack):
  // 10 scalar registers pushed on the stack (40 bytes)
  //  8 double-precision FP registers (64 bytes)
  // 40 bytes of frame size for local storage
  //  4 bytes of offset to reach the second slot of the original argument list
  // i.e. 40 + 64 + 40 + 4 = 148 bytes above the current SP
 784   __ ldr(R0, Address(SP, 40+64+40+4));
 785   __ add(R0, R0, arrayOopDesc::base_offset_in_bytes(T_INT));
 786 
 787   // load rounds key and compensate for the first and last rounds
 788   __ ldr(LR, rounds_key);
 789   __ add(LR, LR, 32);
 790 
  // load the block data from the (temporary) output buffer
 792   __ ldr(R2, block_current_output_buffer);
 793   __ ldmia(R2, RegisterSet(R5, R8));
 794 
 795   Label round;
 796   __ bind(round);
 797 
  // Utilize a Transposition Box lookup along with a subsequent shift and EOR with a round key.
  // The instruction ordering is rearranged to minimize read-after-write dependencies. This matters
  // little on an A15 target with register renaming, but performs ~10% better on an A9.
 801   __ mov(R12, AsmOperand(R5, lsr, 24));
 802   __ ubfx(R4, R8, 16, 8);
 803   __ ldr (R1, Address(R0, R12, lsl, 2));
 804   __ ldr(R2, Address(R0, R4, lsl, 2));
 805   __ ubfx(R3, R7, 8, 8);
 806   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 807   __ uxtb(R4, R6);
 808   __ ldr(R3, Address(R0, R3, lsl, 2));
 809   __ ldr(R4, Address(R0, R4, lsl, 2));
 810   __ ldr(R12, Address(LR, 4, post_indexed));
 811   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 812   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 813   __ eor(R10, R1, R12);
 814 
 815   __ mov(R12, AsmOperand(R6, lsr, 24));
 816   __ ubfx(R4, R5, 16, 8);
 817   __ ldr (R1, Address(R0, R12, lsl, 2));
 818   __ ldr(R2, Address(R0, R4, lsl, 2));
 819   __ ubfx(R3, R8, 8, 8);
 820   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 821   __ uxtb(R4, R7);
 822   __ ldr(R3, Address(R0, R3, lsl, 2));
 823   __ ldr(R4, Address(R0, R4, lsl, 2));
 824   __ ldr(R12, Address(LR, 4, post_indexed));
 825   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 826   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 827   __ eor(R11, R1, R12);
 828 
 829   __ mov(R12, AsmOperand(R7, lsr, 24));
 830   __ ubfx(R4, R6, 16, 8);
 831   __ ldr (R1, Address(R0, R12, lsl, 2));
 832   __ ldr(R2, Address(R0, R4, lsl, 2));
 833   __ ubfx(R3, R5, 8, 8);
 834   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 835   __ uxtb(R4, R8);
 836   __ ldr(R3, Address(R0, R3, lsl, 2));
 837   __ ldr(R4, Address(R0, R4, lsl, 2));
 838   __ ldr(R12, Address(LR, 4, post_indexed));
 839   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 840   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 841   __ eor(R3, R1, R12);
 842   __ str(R3, Address(SP, 0));
 843 
 844   __ mov(R12, AsmOperand(R8, lsr, 24));
 845   __ ubfx(R4, R7, 16, 8);
 846   __ ldr (R1, Address(R0, R12, lsl, 2));
 847   __ ldr(R2, Address(R0, R4, lsl, 2));
 848   __ ubfx(R3, R6, 8, 8);
 849   __ eor(R1, R1, AsmOperand(R2, ror, 8));
 850   __ uxtb(R4, R5);
 851   __ ldr(R3, Address(R0, R3, lsl, 2));
 852   __ ldr(R4, Address(R0, R4, lsl, 2));
 853   __ ldr(R12, Address(LR, 4, post_indexed));
 854   __ eor(R1, R1, AsmOperand(R3, ror, 16));
 855   __ eor(R12, R12, AsmOperand(R4, ror, 24));
 856   __ eor(R8, R1, R12);
 857 
 858   // see if we reached the key array end
 859   __ cmp(R9, LR);
 860 
 861   //  load processed data
 862   __ mov(R5, R10);
 863   __ mov(R6, R11);
 864   __ ldr(R7, Address(SP, 0));
 865 
 866   __ b(round, gt);
 867 
 868 
 869   // last round is special
  // this round could be implemented through the vtbl instruction in NEON. However, vtbl is limited
  // to a 32-byte wide table (4 vectors), so it would take 8 lookup rounds to cover the 256-byte wide
  // Si table. On the other hand, the scalar lookup is independent of the lookup table size and thus
  // proves to be faster.
 873   __ ldr(LR, block_current_output_buffer);
 874 
  // end of the current 8-block output chunk (loop bound)
 876   __ ldr(R11, Address(SP, 12));
 877 
 878   __ mov_slow(R10, (int)SInvBox);
 879   __ ldrb(R0, Address(R10, R5, lsr, 24));
 880   __ ubfx(R12, R8, 16, 8);
 881   __ ldrb (R1, Address(R10, R12));
 882   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 883   __ ubfx(R12, R7, 8, 8);
 884   __ ldrb(R2, Address(R10, R12));
 885   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 886   __ uxtb(R12, R6);
 887   __ ldrb(R3, Address(R10, R12));
 888   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 889   __ str(R0, Address(LR, 4, post_indexed));
 890 
 891   __ ldrb(R0, Address(R10, R6, lsr, 24));
 892   __ ubfx(R12, R5, 16, 8);
 893   __ ldrb (R1, Address(R10, R12));
 894   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 895   __ ubfx(R12, R8, 8, 8);
 896   __ ldrb(R2, Address(R10, R12));
 897   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 898   __ uxtb(R12, R7);
 899   __ ldrb(R3, Address(R10, R12));
 900   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 901   __ str(R0, Address(LR, 4, post_indexed));
 902 
 903 
 904   __ ldrb(R0, Address(R10, R7, lsr, 24));
 905   __ ubfx(R12, R6, 16, 8);
 906   __ ldrb (R1, Address(R10, R12));
 907   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 908   __ ubfx(R12, R5, 8, 8);
 909   __ ldrb(R2, Address(R10, R12));
 910   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 911   __ uxtb(R12, R8);
 912   __ ldrb(R3, Address(R10, R12));
 913   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 914   __ str(R0, Address(LR, 4, post_indexed));
 915 
 916 
 917   __ ldrb(R0, Address(R10, R8, lsr, 24));
 918   __ ubfx(R12, R7, 16, 8);
 919   __ ldrb (R1, Address(R10, R12));
 920   __ orr(R0, R1, AsmOperand(R0, lsl, 8));
 921   __ ubfx(R12, R6, 8, 8);
 922   __ ldrb(R2, Address(R10, R12));
 923   __ orr(R0, R2, AsmOperand(R0, lsl, 8));
 924   __ uxtb(R12, R5);
 925   __ ldrb(R3, Address(R10, R12));
 926   __ orr(R0, R3, AsmOperand(R0, lsl, 8));
 927   __ str(R0, Address(LR, 4, post_indexed));
 928 
 929 
 930   // preserve current scratch buffer pointer
 931   __ cmp(R11, LR);
 932   __ str(LR, block_current_output_buffer);
 933 
 934   // go to the next block processing
 935   __ b(block_start, ne);
 936 
 937 
 938 
  // Perform the last-round AddRoundKey stage on all 8 blocks
 940 
  // load the key pointer preserved at [sp+28] (rounds_key);
  // the last round is processed with key[0..3]
 943   __ ldr(LR, rounds_key);
 944 
  // retrieve the original output buffer pointer
 946   __ ldr(R1, block_current_output_buffer);
 947   __ sub(R1, R1, 128);
 948   __ mov(R5, R1);
 949 
 950 
 951   // retrieve original cipher (source) pointer
 952   __ ldr(R0, original_src);
 953 
  // retrieve the IV pointer (preserved at [sp+36])
 955   __ ldr(R6, iv);
 956 
 957   __ vld1(D20, R6, MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 958   __ vrev(D20, D20, quad, 32, MacroAssembler::VELEM_SIZE_8);
 959 
 960   // perform last AddRoundKey and IV addition
 961   __ vld1(D18, Address(LR, 0, post_indexed), MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 962 
 963   __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 964   __ veor(D22, D22, D18, quad);
 965   __ veor(D22, D22, D20, quad);
 966   __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
 967   __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 968 
 969 
 970   __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 971   __ veor(D22, D22, D18, quad);
 972   __ veor(D22, D22, D0, quad);
 973   __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
 974   __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 975 
 976   __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 977   __ veor(D22, D22, D18, quad);
 978   __ veor(D22, D22, D2, quad);
 979   __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
 980   __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 981 
 982   __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 983   __ veor(D22, D22, D18, quad);
 984   __ veor(D22, D22, D4, quad);
 985   __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
 986   __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 987 
 988   __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 989   __ veor(D22, D22, D18, quad);
 990   __ veor(D22, D22, D6, quad);
 991   __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
 992   __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 993 
 994   __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 995   __ veor(D22, D22, D18, quad);
 996   __ veor(D22, D22, D8, quad);
 997   __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
 998   __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
 999 
1000   __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
1001   __ veor(D22, D22, D18, quad);
1002   __ veor(D22, D22, D10, quad);
1003   __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
1004   __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
1005 
1006   __ vld1(D22, Address(R1, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
1007   __ veor(D22, D22, D18, quad);
1008   __ veor(D22, D22, D12, quad);
1009   __ vrev(D22, D22, quad, 32, MacroAssembler::VELEM_SIZE_8);
1010   __ vst1(D22, Address(R5, 0, post_indexed),  MacroAssembler::VELEM_SIZE_8, MacroAssembler::VLD1_TYPE_2_REGS);
1011 
1012 
1013   // check if we're done
1014   __ ldr(R4, output_buffer_end);
1015   __ cmp(R4, R1);
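  // the last ciphertext block of this chunk becomes the IV for the next chunk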
1016   __ add(R0, R0, 128-16);
1017   __ str(R0, iv);
1018   __ add(R0, R0, 16);
1019 
1020   __ b(decrypt_8_blocks, ne);
1021 
1022   __ add(SP, SP, 40);
  __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback);
1024   }
1025 
1026   __ bind(cbc_done);
1027   __ pop(RegisterSet(R4, R12) | LR);
1028   __ ldr(R0, Address(SP));
1029   __ bx(LR);
1030 
1031   return start;
1032 }
#endif // COMPILE_CRYPTO