
src/cpu/x86/vm/stubGenerator_x86_64.cpp

@@ -3052,10 +3052,19 @@
     __ emit_data64( 0x0405060700010203, relocInfo::none );
     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
     return start;
   }
 
+  address generate_counter_shuffle_mask() {
+    __ align(16);
+    StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
+    address start = __ pc();
+    __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
+    __ emit_data64(0x0001020304050607, relocInfo::none);
+    return start;
+  }
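+  // The mask above reverses all 16 counter bytes: AES/CTR counters are
+  // big-endian, and byte-swapping lets inc_counter below increment them
+  // with plain 64-bit adds on the low and high qwords.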
+
   // Utility routine for loading a 128-bit key word in little endian format
   // can optionally specify that the shuffle mask is already in an xmmregister
   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
     __ movdqu(xmmdst, Address(key, offset));
     if (xmm_shuf_mask != NULL) {

@@ -3063,10 +3072,22 @@
     } else {
       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
     }
   }
 
+  // Utility routine for incrementing the 128-bit counter (the IV in CTR mode)
+  void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
+    __ pextrq(reg, xmmdst, 0x0);
+    __ addq(reg, inc_delta);
+    __ pinsrq(xmmdst, reg, 0x0);
+    __ jcc(Assembler::carryClear, next_block); // jump if no carry
+    __ pextrq(reg, xmmdst, 0x01); // carry into the high qword
+    __ addq(reg, 0x01);
+    __ pinsrq(xmmdst, reg, 0x01); // end of carry propagation
+    __ BIND(next_block);          // continue with the next instruction
+  }
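+
+  // Note: pinsrq does not modify flags, so the jcc above still observes the CF
+  // set by addq. Logically (a sketch, not the generated code):
+  //   uint64_t lo = counter_lo + inc_delta;
+  //   if (lo overflowed) counter_hi += 1;   // carry into the high qword
+  //   counter_lo = lo;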
+  
   // Arguments:
   //
   // Inputs:
   //   c_rarg0   - source byte array address
   //   c_rarg1   - destination byte array address

@@ -3713,10 +3734,223 @@
     __ jmp(L_exit);
 
     return start;
   }
 
+  // This is a version of CTR/AES crypt which processes six blocks per loop
+  // iteration to hide instruction latency.
+  //
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - counter vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   rax       - input length
+  //
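+  // In outline, the stub computes (a sketch of CTR mode, not the generated code):
+  //
+  //   for each 16-byte block:
+  //     keystream = AES_encrypt(counter, key)
+  //     output    = input XOR keystream
+  //     counter   = counter + 1            // 128-bit big-endian increment
+  //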
+  address generate_counterMode_AESCrypt_Parallel() {
+    assert(UseAES, "need AES instructions and misaligned SSE support");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
+    address start = __ pc();
+    const Register from = c_rarg0; // source array address
+    const Register to = c_rarg1; // destination array address
+    const Register key = c_rarg2; // key array address
+    const Register counter = c_rarg3; // counter byte array initialized from counter array address
+                                      // and updated with the incremented counter on exit
+#ifndef _WIN64
+    const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
+#else
+    const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
+    const Register len_reg = r10; // pick the first volatile windows register
+#endif
+    const Register pos = rax;  // pos is the return value.
+        
+    const int PARALLEL_FACTOR = 6;
+    const XMMRegister xmm_counter_shuf_mask = xmm0;
+    const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
+    const XMMRegister xmm_curr_counter = xmm2;
+    
+    const XMMRegister xmm_key_tmp0 = xmm3;
+    const XMMRegister xmm_key_tmp1 = xmm4;
+    
+    // registers holding the six results in the parallelized loop
+    const XMMRegister xmm_result0 = xmm5;
+    const XMMRegister xmm_result1 = xmm6;
+    const XMMRegister xmm_result2 = xmm7;
+    const XMMRegister xmm_result3 = xmm8;
+    const XMMRegister xmm_result4 = xmm9;
+    const XMMRegister xmm_result5 = xmm10;
+        
+    const XMMRegister xmm_from0 = xmm11;
+    const XMMRegister xmm_from1 = xmm12;
+    const XMMRegister xmm_from2 = xmm13;
+    const XMMRegister xmm_from3 = xmm14; // xmm14 is the last register used; xmm6-xmm14 must be preserved on Win64
+    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3/xmm4 (xmm_key_tmp0/1): the key temporaries are dead once the input text is loaded
+    const XMMRegister xmm_from5 = xmm4;
+    
+    // number of AES rounds for key_128, key_192, key_256 respectively
+    const int rounds[3] = {10, 12, 14};
+    Label L_multiBlock_loopTop[3];
+    Label L_singleBlockLoopTop[3];
+    Label L__incCounter[3][6]; // one carry label per block in the 6-block loop, per key length
+    Label L__incCounter_single[3]; // carry labels for the single-block loop (key128, key192, key256)
+    
+    Label L_exit;
+    
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+    // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
+    // context for the registers used, since all instructions below operate in 128-bit mode.
+    // On EVEX without VL and BW, these instructions will all run as AVX.
+    if (VM_Version::supports_avx512vlbw()) {
+      __ movl(rax, 0xffff);
+      __ kmovql(k1, rax);
+    }
+    
+#ifdef _WIN64
+    // on win64, fill len_reg from stack position
+    __ movl(len_reg, len_mem);
+    // save the xmm registers which must be preserved 6-14
+    const int XMM_REG_NUM_KEY_LAST = 14;
+    __ subptr(rsp, -rsp_after_call_off * wordSize);
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(xmm_save(i), as_XMMRegister(i));
+    }
+#else
+    __ push(len_reg); // Save
+#endif
+    __ push(rbx); // Save RBX 
+    __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // load the initial counter value
+    __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // byte-swap the counter for arithmetic
+    __ movptr(pos, 0);
+    
+    // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
+    __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
+    __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+    __ cmpl(rbx, 52);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
+    __ cmpl(rbx, 60);
+    __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
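+    // key length 44 (AES-128) falls through to the k == 0 loop below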
+
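+    // helper macro: apply 'opc' with operand 'src_reg' to each of the six result registers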
+#define CTR_DoSix(opc, src_reg)                \
+    __ opc(xmm_result0, src_reg);              \
+    __ opc(xmm_result1, src_reg);              \
+    __ opc(xmm_result2, src_reg);              \
+    __ opc(xmm_result3, src_reg);              \
+    __ opc(xmm_result4, src_reg);              \
+    __ opc(xmm_result5, src_reg);
+    
+    // k == 0 :  generate code for key_128
+    // k == 1 :  generate code for key_192
+    // k == 2 :  generate code for key_256
+    for (int k = 0; k < 3; ++k) {
+      // multi-block loop starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_multiBlock_loopTop[k]);
+      __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
+      __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
+      
+      // load the counter into all six result registers, then increment each
+      CTR_DoSix(movdqa, xmm_curr_counter);
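+      // xmm_result0 keeps the unincremented counter; blocks 1-5 get counter+1..counter+5,
+      // and xmm_curr_counter advances by 6 for the next iteration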
+      inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
+      inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
+      inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
+      inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
+      inc_counter(rbx, xmm_result5, 0x05, L__incCounter[k][4]);
+      inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
+      CTR_DoSix(pshufb, xmm_counter_shuf_mask); // shuffle the incremented counters back to big-endian for encryption
+      CTR_DoSix(pxor, xmm_key_tmp0);   // XOR with the round 0 key
+      
+      // load two round keys at a time
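+      // rounds[k] is even (10/12/14), so the keys pair up exactly and the
+      // second key of the final pair is applied with aesenclast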
+      for (int i = 1; i < rounds[k]; ) {
+        load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
+        load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
+        CTR_DoSix(aesenc, xmm_key_tmp1);
+        i++;
+        if (i != rounds[k]) {
+          CTR_DoSix(aesenc, xmm_key_tmp0);
+        } else {
+          CTR_DoSix(aesenclast, xmm_key_tmp0);
+        }
+        i++;
+      }
+      
+      // load the next PARALLEL_FACTOR blocks of input into the xmm_from registers
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); 
+      __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
+      __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
+      __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
+      __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
+      __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
+      
+      __ pxor(xmm_result0, xmm_from0);
+      __ pxor(xmm_result1, xmm_from1);
+      __ pxor(xmm_result2, xmm_from2);
+      __ pxor(xmm_result3, xmm_from3);
+      __ pxor(xmm_result4, xmm_from4);
+      __ pxor(xmm_result5, xmm_from5);
+      
+      // store the 6 results into the next 96 bytes of output
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
+      __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
+      __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
+      __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
+      __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
+      
+      __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance the output position
+      __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
+      __ jmp(L_multiBlock_loopTop[k]);
+      
+      // single-block loop starts here
+      __ align(OptoLoopAlignment);
+      __ BIND(L_singleBlockLoopTop[k]);
+      __ cmpptr(len_reg, 0);
+      __ jcc(Assembler::equal, L_exit);
+      load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
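+      // encrypt the current counter value; xmm_curr_counter then advances by one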
+      __ movdqa(xmm_result0, xmm_curr_counter);
+      inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
+      __ pshufb(xmm_result0, xmm_counter_shuf_mask);
+      __ pxor(xmm_result0, xmm_key_tmp0);
+      for (int i = 1; i < rounds[k]; i++) {
+        load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
+        __ aesenc(xmm_result0, xmm_key_tmp0);
+      }
+      load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
+      __ aesenclast(xmm_result0, xmm_key_tmp0);
+      __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
+      __ pxor(xmm_result0, xmm_from0);
+      __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
+      __ addptr(pos, AESBlockSize);
+      __ subptr(len_reg, AESBlockSize);
+      __ jmp(L_singleBlockLoopTop[k]);
+    }
+    
+    __ BIND(L_exit);
+    __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); // shuffle the counter back to big-endian
+    __ movdqu(Address(counter, 0), xmm_curr_counter); // save the counter back for the caller
+    __ pop(rbx); // pop the saved RBX.
+#ifdef _WIN64
+    // restore regs belonging to calling function
+    for (int i = 6; i <= XMM_REG_NUM_KEY_LAST; i++) {
+      __ movdqu(as_XMMRegister(i), xmm_save(i));
+    }
+    __ movl(rax, len_mem);
+#else
+    __ pop(rax); // return length
+#endif
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
 
   // byte swap x86 long
   address generate_ghash_long_swap_mask() {
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");

@@ -4472,16 +4706,19 @@
     generate_math_stubs();
 
     // don't bother generating these AES intrinsic stubs unless global flag is set
     if (UseAESIntrinsics) {
       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
-
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
     }
+    if (UseCTRAESIntrinsics) {
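+      // the CTR stub reuses the key shuffle mask generated above, so
+      // UseCTRAESIntrinsics is expected to imply UseAESIntrinsics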
+      StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
+      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
+    }
 
     // Generate GHASH intrinsics code
     if (UseGHASHIntrinsics) {
       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();