/*
 * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "runtime/deoptimization.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "crc32c.h"

#ifdef _MSC_VER
#define ALIGNED_(x) __declspec(align(x))
#else
#define ALIGNED_(x) __attribute__ ((aligned(x)))
#endif

// Implementation of the platform-specific part of StubRoutines - for
// a description of how to extend it, see the stubRoutines.hpp file.

address StubRoutines::x86::_verify_mxcsr_entry = NULL;
address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
address StubRoutines::x86::_upper_word_mask_addr = NULL;
address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
address StubRoutines::x86::_k256_adr = NULL;
#ifdef _LP64
address StubRoutines::x86::_k256_W_adr = NULL;
#endif
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL;

// tables common for sin and cos
address StubRoutines::x86::_ONEHALF_adr = NULL;
address StubRoutines::x86::_P_2_adr = NULL;
address StubRoutines::x86::_SC_4_adr = NULL;
address StubRoutines::x86::_Ctable_adr = NULL;
address StubRoutines::x86::_SC_2_adr = NULL;
address StubRoutines::x86::_SC_3_adr = NULL;
address StubRoutines::x86::_SC_1_adr = NULL;
address StubRoutines::x86::_PI_INV_TABLE_adr = NULL;
address StubRoutines::x86::_PI_4_adr = NULL;
address StubRoutines::x86::_PI32INV_adr = NULL;
address StubRoutines::x86::_SIGN_MASK_adr = NULL;
address StubRoutines::x86::_P_1_adr = NULL;
address StubRoutines::x86::_P_3_adr = NULL;
address StubRoutines::x86::_NEG_ZERO_adr = NULL;

// tables common for sincos and tancot
address StubRoutines::x86::_L_2il0floatpacket_0_adr = NULL;
address StubRoutines::x86::_Pi4Inv_adr = NULL;
address StubRoutines::x86::_Pi4x3_adr = NULL;
address StubRoutines::x86::_Pi4x4_adr = NULL;
address StubRoutines::x86::_ones_adr = NULL;

uint64_t StubRoutines::x86::_crc_by128_masks[] =
{
  /* The fields in this structure are arranged so that they can be
   * picked up two at a time with 128-bit loads.
   *
   * Because of the flipped bit order used for this CRC polynomial,
   * the constant for X**N is left-shifted by 1.  This is because
   * a 64 x 64 polynomial multiply produces a 127-bit result
   * but the highest term is always aligned to bit 0 in the container.
   * Pre-shifting by one fixes this, at the cost of potentially making
   * the 32-bit constant no longer fit in a 32-bit container (thus the
   * use of uint64_t, though this is also the size used by the carry-
   * less multiply instruction).
   *
   * In addition, the flipped bit order and the alignment of the highest
   * term at bit 0 change the constants used.  The 96-bit result will be
   * aligned to the high-term end of the target 128-bit container,
   * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
   * it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
   *
   * This causes additional problems in the 128-to-64-bit reduction; see the
   * code for details.  By storing a mask in the otherwise unused half of
   * a 128-bit constant, bits can be cleared before multiplication without
   * storing and reloading.  Note that staying on a 128-bit datapath means
   * that some data is uselessly stored and some unused data is intersected
   * with an irrelevant constant.
   */

  ((uint64_t) 0xffffffffUL),     /* low  of K_M_64    */
  ((uint64_t) 0xb1e6b092U << 1), /* high of K_M_64    */
  ((uint64_t) 0xba8ccbe8U << 1), /* low  of K_160_96  */
  ((uint64_t) 0x6655004fU << 1), /* high of K_160_96  */
  ((uint64_t) 0xaa2215eaU << 1), /* low  of K_544_480 */
  ((uint64_t) 0xe3720acbU << 1)  /* high of K_544_480 */
};
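
// Illustrative only, not the code the stubs execute (that is emitted by the
// macro assembler): the low/high pairing above is laid out for a carry-less
// multiply folding step, which with the <wmmintrin.h> intrinsics would look
// roughly like
//
//   __m128i fold(__m128i acc, __m128i data, __m128i k) {  // k = the K_160_96 or K_544_480 pair
//     __m128i lo = _mm_clmulepi64_si128(acc, k, 0x00);    // low  half * low  constant
//     __m128i hi = _mm_clmulepi64_si128(acc, k, 0x11);    // high half * high constant
//     return _mm_xor_si128(_mm_xor_si128(lo, hi), data);
//   }
//
// which is why each pair can be picked up with a single 128-bit load.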

/**
 *  crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
 */
juint StubRoutines::x86::_crc_table[] =
{
    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
    0x2d02ef8dUL
};
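
// For reference: this is the standard reflected CRC-32 byte table from zlib;
// the classic byte-at-a-time update it supports is
//
//   crc = _crc_table[(crc ^ next_byte) & 0xff] ^ (crc >> 8);
//
// (next_byte is simply the next input byte; the generated stub may consume
// the table differently.)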

#define D 32
#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)

#define TILL_CYCLE 31
uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0]

// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8
// Listing 1: Multiplication of normalized polynomials
// "a" and "b" occupy D least significant bits.
uint32_t crc32c_multiply(uint32_t a, uint32_t b) {
  uint32_t product = 0;
  uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P
  b_pow_x_table[0] = b;
  for (int k = 0; k < D; ++k) {
    // If "a" has a non-zero coefficient at x**k, add ((b * x**k) mod P) to the result.
    if ((a & (((uint32_t)1) << (D - 1 - k))) != 0) product ^= b_pow_x_table[k];

    // Compute b_pow_x_table[k+1] = (b * x**(k+1)) mod P.
    if (b_pow_x_table[k] & 1) {
      // If degree of (b_pow_x_table[k] * x) is D, then
      // degree of (b_pow_x_table[k] * x - P) is less than D.
      b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P;
    }
    else {
      b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1;
    }
  }
  return product;
}
#undef D
#undef P
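
// Worked example (documentation only): in the reflected representation used
// here the polynomial x is 0x40000000 (bit 30), so
//
//   crc32c_multiply(0x40000000, 0x40000000) == 0x20000000  // x * x = x^2
//
// and no reduction by P happens because the product's degree stays below D.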

// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9
void crc32c_init_pow_2k(void) {
  // _crc32c_pow_2k_table(0) =
  // x^(2^0) mod P(x) = x mod P(x) = x
  // Since we are operating on reflected values,
  // x = 10b, reflect(x) = 0x40000000
  _crc32c_pow_2k_table[0] = 0x40000000;

  for (int k = 1; k < TILL_CYCLE; k++) {
    // _crc32c_pow_2k_table(k) = _crc32c_pow_2k_table(k-1)^2 mod P(x)
    uint32_t tmp = _crc32c_pow_2k_table[k - 1];
    _crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp);
  }
}
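
// Sanity check (documentation only): while 2^k < 32 no reduction by P(x)
// occurs, so the first entries are just the reflected images of x^(2^k):
//
//   _crc32c_pow_2k_table[0..4] == { 0x40000000, 0x20000000, 0x08000000,
//                                   0x00800000, 0x00008000 }
//
// From _crc32c_pow_2k_table[5] (x^32 mod P(x)) onwards the reduction matters.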

// x^N mod P(x)
uint32_t crc32c_f_pow_n(uint32_t n) {
  //            result = 1 (polynomial)
  uint32_t one, result = 0x80000000, i = 0;

  while (one = (n & 1), (n == 1 || n - one > 0)) {
    if (one) {
      result = crc32c_multiply(result, _crc32c_pow_2k_table[i]);
    }
    n >>= 1;
    i++;
  }

  return result;
}
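
// The loop above walks the binary representation of N: starting from the
// polynomial 1 (reflected: 0x80000000) it multiplies in x^(2^i) for every set
// bit i of N, giving x^N mod P(x).  For example (documentation only),
// crc32c_f_pow_n(64) performs a single multiplication by
// _crc32c_pow_2k_table[6] and so returns x^64 mod P(x).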

juint *StubRoutines::x86::_crc32c_table;

void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) {

  static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS];

  crc32c_init_pow_2k();

  pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8);      // 8N * 8 = 64N
  pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2);  // 128N

  pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8);
  pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2);

  pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8);
  pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] =
            crc32c_f_pow_n(CRC32C_LOW * 8 * 2);

  if (is_pclmulqdq_table_supported) {
    _crc32c_table = pow_n;
  } else {
    static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256];

    for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) {
      juint X_CONST = pow_n[j]; // not static: must be re-initialized for every j
      for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations
      // S. Gueron / Information Processing Letters 112 (2012) 184
      // Algorithm 3: Generating a carry-less multiplication lookup table.
      // Input: A 32-bit constant, X_CONST.
      // Output: A table of 256 entries, each one is a 64-bit quadword,
      // that can be used for computing "byte" * X_CONST, for a given byte.
        pclmulqdq_table[j * 256 + i] =
          ((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
          ((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
          ((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
      }
    }
    _crc32c_table = (juint*)pclmulqdq_table;
  }
}
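
// Note on the fallback table built above (documentation only): for a single
// bit b, (i & (1 << b)) * X_CONST is either 0 or X_CONST << b, so XOR-ing the
// eight partial products yields the carry-less (GF(2)) product of the byte i
// and X_CONST -- the same value a PCLMULQDQ of that byte with the constant
// would give, which is what allows this table to stand in for the
// instruction-based path.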

ALIGNED_(64) juint StubRoutines::x86::_k256[] =
{
    0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
    0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
    0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
    0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
    0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
    0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
    0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
    0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
    0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
    0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
    0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
    0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
    0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
    0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
    0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
    0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
};
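
// For reference: these are the 64 SHA-256 round constants from FIPS 180-4,
// i.e. the first 32 bits of the fractional parts of the cube roots of the
// first 64 primes (e.g. floor(frac(cbrt(2)) * 2^32) == 0x428a2f98).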

#ifdef _LP64
// used in MacroAssembler::sha256_AVX2
// dynamically built from _k256
ALIGNED_(64) juint StubRoutines::x86::_k256_W[2*sizeof(StubRoutines::x86::_k256)];
#endif